This source file includes following definitions.
- fftss_map448
- fftss_map844
- fftss_map488
- fftss_map884
- fftss_use_kset
- fftss_try_kset
- fftss_plan_dft_1d_estimate
- fftss_plan_dft_1d_measure
- fftss_plan_dft_1d
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "libfftss.h"
extern void fftss_counter_clear(void);
extern void fftss_counter_start(void);
extern long fftss_counter_end(void);
int fftss_verbose = 0;
int fftss_map448(fftss_plan_1d p, long logn2)
{
long i;
long logn4, logn8;
logn8 = logn2 & 1;
logn4 = (logn2 - logn8 * 3) >> 1;
for (i = 0; i < logn4; i++) p->k[i].radix = 4;
for (i = 0; i < logn8; i++) p->k[i + logn4].radix = 8;
p->stages = logn4 + logn8;
return 0;
}
int fftss_map844(fftss_plan_1d p, long logn2)
{
long i;
long logn4, logn8;
logn8 = logn2 & 1;
logn4 = (logn2 - logn8 * 3) >> 1;
for (i = 0; i < logn8; i++) p->k[i].radix = 8;
for (i = 0; i < logn4; i++) p->k[i + logn8].radix = 4;
p->stages = logn4 + logn8;
if (logn8 == 0 || logn4 == 0) return -1;
return 0;
}
int fftss_map488(fftss_plan_1d p, long logn2)
{
long i;
long logn4, logn8;
logn4 = (3 - (logn2 % 3)) % 3;
logn8 = (logn2 - logn4 * 2) / 3;
for (i = 0; i < logn4; i++) p->k[i].radix = 4;
for (i = 0; i < logn8; i++) p->k[i + logn4].radix = 8;
p->stages = logn4 + logn8;
return 0;
}
int fftss_map884(fftss_plan_1d p, long logn2)
{
long i;
long logn4, logn8;
logn4 = (3 - (logn2 % 3)) % 3;
logn8 = (logn2 - logn4 * 2) / 3;
for (i = 0; i < logn8; i++) p->k[i].radix = 8;
for (i = 0; i < logn4; i++) p->k[i + logn8].radix = 4;
p->stages = logn4 + logn8;
if (logn8 == 0 || logn4 == 0) return -1;
return 0;
}
void fftss_use_kset(fftss_plan_1d p, fftss_kernel *kset4, fftss_kernel *kset8)
{
fftss_kernel *kset;
long i, j, k;
j = p->n; k = 1;
for (i = 0; i < p->stages; i++) {
switch (p->k[i].radix) {
case 4:
kset = kset4;
break;
case 8:
kset = kset8;
break;
}
j /= p->k[i].radix;
p->k[i].blocks = k;
p->k[i].bsize = j;
p->k[i].kern = j > 16 ? kset[KSET_ONE] :
(j == 1 ? kset[KSET_UNROLL1] :
(j == 4 ? kset[KSET_UNROLL4] :
(j == 8 ? kset[KSET_UNROLL8] : kset[KSET_NORMAL])));
k *= p->k[i].radix;
}
}
fftss_map fftss_map_list[] =
{fftss_map448, fftss_map844, fftss_map488, fftss_map884};
static void
fftss_try_kset(fftss_plan_1d p,
fftss_kernel *kset4, fftss_kernel *kset8,
long table_type, long kset_id, char *name)
{
long i;
if (fftss_verbose)
printf("%16s [table=%ld,code=%ld]\t", name, table_type, kset_id);
for (i = 0; i < FFTSS_MAP_MAX; i++) {
if (fftss_map_list[i](p, p->logn2) < 0) continue;
fftss_use_kset(p, kset4, kset8);
(*(p->fp))(p, p->in, p->out);
fftss_counter_start();
(*(p->fp))(p, p->in, p->out);
if (fftss_counter_end()) {
p->map_id = i;
p->table_type = table_type;
p->kset_id = kset_id;
}
}
if (fftss_verbose) printf("\n");
}
void fftss_plan_dft_1d_estimate(fftss_plan_1d p)
{
char *kset_name =
#if defined(USE_SSE3)
"SSE3"
#elif defined(USE_SSE2)
"SSE2 (1)"
#elif defined(__ia64__) && !defined(NO_ASM)
"IA-64 asm"
#elif defined(BlueGene) && !defined(NO_ASM) && !defined(USE_WRAP440D)
"Blue Gene asm"
#elif defined(BlueGene)
"Blue Gene (PL)"
#else
"FMA"
#endif
;
#ifdef HAVE_X86_CPUID
if (fftss_cpu < 0) fftss_check_cpu();
if (fftss_cpu & FFTSS_X86_AMD) kset_name = "FMA";
#endif
for (p->kset_id = 0; fftss_kset_list[p->kset_id].name; p->kset_id++)
if (strcmp(fftss_kset_list[p->kset_id].name, kset_name) == 0) break;
if (fftss_kset_list[p->kset_id].name == NULL) {
kset_name = "FMA";
for (p->kset_id = 0; fftss_kset_list[p->kset_id].name; p->kset_id++)
if (strcmp(fftss_kset_list[p->kset_id].name, kset_name) == 0) break;
if (fftss_kset_list[p->kset_id].name == NULL) {
printf("Internal Error in fftss_plan_dft_1d_estimate()\n");
return;
}
}
p->table_type = fftss_kset_list[p->kset_id].table_type;
p->map_id = 0;
}
void fftss_plan_dft_1d_measure(fftss_plan_1d p)
{
long i;
long tbl_type;
int available;
fftss_counter_clear();
if (fftss_cpu < 0) fftss_check_cpu();
available = fftss_cpu;
if (!(p->flags & FFTSS_UNALIGNED))
available |= FFTSS_ALIGN;
tbl_type = -1;
i = 0;
while (fftss_kset_list[i].name) {
if (fftss_kset_list[i].enabled == 0) { i++; continue; }
if ((p->flags & FFTSS_NO_SIMD) &&
(fftss_kset_list[i].required & FFTSS_SIMD)) { i++; continue; }
if ((fftss_kset_list[i].required & available)
!= fftss_kset_list[i].required)
{ i++; continue; }
if (fftss_kset_list[i].table_type != tbl_type) {
tbl_type = fftss_kset_list[i].table_type;
fftss_table_types[tbl_type].gen(p);
}
if (p->sign == FFTSS_FORWARD)
fftss_try_kset(p, fftss_kset_list[i].r4f, fftss_kset_list[i].r8f,
tbl_type, i, fftss_kset_list[i].name);
else
fftss_try_kset(p, fftss_kset_list[i].r4b, fftss_kset_list[i].r8b,
tbl_type, i, fftss_kset_list[i].name);
i++;
}
}
fftss_plan fftss_plan_dft_1d(long n, double *in, double *out,
long sign, long flags)
{
fftss_plan pp;
fftss_plan_1d p;
long k;
long logn2;
double *tbl;
fftss_verbose = flags & FFTSS_VERBOSE;
if (fftss_verbose)
printf("Planning %ld-point %s transform....\n",
n, sign == FFTSS_FORWARD ? "forward" : "backward");
pp = malloc(sizeof(fftss_plan_s));
pp->fp = NULL;
pp->p2 = NULL;
pp->p3 = NULL;
pp->nx = n;
pp->ny = 0;
pp->nz = 0;
p = malloc(sizeof(fftss_plan_1d_s));
pp->p1 = p;
p->n = n;
p->in = in;
p->out = out;
p->flags = flags;
p->sign = sign;
if ((p->flags & FFTSS_INOUT) || (in == out))
p->fp = fftss_execute_inplace_dft_1d;
else
p->fp = fftss_execute_dft_1d;
if (((size_t)in & 0xf) || ((size_t)out & 0xf)) {
p->flags |= FFTSS_UNALIGNED;
if (((size_t)in & 0x7) || ((size_t)out & 0x7))
printf("Warning: input/output array should be aligned \n"
"to at least 8byte boundary.\n");
}
logn2 = 0;
for (k = n; k > 1; k >>= 1, logn2 ++);
if (n != 1 << logn2) {
printf("n=%ld: not supported.\n", n);
return NULL;
}
p->logn2 = logn2;
p->k = NULL;
p->work = NULL;
if (logn2 < 2) return pp;
p->stages = 0;
p->k = malloc(sizeof(fftss_kern) * p->logn2);
p->w = fftss_malloc(sizeof(double) * n * 2);
if ((flags & FFTSS_PRESERVE_INPUT) || (in == out)) {
#if 0
if ((tbl = fftss_table_alloc(p->n, -1)) != NULL) {
p->work = tbl;
if (p->flags & FFTSS_VERBOSE)
printf("Using shared work space.\n");
} else {
#endif
p->work = fftss_malloc(sizeof(double) * n * 2);
#if 0
fftss_table_add(p->n, -1, p->work);
}
#endif
} else p->work = NULL;
if (flags & FFTSS_ESTIMATE)
fftss_plan_dft_1d_estimate(p);
else
fftss_plan_dft_1d_measure(p);
if (fftss_verbose) printf("%s (map %ld) is selected\n",
fftss_kset_list[p->kset_id].name, p->map_id);
if ((tbl = fftss_table_alloc(p->n, p->table_type)) != NULL) {
if (fftss_verbose)
printf("Using shared table.\n");
fftss_free(p->w);
p->w = tbl;
} else {
fftss_table_types[p->table_type].gen(p);
fftss_table_add(p->n, p->table_type, p->w);
}
fftss_map_list[p->map_id](p, p->logn2);
if (sign == FFTSS_FORWARD) {
fftss_use_kset(p, fftss_kset_list[p->kset_id].r4f,
fftss_kset_list[p->kset_id].r8f);
} else {
fftss_use_kset(p, fftss_kset_list[p->kset_id].r4b,
fftss_kset_list[p->kset_id].r8b);
}
return pp;
}