This source file includes following definitions.
- fftss_execute_dft_2d_bb
 
- fftss_execute_dft_2d_bc
 
- fftss_execute_dft_2d_cc
 
- fftss_execute_dft_2d_cb
 
- fftss_execute_dft_2d_preserve
 
- fftss_plan_dft_2d
 
- fftss_destroy_plan
 
#include <stdio.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#include "libfftss.h"
void fftss_execute_dft_2d_bb(fftss_plan p, double *in, double *out)
{
  long i;
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
  for (i = 0; i < p->ny; i++)
    fftss_execute_inplace_dft_1d(p->p1, in + i * p->py * 2,
                         p->b[2 * omp_get_thread_num()]);
  
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
  for (i = 0; i < p->nx; i++) {
    double *b0, *b1;
    b0 = p->b[2 * omp_get_thread_num()];
    b1 = p->b[2 * omp_get_thread_num() + 1];
    fftss_copyin(b0, in + i * 2, p->py, p->ny);
    fftss_execute_inplace_dft_1d(p->p2, b0, b1);
    fftss_copyout(out + i * 2, b0, p->py, p->ny);
  }
  return;
}
void fftss_execute_dft_2d_bc(fftss_plan p, double *in, double *out)
{
  long i;
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
  for (i = 0; i < p->ny; i++)
    fftss_execute_inplace_dft_1d(p->p1, in + i * p->py * 2,
                         p->b[2 * omp_get_thread_num()]);
  
#ifdef _OPENMP
#pragma omp parallel for schedule(static,1)
#endif
  for (i = 0; i < p->nx; i++) {
    double *b0, *b1;
    b0 = p->b[2 * omp_get_thread_num()];
    b1 = p->b[2 * omp_get_thread_num() + 1];
    fftss_copyin(b0, in + i * 2, p->py, p->ny);
    fftss_execute_inplace_dft_1d(p->p2, b0, b1);
    fftss_copyout(out + i * 2, b0, p->py, p->ny);
  }
  return;
}
void fftss_execute_dft_2d_cc(fftss_plan p, double *in, double *out)
{
  long i;
#ifdef _OPENMP
#pragma omp parallel for schedule(static,1)
#endif
  for (i = 0; i < p->ny; i++)
    fftss_execute_inplace_dft_1d(p->p1, in + i * p->py * 2,
                         p->b[2 * omp_get_thread_num()]);
  
#ifdef _OPENMP
#pragma omp parallel for schedule(static,1)
#endif
  for (i = 0; i < p->nx; i++) {
    double *b0, *b1;
    b0 = p->b[2 * omp_get_thread_num()];
    b1 = p->b[2 * omp_get_thread_num() + 1];
    fftss_copyin(b0, in + i * 2, p->py, p->ny);
    fftss_execute_inplace_dft_1d(p->p2, b0, b1);
    fftss_copyout(out + i * 2, b0, p->py, p->ny);
  }
  return;
}
void fftss_execute_dft_2d_cb(fftss_plan p, double *in, double *out)
{
  long i;
#ifdef _OPENMP
#pragma omp parallel for schedule(static,1)
#endif
  for (i = 0; i < p->ny; i++)
    fftss_execute_inplace_dft_1d(p->p1, in + i * p->py * 2,
                         p->b[2 * omp_get_thread_num()]);
  
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
  for (i = 0; i < p->nx; i++) {
    double *b0, *b1;
    b0 = p->b[2 * omp_get_thread_num()];
    b1 = p->b[2 * omp_get_thread_num() + 1];
    fftss_copyin(b0, in + i * 2, p->py, p->ny);
    fftss_execute_inplace_dft_1d(p->p2, b0, b1);
    fftss_copyout(out + i * 2, b0, p->py, p->ny);
  }
  return;
}
void fftss_execute_dft_2d_preserve(fftss_plan p, double *in, double *out)
{
  long i;
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (i = 0; i < p->nx; i++) {
    double *b0, *b1;
    b0 = p->b[2 * omp_get_thread_num()];
    b1 = p->b[2 * omp_get_thread_num() + 1];
    fftss_copyin(b0, in + i * 2, p->py, p->ny);
    fftss_execute_inplace_dft_1d(p->p2, b0, b1);
    fftss_copyout(out + i * 2, b0, p->py, p->ny);
  }
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (i = 0; i < p->ny; i++)
    fftss_execute_inplace_dft_1d(p->p1, out + i * p->py * 2, 
                                 p->b[2 * omp_get_thread_num()]);
  
  return;
}
extern void fftss_destroy_plan_1d(fftss_plan_1d);
fftss_plan 
fftss_plan_dft_2d(long nx, long ny, long py,
                  double *in, double *out, long sign, long flags)
{
  fftss_plan p, pp;
  long l;
  long i;
  long flags_for_1d;
  double fastest = 1000000000.0;
  double t0, t1;
  p = malloc(sizeof(fftss_plan_s));
  p->nx = nx; p->ny = ny; p->py = py; p->nz = 0;
  p->in = in;
  p->out = out;
  if (flags & FFTSS_INOUT)
    p->out = p->in;
  flags_for_1d = flags &
    (FFTSS_NO_SIMD|FFTSS_VERBOSE|FFTSS_UNALIGNED|FFTSS_CONSERVE_MEMORY|
     FFTSS_EXHAUSTIVE|FFTSS_PATIENT|FFTSS_ESTIMATE|FFTSS_MEASURE);
  l = (nx > ny) ? nx : ny;
  p->b = malloc(sizeof(double *) * 2 * omp_get_max_threads());
  for (i = 0; i < omp_get_max_threads(); i++) {
    p->b[i * 2] = fftss_malloc(sizeof(double) * 2 * l);
    p->b[i * 2 + 1] = fftss_malloc(sizeof(double) * 2 * l);
  }
#ifdef _OPENMP
#pragma omp parallel private(i)
#endif
  {
    double *b0, *b1;
    int id;
    id = omp_get_thread_num();
    b0 = p->b[id * 2];
    b1 = p->b[id * 2 + 1];
    for (i = 0; i < l * 2; i++) b0[i] = 0.0;
    for (i = 0; i < l * 2; i++) b1[i] = 0.0;
  } 
  
  pp = fftss_plan_dft_1d(nx, p->b[0], p->b[1], sign, 
                         FFTSS_INOUT | flags_for_1d);
  if (pp == NULL) {
    for (i = 0; i < omp_get_max_threads(); i++) {
      free(p->b[i * 2]);
      free(p->b[i * 2 + 1]);
    }
    free(p->b);
    free(p);
  }
  p->p1 = pp->p1;
  free(pp);
  if (nx != ny) {
    pp = fftss_plan_dft_1d(ny, p->b[0], p->b[1], sign, 
                           FFTSS_INOUT | flags_for_1d);
    if (pp == NULL) {
      fftss_destroy_plan_1d(p->p1);
      for (i = 0; i < omp_get_max_threads(); i++) {
        free(p->b[i * 2]);
        free(p->b[i * 2 + 1]);
      }
      free(p->b);
      free(p);
    }
    p->p2 = pp->p1;
    free(pp);
  } else p->p2 = p->p1;
  p->p3 = NULL;
  
#define TRY(name, func) \
  {                                   \
    t0 = fftss_get_wtime();           \
    func(p, in,out);                  \
    t1 = fftss_get_wtime() - t0;      \
    if (flags & FFTSS_VERBOSE)        \
      printf("%s : %lf.\n", name, t1);\
    if (t1 < fastest) {               \
      fastest = t1;                   \
      p->fp = func;                   \
    }                                 \
  }
  if (flags & FFTSS_PRESERVE_INPUT)
    p->fp = fftss_execute_dft_2d_preserve;
  else {
    TRY("Block-Block", fftss_execute_dft_2d_bb);
    TRY("Block-Cyclic", fftss_execute_dft_2d_bc);
    TRY("Cyclic-Block", fftss_execute_dft_2d_cb);
    TRY("Cyclic-Cyclic", fftss_execute_dft_2d_cc);
  }
  return p;
}
extern void fftss_execute_dft_3d(fftss_plan, double *, double *);
  
void fftss_destroy_plan(fftss_plan p)
{
  long i;
  if (p->nz != 0) {
    fftss_destroy_plan_1d(p->p1);
    if (p->nx != p->ny) fftss_destroy_plan_1d(p->p2);
    if (p->nx != p->nz && p->ny != p->nz) fftss_destroy_plan_1d(p->p3);
    for (i = 0; i < omp_get_max_threads(); i++) {
      fftss_free(p->b[i * 2]);
      fftss_free(p->b[i * 2 + 1]);
    }
    free(p->b);
    free(p);
  } else if (p->ny != 0) {
    fftss_destroy_plan_1d(p->p1);
    if (p->nx != p->ny) fftss_destroy_plan_1d(p->p2);
    for (i = 0; i < omp_get_max_threads(); i++) {
      fftss_free(p->b[i * 2]);
      fftss_free(p->b[i * 2 + 1]);
    }
    free(p->b);
    free(p);
  } else if (p->nx != 0) {
    fftss_destroy_plan_1d(p->p1);
    free(p);
  }
  return;
}