This source file includes following definitions.
- trans_x
- trans_y
- alltoall0
- alltoalls0
- alltoallp0
- alltoallv0
- alltoallvs0
- alltoallvp0
- alltoall1
- alltoalls1
- alltoallp1
- alltoallv1
- alltoallvs1
- alltoallvp1
- pack0
- packv0
- unpack1
- unpackv1
- pfftss_execute_pdft_2d_a
- pfftss_execute_pdft_2d_s
- pfftss_execute_pdft_2d_p
- pfftss_execute_pdft_2d_va
- pfftss_execute_pdft_2d_vs
- pfftss_execute_pdft_2d_vp
- pfftss_execute
- pfftss_execute_dft
- pfftss_destroy_plan
- test_run
- pfftss_plan_dft_2d
#ifdef HAVE_CONFIG_H
#include "config.h"
#elif defined(_WIN32)
#include "win32config.h"
#endif
#include <stdio.h>
#include <malloc.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#include <mpi.h>
#include <fftss.h>
#ifndef _OPENMP
#define omp_get_max_threads() 1
#define omp_get_thread_num() 0
#endif
typedef struct _pfftss_plan_s {
void (*fp)(struct _pfftss_plan_s *, double *);
long nx, ny, oy, ly, ox, lx;
int id, npe;
long py, plx;
double **b;
double *inout, *sb, *rb;
long *oxp, *oyp;
int *soff, *roff;
int *rsoff, *rroff;
int *ssize, *rsize;
MPI_Request *sreq, *rreq;
MPI_Status *st;
fftss_plan p1, p2;
MPI_Comm comm;
#ifdef HAVE_MPI2
MPI_Win rwin, swin;
#endif
long flags;
} pfftss_plan_s;
typedef pfftss_plan_s *pfftss_plan;
#if 1
extern void fftss_copyin(double *, double *, long, long);
extern void fftss_copyout(double *, double *, long, long);
#endif
static void trans_x(pfftss_plan p, double *inout)
{
long i;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < p->ly; i++)
fftss_execute_dft(p->p1, inout + i * p->py * 2,
p->b[2 * omp_get_thread_num()]);
}
static void trans_y(pfftss_plan p)
{
long i;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < p->lx; i++) {
double *b0, *b1;
b0 = p->b[2 * omp_get_thread_num()];
b1 = p->b[2 * omp_get_thread_num() + 1];
fftss_copyin(b0, p->rb + i * 2, p->plx, p->ny);
fftss_execute_dft(p->p2, b0, b1);
fftss_copyout(p->rb + i * 2, b0, p->plx, p->ny);
}
}
static void alltoall0(pfftss_plan p)
{
MPI_Alltoall(p->sb, p->plx * p->ly * 2, MPI_DOUBLE,
p->rb, p->plx * p->ly * 2, MPI_DOUBLE,
p->comm);
}
static void alltoalls0(pfftss_plan p)
{
long i;
int bsize;
bsize = p->plx * p->ly * 2;
for (i = 1; i < p->npe; i++) {
int d;
d = (p->id + i) % p->npe;
MPI_Irecv(p->rb + bsize * d, bsize, MPI_DOUBLE, d,
MPI_ANY_TAG, p->comm, &(p->rreq[i - 1]));
}
for (i = 1; i < p->npe; i++) {
int d;
d = (p->id + i) % p->npe;
MPI_Isend(p->sb + bsize * d, bsize, MPI_DOUBLE, d,
0, p->comm, &(p->sreq[i - 1]));
}
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < bsize; i++)
p->rb[bsize * p->id + i] = p->sb[bsize * p->id + i];
MPI_Waitall(p->npe - 1, p->rreq, p->st);
MPI_Waitall(p->npe - 1, p->sreq, p->st);
}
#ifdef HAVE_MPI2
static void alltoallp0(pfftss_plan p)
{
long i;
int bsize;
MPI_Win_fence(MPI_MODE_NOPRECEDE, p->rwin);
bsize = p->plx * p->ly * 2;
for (i = 1; i < p->npe; i++) {
int d;
d = (p->id + i) % p->npe;
MPI_Put(p->sb + bsize * d, bsize, MPI_DOUBLE, d,
bsize * p->id, bsize, MPI_DOUBLE, p->rwin);
}
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < bsize; i++)
p->rb[bsize * p->id + i] = p->sb[bsize * p->id + i];
MPI_Win_fence(MPI_MODE_NOSUCCEED, p->rwin);
}
#endif
static void alltoallv0(pfftss_plan p)
{
MPI_Alltoallv(p->sb, p->ssize, p->soff, MPI_DOUBLE,
p->rb, p->rsize, p->roff, MPI_DOUBLE,
p->comm);
}
static void alltoallvs0(pfftss_plan p)
{
long i;
for (i = 1; i < p->npe; i++) {
int d;
d = (p->id + i) % p->npe;
MPI_Irecv(p->rb + p->roff[d], p->rsize[d], MPI_DOUBLE, d,
MPI_ANY_TAG, p->comm, &(p->rreq[i - 1]));
}
for (i = 1; i < p->npe; i++) {
int d;
d = (p->id + i) % p->npe;
MPI_Isend(p->sb + p->soff[d], p->ssize[d], MPI_DOUBLE, d,
0, p->comm, &(p->sreq[i - 1]));
}
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < p->ssize[p->id]; i++)
p->rb[p->roff[p->id] + i] = p->sb[p->soff[p->id] + i];
MPI_Waitall(p->npe - 1, p->rreq, p->st);
MPI_Waitall(p->npe - 1, p->sreq, p->st);
}
#ifdef HAVE_MPI2
static void alltoallvp0(pfftss_plan p)
{
long i;
MPI_Win_fence(MPI_MODE_NOPRECEDE, p->rwin);
for (i = 1; i < p->npe; i++) {
int d;
d = (p->id + i) % p->npe;
MPI_Put(p->sb + p->soff[d], p->ssize[d], MPI_DOUBLE, d,
p->rroff[d], p->ssize[d], MPI_DOUBLE, p->rwin);
}
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < p->ssize[p->id]; i++)
p->rb[p->roff[p->id] + i] = p->sb[p->soff[p->id] + i];
MPI_Win_fence(MPI_MODE_NOSUCCEED, p->rwin);
}
#endif
static void alltoall1(pfftss_plan p)
{
MPI_Alltoall(p->rb, p->plx * p->ly * 2, MPI_DOUBLE,
p->sb, p->plx * p->ly * 2, MPI_DOUBLE,
p->comm);
}
static void alltoalls1(pfftss_plan p)
{
long i;
int bsize;
bsize = p->plx * p->ly * 2;
for (i = 1; i < p->npe; i++) {
int d;
d = (p->id + i) % p->npe;
MPI_Irecv(p->sb + bsize * d, bsize, MPI_DOUBLE, d,
MPI_ANY_TAG, p->comm, &(p->rreq[i - 1]));
}
for (i = 1; i < p->npe; i++) {
int d;
d = (p->id + i) % p->npe;
MPI_Isend(p->rb + bsize * d, bsize, MPI_DOUBLE, d,
0, p->comm, &(p->sreq[i - 1]));
}
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < bsize; i++)
p->sb[bsize * p->id + i] = p->rb[bsize * p->id + i];
MPI_Waitall(p->npe - 1, p->rreq, p->st);
MPI_Waitall(p->npe - 1, p->sreq, p->st);
}
#ifdef HAVE_MPI2
static void alltoallp1(pfftss_plan p)
{
long i;
int bsize;
MPI_Win_fence(MPI_MODE_NOPRECEDE, p->swin);
bsize = p->plx * p->ly * 2;
for (i = 1; i < p->npe; i++) {
int d;
d = (p->id + i) % p->npe;
MPI_Put(p->rb + bsize * d, bsize, MPI_DOUBLE, d,
bsize * p->id, bsize, MPI_DOUBLE, p->swin);
}
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < bsize; i++)
p->sb[bsize * p->id + i] = p->rb[bsize * p->id + i];
MPI_Win_fence(MPI_MODE_NOSUCCEED, p->swin);
}
#endif
static void alltoallv1(pfftss_plan p)
{
MPI_Alltoallv(p->rb, p->rsize, p->roff, MPI_DOUBLE,
p->sb, p->ssize, p->soff, MPI_DOUBLE,
p->comm);
}
static void alltoallvs1(pfftss_plan p)
{
long i;
for (i = 1; i < p->npe; i++) {
int d;
d = (p->id + i) % p->npe;
MPI_Irecv(p->sb + p->soff[d], p->ssize[d], MPI_DOUBLE, d,
MPI_ANY_TAG, p->comm, &(p->rreq[i - 1]));
}
for (i = 1; i < p->npe; i++) {
int d;
d = (p->id + i) % p->npe;
MPI_Isend(p->rb + p->roff[d], p->rsize[d], MPI_DOUBLE, d,
0, p->comm, &(p->sreq[i - 1]));
}
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < p->rsize[p->id]; i++)
p->sb[p->soff[p->id] + i] = p->rb[p->roff[p->id] + i];
MPI_Waitall(p->npe - 1, p->rreq, p->st);
MPI_Waitall(p->npe - 1, p->sreq, p->st);
}
#ifdef HAVE_MPI2
static void alltoallvp1(pfftss_plan p)
{
long i;
MPI_Win_fence(MPI_MODE_NOPRECEDE, p->swin);
for (i = 1; i < p->npe; i++) {
int d;
d = (p->id + i) % p->npe;
MPI_Put(p->rb + p->roff[d], p->rsize[d], MPI_DOUBLE, d,
p->rsoff[d], p->rsize[d], MPI_DOUBLE, p->swin);
}
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < p->rsize[p->id]; i++)
p->sb[p->soff[p->id] + i] = p->rb[p->roff[p->id] + i];
MPI_Win_fence(MPI_MODE_NOSUCCEED, p->swin);
}
#endif
static void pack0(pfftss_plan p, double *inout)
{
long i;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < p->npe; i++) {
long j, k;
long lxp;
double *src, *dst;
lxp = p->plx;
for (j = 0; j < p->ly; j++) {
src = inout + p->lx * i * 2 + p->py * j * 2;
dst = p->sb + p->ly * lxp * i * 2 + lxp * j * 2;
for (k = 0; k < p->lx * 2; k++)
dst[k] = src[k];
}
}
}
static void packv0(pfftss_plan p, double *inout)
{
long i;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < p->npe; i++) {
long j, k;
long lx, lxp;
double *src, *dst;
lx = p->oxp[i + 1] - p->oxp[i];
lxp = p->plx;
for (j = 0; j < p->ly; j++) {
src = inout + p->oxp[i] * 2 + p->py * j * 2;
dst = p->sb + p->soff[i] + lxp * j * 2;
for (k = 0; k < lx * 2; k++)
dst[k] = src[k];
}
}
}
static void unpack1(pfftss_plan p, double *inout)
{
long i;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < p->npe; i++) {
long j, k;
long lxp;
double *src, *dst;
lxp = p->plx;
for (j = 0; j < p->ly; j++) {
dst = inout + p->lx * i * 2 + p->py * j * 2;
src = p->sb + lxp * p->ly * i * 2 + lxp * j * 2;
for (k = 0; k < p->lx * 2; k++)
dst[k] = src[k];
}
}
}
static void unpackv1(pfftss_plan p, double *inout)
{
long i;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < p->npe; i++) {
long j, k;
long lx, lxp;
double *src, *dst;
lx = p->oxp[i + 1] - p->oxp[i];
lxp = p->plx;
for (j = 0; j < p->ly; j++) {
dst = inout + p->oxp[i] * 2 + p->py * j * 2;
src = p->sb + p->soff[i] + lxp * j * 2;
for (k = 0; k < lx * 2; k++)
dst[k] = src[k];
}
}
}
void pfftss_execute_pdft_2d_a(pfftss_plan p, double *inout)
{
trans_x(p, inout);
pack0(p, inout);
alltoall0(p);
trans_y(p);
alltoall1(p);
unpack1(p, inout);
return;
}
void pfftss_execute_pdft_2d_s(pfftss_plan p, double *inout)
{
trans_x(p, inout);
pack0(p, inout);
alltoalls0(p);
trans_y(p);
alltoalls1(p);
unpack1(p, inout);
return;
}
#ifdef HAVE_MPI2
void pfftss_execute_pdft_2d_p(pfftss_plan p, double *inout)
{
trans_x(p, inout);
pack0(p, inout);
alltoallp0(p);
trans_y(p);
alltoallp1(p);
unpack1(p, inout);
return;
}
#endif
void pfftss_execute_pdft_2d_va(pfftss_plan p, double *inout)
{
trans_x(p, inout);
packv0(p, inout);
alltoallv0(p);
trans_y(p);
alltoallv1(p);
unpackv1(p, inout);
return;
}
void pfftss_execute_pdft_2d_vs(pfftss_plan p, double *inout)
{
trans_x(p, inout);
packv0(p, inout);
alltoallvs0(p);
trans_y(p);
alltoallvs1(p);
unpackv1(p, inout);
return;
}
#ifdef HAVE_MPI2
void pfftss_execute_pdft_2d_vp(pfftss_plan p, double *inout)
{
trans_x(p, inout);
packv0(p, inout);
alltoallvp0(p);
trans_y(p);
alltoallvp1(p);
unpackv1(p, inout);
return;
}
#endif
void pfftss_execute(pfftss_plan p)
{
(*(p->fp))(p, p->inout);
return;
}
void pfftss_execute_dft(pfftss_plan p, double *inout)
{
(*(p->fp))(p, inout);
return;
}
void pfftss_destroy_plan(pfftss_plan p)
{
long i;
#ifdef HAVE_MPI_ALLOC_MEM
MPI_Free_mem(p->sb);
MPI_Free_mem(p->rb);
#else
fftss_free(p->sb);
fftss_free(p->rb);
#endif
free(p->oxp);
free(p->oyp);
if (p->soff) {
free(p->soff);
free(p->roff);
free(p->rsoff);
free(p->rroff);
free(p->ssize);
free(p->rsize);
}
free(p->sreq);
free(p->rreq);
free(p->st);
for (i = 0; i < omp_get_max_threads() * 2; i++)
fftss_free(p->b[i]);
free(p->b);
if (p->p1 != p->p2) fftss_destroy_plan(p->p2);
fftss_destroy_plan(p->p1);
#ifdef HAVE_MPI2
MPI_Win_free(&p->rwin);
MPI_Win_free(&p->swin);
#endif
free(p);
}
static double test_run(pfftss_plan p, void (*fp)(pfftss_plan, double *))
{
double t;
MPI_Barrier(p->comm);
t = fftss_get_wtime();
(*fp)(p, p->inout);
MPI_Barrier(p->comm);
t = fftss_get_wtime() - t;
if ((p->flags & FFTSS_VERBOSE) && p->id == 0) printf("%lf sec.\n", t);
return t;
}
pfftss_plan
pfftss_plan_dft_2d(long nx, long ny, long py, long oy, long ly,
double *inout, long sign, long flags, MPI_Comm comm)
{
pfftss_plan p;
long l;
long i;
double t0, t1;
p = (pfftss_plan)malloc(sizeof(pfftss_plan_s));
p->nx = nx; p->ny = ny; p->py = py;
p->oy = oy; p->ly = ly;
p->comm = comm;
p->flags = flags;
MPI_Comm_size(comm, &(p->npe));
MPI_Comm_rank(comm, &(p->id));
p->ox = nx * p->id / p->npe;
p->lx = nx * (p->id + 1) / p->npe - p->ox;
p->plx = (nx + p->npe - 1) / p->npe;
if ((p->plx & 1) == 0) p->plx++;
p->inout = inout;
l = (nx > ny) ? nx : ny;
p->b = (double **)malloc(sizeof(double *) * 2 * omp_get_max_threads());
for (i = 0; i < omp_get_max_threads(); i++) {
p->b[i * 2] = (double *)fftss_malloc(sizeof(double) * 2 * l);
p->b[i * 2 + 1] = (double *)fftss_malloc(sizeof(double) * 2 * l);
}
p->sreq = (MPI_Request *)malloc(sizeof(MPI_Request) * p->npe);
p->rreq = (MPI_Request *)malloc(sizeof(MPI_Request) * p->npe);
p->st = (MPI_Status *)malloc(sizeof(MPI_Status) * p->npe);
#ifdef _OPENMP
#pragma omp parallel private(i)
#endif
{
double *b0, *b1;
int id;
id = omp_get_thread_num();
b0 = p->b[id * 2];
b1 = p->b[id * 2 + 1];
for (i = 0; i < l * 2; i++) b0[i] = 0.0;
for (i = 0; i < l * 2; i++) b1[i] = 0.0;
}
p->p1 = fftss_plan_dft_1d(nx, p->b[0], p->b[1], sign, FFTSS_INOUT);
if (nx != ny)
p->p2 = fftss_plan_dft_1d(ny, p->b[0], p->b[1], sign, FFTSS_INOUT);
else p->p2 = p->p1;
p->oxp = (long *)malloc(sizeof(long) * (p->npe + 1));
p->oyp = (long *)malloc(sizeof(long) * (p->npe + 1));
MPI_Allgather(&(p->ox), 1, MPI_LONG, p->oxp, 1, MPI_LONG, comm);
MPI_Allgather(&(p->oy), 1, MPI_LONG, p->oyp, 1, MPI_LONG, comm);
p->oxp[p->npe] = nx;
p->oyp[p->npe] = ny;
l = p->ly;
for (i = 0; i < p->npe; i++)
if (p->oyp[i + 1] - p->oyp[i] != l) break;
if (nx % p->npe == 0 && ny % p->npe == 0 && i == p->npe) {
if ((p->flags & FFTSS_VERBOSE) && p->id == 0) printf("uniform mode\n");
p->soff = NULL;
p->roff = NULL;
p->rsoff = NULL;
p->rroff = NULL;
p->ssize = NULL;
p->rsize = NULL;
#ifdef HAVE_MPI_ALLOC_MEM
MPI_Alloc_mem(p->plx * p->ny * sizeof(double) * 2,
MPI_INFO_NULL, &(p->sb));
MPI_Alloc_mem(p->plx * p->ny * sizeof(double) * 2,
MPI_INFO_NULL, &(p->rb));
#else
p->sb =
(double *)fftss_malloc(p->plx * p->ny * sizeof(double) * 2);
p->rb =
(double *)fftss_malloc(p->plx * p->ny * sizeof(double) * 2);
#endif
#ifdef HAVE_MPI2
MPI_Win_create(p->sb,
p->plx * p->ny * sizeof(double) * 2,
sizeof(double), MPI_INFO_NULL, comm, &(p->swin));
MPI_Win_create(p->rb,
p->plx * p->ny * sizeof(double) * 2,
sizeof(double), MPI_INFO_NULL, comm, &(p->rwin));
#endif
p->fp = pfftss_execute_pdft_2d_a;
t0 = test_run(p, pfftss_execute_pdft_2d_a);
t1 = test_run(p, pfftss_execute_pdft_2d_a);
if (t1 < t0) { t0 = t1; }
t1 = test_run(p, pfftss_execute_pdft_2d_s);
if (t1 < t0) { p->fp = pfftss_execute_pdft_2d_s; t0 = t1; }
t1 = test_run(p, pfftss_execute_pdft_2d_s);
if (t1 < t0) { p->fp = pfftss_execute_pdft_2d_s; t0 = t1; }
#ifdef HAVE_MPI2
t1 = test_run(p, pfftss_execute_pdft_2d_p);
if (t1 < t0) { p->fp = pfftss_execute_pdft_2d_p; t0 = t1; }
t1 = test_run(p, pfftss_execute_pdft_2d_p);
if (t1 < t0) { p->fp = pfftss_execute_pdft_2d_p; t0 = t1; }
#endif
} else {
p->ssize = (int *)malloc(sizeof(int) * p->npe);
p->rsize = (int *)malloc(sizeof(int) * p->npe);
p->soff = (int *)malloc(sizeof(int) * (p->npe + 1));
p->roff = (int *)malloc(sizeof(int) * (p->npe + 1));
for (i = 0; i < p->npe; i++) {
p->ssize[i] = p->plx * ly * 2;
p->rsize[i] = (p->oyp[i + 1] - p->oyp[i]) * p->plx * 2;
}
p->soff[0] = 0; p->roff[0] = 0;
for (i = 0; i < p->npe; i++) {
p->soff[i + 1] = p->soff[i] + p->ssize[i];
p->roff[i + 1] = p->roff[i] + p->rsize[i];
}
#ifdef HAVE_MPI_ALLOC_MEM
MPI_Alloc_mem(p->soff[p->npe] * sizeof(double), MPI_INFO_NULL, &(p->sb));
MPI_Alloc_mem(p->roff[p->npe] * sizeof(double), MPI_INFO_NULL, &(p->rb));
#else
p->sb = (double *)fftss_malloc(p->soff[p->npe] * sizeof(double));
p->rb = (double *)fftss_malloc(p->roff[p->npe] * sizeof(double));
#endif
p->rroff = (int *)malloc(sizeof(int) * p->npe);
p->rsoff = (int *)malloc(sizeof(int) * p->npe);
MPI_Alltoall(p->roff, 1, MPI_INT, p->rroff, 1, MPI_INT, comm);
MPI_Alltoall(p->soff, 1, MPI_INT, p->rsoff, 1, MPI_INT, comm);
#ifdef HAVE_MPI2
MPI_Win_create(p->sb, p->soff[p->npe] * sizeof(double), sizeof(double),
MPI_INFO_NULL, comm, &(p->swin));
MPI_Win_create(p->rb, p->roff[p->npe] * sizeof(double), sizeof(double),
MPI_INFO_NULL, comm, &(p->rwin));
#endif
p->fp = pfftss_execute_pdft_2d_va;
t0 = test_run(p, pfftss_execute_pdft_2d_va);
t1 = test_run(p, pfftss_execute_pdft_2d_va);
if (t1 < t0) { t0 = t1; }
t1 = test_run(p, pfftss_execute_pdft_2d_vs);
if (t1 < t0) { p->fp = pfftss_execute_pdft_2d_vs; t0 = t1; }
t1 = test_run(p, pfftss_execute_pdft_2d_vs);
if (t1 < t0) { p->fp = pfftss_execute_pdft_2d_vs; t0 = t1; }
#ifdef HAVE_MPI2
t1 = test_run(p, pfftss_execute_pdft_2d_vp);
if (t1 < t0) { p->fp = pfftss_execute_pdft_2d_vp; t0 = t1; }
t1 = test_run(p, pfftss_execute_pdft_2d_vp);
if (t1 < t0) { p->fp = pfftss_execute_pdft_2d_vp; t0 = t1; }
#endif
}
return p;
}