Logo Search packages:      
Sourcecode: k6fftwgel version File versions  Download package

fftwnd.c

/*
 * Copyright (c) 1997-1999 Massachusetts Institute of Technology
 * Copyright (c) 2000-2001 Stefan Kral
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#include <fftw-int.h>
#include <assert.h>

/* the number of buffers to use for buffered transforms: */
#define FFTWND_NBUFFERS 8

/* the default number of buffers to use: */
#define FFTWND_DEFAULT_NBUFFERS 0

/* the number of "padding" elements between consecutive buffer lines */
#define FFTWND_BUFFER_PADDING 8

static void destroy_plan_array(int rank, fftw_plan *plans);

static void init_test_array(fftw_complex *arr, int stride, int n)
{
     int j;

     for (j = 0; j < n; ++j) {
        c_re(arr[stride * j]) = 0.0;
        c_im(arr[stride * j]) = 0.0;
     }
}

/*
 * Same as fftw_measure_runtime, except for fftwnd plan.
 */
double fftwnd_measure_runtime(fftwnd_plan plan,
                        fftw_complex *in, int istride,
                        fftw_complex *out, int ostride)
{
     fftw_time begin, end, start;
     double t, tmax, tmin;
     int i, iter;
     int n;
     int repeat;

     if (plan->rank == 0)
        return 0.0;

     n = 1;
     for (i = 0; i < plan->rank; ++i)
        n *= plan->n[i];

     iter = 1;

     for (;;) {
        tmin = 1.0E10;
        tmax = -1.0E10;
        init_test_array(in, istride, n);

        start = fftw_get_time();
        /* repeat the measurement FFTW_TIME_REPEAT times */
        for (repeat = 0; repeat < FFTW_TIME_REPEAT; ++repeat) {
             begin = fftw_get_time();
             for (i = 0; i < iter; ++i) {
                fftwnd(plan, 1, in, istride, 0, out, ostride, 0);
             }
             end = fftw_get_time();

             t = fftw_time_to_sec(fftw_time_diff(end, begin));
             if (t < tmin)
                tmin = t;
             if (t > tmax)
                tmax = t;

             /* do not run for too long */
             t = fftw_time_to_sec(fftw_time_diff(end, start));
             if (t > FFTW_TIME_LIMIT)
                break;
        }

        if (tmin >= FFTW_TIME_MIN)
             break;

        iter *= 2;
     }

     tmin /= (double) iter;
     tmax /= (double) iter;

     return tmin;
}

/********************** Initializing the FFTWND Plan ***********************/

/* Initialize everything except for the 1D plans and the work array: */
fftwnd_plan fftwnd_create_plan_aux(int rank, const int *n,
                           fftw_direction dir, int flags)
{
     int i;
     fftwnd_plan p;

     if (rank < 0)
        return 0;

     for (i = 0; i < rank; ++i)
        if (n[i] <= 0)
             return 0;

     p = (fftwnd_plan) fftw_malloc(sizeof(fftwnd_data));
     p->n = 0;
     p->n_before = 0;
     p->n_after = 0;
     p->plans = 0;
     p->work = 0;
     p->dir = dir;

     p->rank = rank;
     p->is_in_place = flags & FFTW_IN_PLACE;

     p->nwork = 0;
     p->nbuffers = 0;

     if (rank == 0)
        return 0;

     p->n = (int *) fftw_malloc(sizeof(int) * rank);
     p->n_before = (int *) fftw_malloc(sizeof(int) * rank);
     p->n_after = (int *) fftw_malloc(sizeof(int) * rank);
     p->n_before[0] = 1;
     p->n_after[rank - 1] = 1;

     for (i = 0; i < rank; ++i) {
        p->n[i] = n[i];

        if (i) {
             p->n_before[i] = p->n_before[i - 1] * n[i - 1];
             p->n_after[rank - 1 - i] = p->n_after[rank - i] * n[rank - i];
        }
     }

     return p;
}

/* create an empty new array of rank 1d plans */
fftw_plan *fftwnd_new_plan_array(int rank)
{
     fftw_plan *plans;
     int i;

     plans = (fftw_plan *) fftw_malloc(rank * sizeof(fftw_plan));
     if (!plans)
        return 0;
     for (i = 0; i < rank; ++i)
        plans[i] = 0;
     return plans;
}

/* 
 * create an array of plans using the ordinary 1d fftw_create_plan,
 * which allocates its own array and creates plans optimized for
 * contiguous data. 
 */
fftw_plan *fftwnd_create_plans_generic(fftw_plan *plans,
                               int rank, const int *n,
                               fftw_direction dir, int flags)
{
     if (rank <= 0)
        return 0;

     if (plans) {
        int i, j;
        int cur_flags;

        for (i = 0; i < rank; ++i) {
             if (i < rank - 1 || (flags & FFTW_IN_PLACE)) {
                /* 
                 * fft's except the last dimension are always in-place 
                 */
                cur_flags = flags | FFTW_IN_PLACE;
                for (j = i - 1; j >= 0 && n[i] != n[j]; --j);
             } else {
                cur_flags = flags;
                /* 
                 * we must create a separate plan for the last
                 * dimension 
                 */
                j = -1;
             }

             if (j >= 0) {
                /* 
                 * If a plan already exists for this size
                 * array, reuse it: 
                 */
                plans[i] = plans[j];
             } else {
                /* generate a new plan: */
                plans[i] = fftw_create_plan(n[i], dir, cur_flags);
                if (!plans[i]) {
                   destroy_plan_array(rank, plans);
                   return 0;
                }
             }
        }
     }
     return plans;
}

static int get_maxdim(int rank, const int *n, int flags)
{
     int i;
     int maxdim = 0;

     for (i = 0; i < rank - 1; ++i)
        if (n[i] > maxdim)
             maxdim = n[i];
     if (rank > 0 && flags & FFTW_IN_PLACE && n[rank - 1] > maxdim)
        maxdim = n[rank - 1];

     return maxdim;
}

/* compute number of elements required for work array (has to
   be big enough to hold ncopies of the largest dimension in
   n that will need an in-place transform. */
int fftwnd_work_size(int rank, const int *n, int flags, int ncopies)
{
     return (ncopies * get_maxdim(rank, n, flags)
           + (ncopies - 1) * FFTWND_BUFFER_PADDING);
}

/*
 * create plans using the fftw_create_plan_specific planner, which
 * allows us to create plans for each dimension that are specialized
 * for the strides that we are going to use. 
 */
fftw_plan *fftwnd_create_plans_specific(fftw_plan *plans,
                              int rank, const int *n,
                              const int *n_after,
                              fftw_direction dir, int flags,
                              fftw_complex *in, int istride,
                              fftw_complex *out, int ostride)
{
     if (rank <= 0)
        return 0;

     if (plans) {
        int i, stride, cur_flags;
        fftw_complex *work = 0;
        int nwork;

        nwork = fftwnd_work_size(rank, n, flags, 1);
        if (nwork)
             work = (fftw_complex*)fftw_malloc(nwork * sizeof(fftw_complex));

        for (i = 0; i < rank; ++i) {
             /* fft's except the last dimension are always in-place */
             if (i < rank - 1)
                cur_flags = flags | FFTW_IN_PLACE;
             else
                cur_flags = flags;

             /* stride for transforming ith dimension */
             stride = n_after[i];

             if (cur_flags & FFTW_IN_PLACE)
                plans[i] = fftw_create_plan_specific(n[i], dir, cur_flags,
                                        in, istride * stride,
                                           work, 1);
             else
                plans[i] = fftw_create_plan_specific(n[i], dir, cur_flags,
                                        in, istride * stride,
                                      out, ostride * stride);
             if (!plans[i]) {
                destroy_plan_array(rank, plans);
                fftw_free(work);
                return 0;
             }
        }

        if (work)
             fftw_free(work);
     }
     return plans;
}

/*
 * Create an fftwnd_plan specialized for specific arrays.  (These
 * arrays are ignored, however, if they are NULL or if the flags do
 * not include FFTW_MEASURE.)  The main advantage of being provided
 * arrays like this is that we can do runtime timing measurements of
 * our options, without worrying about allocating excessive scratch
 * space.
 */
fftwnd_plan fftwnd_create_plan_specific(int rank, const int *n,
                              fftw_direction dir, int flags,
                              fftw_complex *in, int istride,
                              fftw_complex *out, int ostride)
{
     fftwnd_plan p;

     if (!(p = fftwnd_create_plan_aux(rank, n, dir, flags)))
        return 0;

     if (!(flags & FFTW_MEASURE) || in == 0
       || (!p->is_in_place && out == 0)) {

/**** use default plan ****/

        p->plans = fftwnd_create_plans_generic(fftwnd_new_plan_array(rank),
                                     rank, n, dir, flags);
        if (!p->plans) {
             fftwnd_destroy_plan(p);
             return 0;
        }

        if (flags & FFTWND_FORCE_BUFFERED)
             p->nbuffers = FFTWND_NBUFFERS;
        else
             p->nbuffers = FFTWND_DEFAULT_NBUFFERS;

        p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
        if (p->nwork && !(flags & FFTW_THREADSAFE)) {
             p->work = (fftw_complex*) fftw_malloc(p->nwork 
                                         * sizeof(fftw_complex));
             if (!p->work) {
                fftwnd_destroy_plan(p);
                return 0;
             }
        }
     } else {
/**** use runtime measurements to pick plan ****/

        fftw_plan *plans_buf, *plans_nobuf;
        double t_buf, t_nobuf;

        p->nwork = fftwnd_work_size(rank, n, flags, FFTWND_NBUFFERS + 1);
        if (p->nwork && !(flags & FFTW_THREADSAFE)) {
             p->work = (fftw_complex*) fftw_malloc(p->nwork 
                                         * sizeof(fftw_complex));
             if (!p->work) {
                fftwnd_destroy_plan(p);
                return 0;
             }
        }
        else
             p->work = (fftw_complex*) NULL;

        /* two possible sets of 1D plans: */
        plans_buf = fftwnd_create_plans_generic(fftwnd_new_plan_array(rank),
                                      rank, n, dir, flags);
        plans_nobuf = 
             fftwnd_create_plans_specific(fftwnd_new_plan_array(rank),
                                  rank, n, p->n_after, dir,
                                  flags, in, istride,
                                  out, ostride);
        if (!plans_buf || !plans_nobuf) {
             destroy_plan_array(rank, plans_nobuf);
             destroy_plan_array(rank, plans_buf);
             fftwnd_destroy_plan(p);
             return 0;
        }
        /* time the two possible plans */
        p->plans = plans_nobuf;
        p->nbuffers = 0;
        p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
        t_nobuf = fftwnd_measure_runtime(p, in, istride, out, ostride);
        p->plans = plans_buf;
        p->nbuffers = FFTWND_NBUFFERS;
        p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
        t_buf = fftwnd_measure_runtime(p, in, istride, out, ostride);

        /* pick the better one: */
        if (t_nobuf < t_buf) {      /* use unbuffered transform */
             p->plans = plans_nobuf;
             p->nbuffers = 0;

             /* work array is unnecessarily large */
             if (p->work)
                fftw_free(p->work);
             p->work = 0;

             destroy_plan_array(rank, plans_buf);

             /* allocate a work array of the correct size: */
             p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
             if (p->nwork && !(flags & FFTW_THREADSAFE)) {
                p->work = (fftw_complex*) fftw_malloc(p->nwork 
                                           * sizeof(fftw_complex));
                if (!p->work) {
                   fftwnd_destroy_plan(p);
                   return 0;
                }
             }
        } else {        /* use buffered transform */
             destroy_plan_array(rank, plans_nobuf);
        }
     }

     return p;
}

fftwnd_plan fftw2d_create_plan_specific(int nx, int ny,
                              fftw_direction dir, int flags,
                              fftw_complex *in, int istride,
                              fftw_complex *out, int ostride)
{
     int n[2];

     n[0] = nx;
     n[1] = ny;

     return fftwnd_create_plan_specific(2, n, dir, flags,
                              in, istride, out, ostride);
}

fftwnd_plan fftw3d_create_plan_specific(int nx, int ny, int nz,
                              fftw_direction dir, int flags,
                              fftw_complex *in, int istride,
                              fftw_complex *out, int ostride)
{
     int n[3];

     n[0] = nx;
     n[1] = ny;
     n[2] = nz;

     return fftwnd_create_plan_specific(3, n, dir, flags,
                              in, istride, out, ostride);
}

/* Create a generic fftwnd plan: */

fftwnd_plan fftwnd_create_plan(int rank, const int *n,
                         fftw_direction dir, int flags)
{
     return fftwnd_create_plan_specific(rank, n, dir, flags, 0, 1, 0, 1);
}

fftwnd_plan fftw2d_create_plan(int nx, int ny,
                         fftw_direction dir, int flags)
{
     return fftw2d_create_plan_specific(nx, ny, dir, flags, 0, 1, 0, 1);
}

fftwnd_plan fftw3d_create_plan(int nx, int ny, int nz,
                         fftw_direction dir, int flags)
{
     return fftw3d_create_plan_specific(nx, ny, nz, dir, flags, 0, 1, 0, 1);
}

/************************ Freeing the FFTWND Plan ************************/

static void destroy_plan_array(int rank, fftw_plan *plans)
{
     if (plans) {
        int i, j;

        for (i = 0; i < rank; ++i) {
             for (j = i - 1;
                j >= 0 && plans[i] != plans[j];
                --j);
             if (j < 0 && plans[i])
                fftw_destroy_plan(plans[i]);
        }
        fftw_free(plans);
     }
}

void fftwnd_destroy_plan(fftwnd_plan plan)
{
     if (plan) {
        destroy_plan_array(plan->rank, plan->plans);

        if (plan->n)
             fftw_free(plan->n);

        if (plan->n_before)
             fftw_free(plan->n_before);

        if (plan->n_after)
             fftw_free(plan->n_after);

        if (plan->work)
             fftw_free(plan->work);

        fftw_free(plan);
     }
}

/************************ Printing the FFTWND Plan ************************/

void fftwnd_fprint_plan(FILE *f, fftwnd_plan plan)
{
     if (plan) {
        int i, j;

        if (plan->rank == 0) {
             fprintf(f, "plan for rank 0 (null) transform.\n");
             return;
        }
        fprintf(f, "plan for ");
        for (i = 0; i < plan->rank; ++i)
             fprintf(f, "%s%d", i ? "x" : "", plan->n[i]);
        fprintf(f, " transform:\n");

        if (plan->nbuffers > 0)
             fprintf(f, "  -- using buffered transforms (%d buffers)\n",
                   plan->nbuffers);
        else
             fprintf(f, "  -- using unbuffered transform\n");

        for (i = 0; i < plan->rank; ++i) {
             fprintf(f, "* dimension %d (size %d) ", i, plan->n[i]);

             for (j = i - 1; j >= 0; --j)
                if (plan->plans[j] == plan->plans[i])
                   break;

             if (j < 0)
                fftw_fprint_plan(f, plan->plans[i]);
             else
                fprintf(f, "plan is same as dimension %d plan.\n", j);
        }
     }
}

void fftwnd_print_plan(fftwnd_plan plan)
{
  fftwnd_fprint_plan(stdout, plan);
}

/********************* Buffered FFTW (in-place) *********************/

#if FFTW_ENABLE_FLOAT && (FFTWND_NBUFFERS <= 8)  

/* accelerated version of fftw_buffered */

/* 0 */
void fftw_buffered_copyto0(double *data, double *buffers, 
                     int istride, int idist, int nb, int n)
{
  assert(1==2);
}

void fftw_buffered_copyfrom0(double *data, double *buffers, 
                       int istride, int idist, int nb, int n)
{
  assert(2==1);
}


/* 1 */
void fftw_buffered_copyto1(double *data, double *buffers, 
                     int istride, int idist, int nb, int n)
{
  do {
    buffers[0] = data[0];

    buffers++;
    data += istride;
    n--;
  } while (n != 0);
}

void fftw_buffered_copyfrom1(double *data, double *buffers,
                       int istride, int idist, int nb, int n)
{
  do {
    data[0] = buffers[0];

    buffers++;
    data += istride; 
    n--;
  } while (n != 0);
}

/* 2 */
void fftw_buffered_copyto2(double *data, double *buffers,
                     int istride, int idist, int nb, int n)
{
  do {
    buffers[0]  = data[0];
    buffers[nb] = data[idist];

    buffers++;
    data += istride;
    n--;
  } while (n != 0);
}

void fftw_buffered_copyfrom2(double *data, double *buffers,
                       int istride, int idist, int nb, int n)
{
  do {
    data[0] = buffers[0];
    data[idist] = buffers[nb];

    buffers++;
    data += istride; 
    n--;
  } while (n != 0);
}

/* 3 */
void fftw_buffered_copyto3(double *data, double *buffers,
                     int istride, int idist, int nb, int n)
{
  do {
    buffers[0] = data[0];
    buffers[nb] = data[idist];
    buffers[2*nb] = data[2*idist];

    buffers++;
    data += istride;
    n--;
  } while (n != 0);
}

void fftw_buffered_copyfrom3(double *data, double *buffers,
                       int istride, int idist, int nb, int n)
{
  do {
    data[0] = buffers[0];
    data[idist] = buffers[nb];
    data[2*idist] = buffers[2*nb];

    buffers++;
    data += istride; 
    n--;
  } while (n != 0);
}

/* 4 */
void fftw_buffered_copyto4(double *data, double *buffers,
                     int istride, int idist, int nb, int n)
{
  do {
    double *cur_data = data;
    double *cur_buf = buffers;

    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;

    buffers++;
    data += istride;
    n--;
  } while (n != 0);
}

void fftw_buffered_copyfrom4(double *data, double *buffers,
                       int istride, int idist, int nb, int n)
{
  do {
    double *cur_data = data;
    double *cur_buf = buffers;

    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;

    buffers++;
    data += istride; 
    n--;
  } while (n != 0);
}

/* 5 */
void fftw_buffered_copyto5(double *data, double *buffers,
                     int istride, int idist, int nb, int n)
{
  do {
    double *cur_data = data;
    double *cur_buf = buffers;

    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;

    buffers++;
    data += istride;
    n--;
  } while (n != 0);
}

void fftw_buffered_copyfrom5(double *data, double *buffers,
                       int istride, int idist, int nb, int n)
{
  do {
    double *cur_data = data;
    double *cur_buf = buffers;

    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;

    buffers++;
    data += istride; 
    n--;
  } while (n != 0);
}

/* 6 */
void fftw_buffered_copyto6(double *data, double *buffers,
                     int istride, int idist, int nb, int n)
{
  do {
    double *cur_data = data;
    double *cur_buf = buffers; 

    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;

    buffers++;
    data += istride;
    n--;
  } while (n != 0);
}

void fftw_buffered_copyfrom6(double *data, double *buffers,
                       int istride, int idist, int nb, int n)
{
  do {
    double *cur_data = data;
    double *cur_buf = buffers;

    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;

    buffers++;
    data += istride; 
    n--;
  } while (n != 0);
}
 
/* 7 */
void fftw_buffered_copyto7(double *data, double *buffers,
                     int istride, int idist, int nb, int n)
{
  do {
    double *cur_data = data; 
    double *cur_buf = buffers;

    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;

    buffers++;
    data += istride;
    n--;
  } while (n != 0);
}

void fftw_buffered_copyfrom7(double *data, double *buffers,
                       int istride, int idist, int nb, int n)
{
  do {
    double *cur_data = data;
    double *cur_buf = buffers;

    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;

    buffers++;
    data += istride; 
    n--;
  } while (n != 0);
}

/* 8 */
void fftw_buffered_copyto8(double *data, double *buffers,
                     int istride, int idist, int nb, int n)
{
  do {
    double *cur_data = data;
    double *cur_buf = buffers;

    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;
    *cur_buf = *cur_data, cur_buf += nb, cur_data += idist;

    buffers++;
    data += istride;
    n--;
  } while (n != 0);
}

void fftw_buffered_copyfrom8(double *data, double *buffers,
                       int istride, int idist, int nb, int n)
{
  do {
    double *cur_data = data;
    double *cur_buf = buffers;

    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;
    *cur_data = *cur_buf, cur_data += idist, cur_buf += nb;

    buffers++;
    data += istride; 
    n--;
  } while (n != 0);
}


typedef void (fncopyto)(double *, double *, int, int, int, int);
typedef void (fncopyfrom)(double *, double *, int, int, int, int);

static fncopyto *fnsCopyTo[] = { 
  &fftw_buffered_copyto0, &fftw_buffered_copyto1, &fftw_buffered_copyto2,
  &fftw_buffered_copyto3, &fftw_buffered_copyto4, &fftw_buffered_copyto5,
  &fftw_buffered_copyto6, &fftw_buffered_copyto7, &fftw_buffered_copyto8
};

static fncopyfrom *fnsCopyFrom[] = { 
  &fftw_buffered_copyfrom0, &fftw_buffered_copyfrom1, &fftw_buffered_copyfrom2,
  &fftw_buffered_copyfrom3, &fftw_buffered_copyfrom4, &fftw_buffered_copyfrom5,
  &fftw_buffered_copyfrom6, &fftw_buffered_copyfrom7, &fftw_buffered_copyfrom8
};

void fftw_buffered(fftw_plan p, int howmany,
               fftw_complex *in, int istride, int idist,
               fftw_complex *work,
               int nbuffers, fftw_complex *buffers)
{
  int i = 0, n, nb;

  assert(nbuffers <= 8);

  n = p->n;
  nb = n + FFTWND_BUFFER_PADDING;

  do {
    for (; i <= howmany - nbuffers; i += nbuffers) {
      double *cur_in = (double *) in + i * idist;

      /* 
       * First, copy nbuffers strided arrays to the contiguous buffer arrays
       * (reading consecutive locations, assuming that idist is 1):
       */
      (fnsCopyTo[nbuffers])(cur_in,(double *) buffers,istride,idist,nb,n);

      /* 
       * Now, compute the FFTs in the buffers (in-place using work): 
       */
      fftw(p, nbuffers, buffers, 1, nb, work, 1, 0);
      
      /* 
       * Finally, copy the results back from the contiguous buffers to the
       * strided arrays (writing consecutive locations):
       */
      (fnsCopyFrom[nbuffers])(cur_in,(double *) buffers,istride,idist,nb,n);
    }
    
    /* 
     * we skip howmany % nbuffers ffts at the end of the loop,
     * so we have to go back and do them: 
     */
    nbuffers = howmany - i;
  } while (i < howmany);
}

#else

/* original version of fftw_buffered */
void fftw_buffered(fftw_plan p, int howmany,
               fftw_complex *in, int istride, int idist,
               fftw_complex *work,
               int nbuffers, fftw_complex *buffers)
{
     int i = 0, n, nb;

     n = p->n;
     nb = n + FFTWND_BUFFER_PADDING;

     do {
        for (; i <= howmany - nbuffers; i += nbuffers) {
             fftw_complex *cur_in = in + i * idist;
             int j, buf;

             /* 
              * First, copy nbuffers strided arrays to the
              * contiguous buffer arrays (reading consecutive
              * locations, assuming that idist is 1):
              */
             for (j = 0; j < n; ++j) {
                fftw_complex *cur_in2 = cur_in + j * istride;
                fftw_complex *cur_buffers = buffers + j;

                for (buf = 0; buf <= nbuffers - 4; buf += 4) {
                   *cur_buffers = *cur_in2;
                   *(cur_buffers += nb) = *(cur_in2 += idist);
                   *(cur_buffers += nb) = *(cur_in2 += idist);
                   *(cur_buffers += nb) = *(cur_in2 += idist);
                   cur_buffers += nb;
                   cur_in2 += idist;
                }
                for (; buf < nbuffers; ++buf) {
                   *cur_buffers = *cur_in2;
                   cur_buffers += nb;
                   cur_in2 += idist;
                }
             }

             /* 
              * Now, compute the FFTs in the buffers (in-place
              * using work): 
              */
             fftw(p, nbuffers, buffers, 1, nb, work, 1, 0);

             /* 
              * Finally, copy the results back from the contiguous
              * buffers to the strided arrays (writing consecutive
              * locations):
              */
             for (j = 0; j < n; ++j) {
                fftw_complex *cur_in2 = cur_in + j * istride;
                fftw_complex *cur_buffers = buffers + j;

                for (buf = 0; buf <= nbuffers - 4; buf += 4) {
                   *cur_in2 = *cur_buffers;
                   *(cur_in2 += idist) = *(cur_buffers += nb);
                   *(cur_in2 += idist) = *(cur_buffers += nb);
                   *(cur_in2 += idist) = *(cur_buffers += nb);
                   cur_buffers += nb;
                   cur_in2 += idist;
                }
                for (; buf < nbuffers; ++buf) {
                   *cur_in2 = *cur_buffers;
                   cur_buffers += nb;
                   cur_in2 += idist;
                }
             }
        }

        /* 
         * we skip howmany % nbuffers ffts at the end of the loop,
         * so we have to go back and do them: 
         */
        nbuffers = howmany - i;
     } while (i < howmany);
}
#endif


/********************* Computing the N-Dimensional FFT *********************/

void fftwnd_aux(fftwnd_plan p, int cur_dim,
            fftw_complex *in, int istride,
            fftw_complex *out, int ostride,
            fftw_complex *work)
{
     int n_after = p->n_after[cur_dim], n = p->n[cur_dim];

     if (cur_dim == p->rank - 2) {
        /* just do the last dimension directly: */
        if (p->is_in_place)
             fftw(p->plans[p->rank - 1], n,
                in, istride, n_after * istride,
                work, 1, 0);
        else
             fftw(p->plans[p->rank - 1], n,
                in, istride, n_after * istride,
                out, ostride, n_after * ostride);
     } else {                 /* we have at least two dimensions to go */
        int i;

        /* 
         * process the subsequent dimensions recursively, in hyperslabs,
         * to get maximum locality: 
         */
        for (i = 0; i < n; ++i)
             fftwnd_aux(p, cur_dim + 1,
                    in + i * n_after * istride, istride,
                    out + i * n_after * ostride, ostride, work);
     }

     /* do the current dimension (in-place): */
     if (p->nbuffers == 0) {
        fftw(p->plans[cur_dim], n_after,
             out, n_after * ostride, ostride,
             work, 1, 0);
     } else             /* using contiguous copy buffers: */
        fftw_buffered(p->plans[cur_dim], n_after,
                  out, n_after * ostride, ostride,
                  work, p->nbuffers, work + n);
}

/*
 * alternate version of fftwnd_aux -- this version pushes the howmany
 * loop down to the leaves of the computation, for greater locality in
 * cases where dist < stride
 */
void fftwnd_aux_howmany(fftwnd_plan p, int cur_dim,
                  int howmany,
                  fftw_complex *in, int istride, int idist,
                  fftw_complex *out, int ostride, int odist,
                  fftw_complex *work)
{
     int n_after = p->n_after[cur_dim], n = p->n[cur_dim];
     int k;

     if (cur_dim == p->rank - 2) {
        /* just do the last dimension directly: */
        if (p->is_in_place)
             for (k = 0; k < n; ++k)
                fftw(p->plans[p->rank - 1], howmany,
                   in + k * n_after * istride, istride, idist,
                   work, 1, 0);
        else
             for (k = 0; k < n; ++k)
                fftw(p->plans[p->rank - 1], howmany,
                   in + k * n_after * istride, istride, idist,
                   out + k * n_after * ostride, ostride, odist);
     } else {                 /* we have at least two dimensions to go */
        int i;

        /* 
         * process the subsequent dimensions recursively, in
         * hyperslabs, to get maximum locality:
         */
        for (i = 0; i < n; ++i)
             fftwnd_aux_howmany(p, cur_dim + 1, howmany,
                        in + i * n_after * istride, istride, idist,
                          out + i * n_after * ostride, ostride, odist,
                          work);
     }

     /* do the current dimension (in-place): */
     if (p->nbuffers == 0)
        for (k = 0; k < n_after; ++k)
             fftw(p->plans[cur_dim], howmany,
                out + k * ostride, n_after * ostride, odist,
                work, 1, 0);
     else               /* using contiguous copy buffers: */
        for (k = 0; k < n_after; ++k)
             fftw_buffered(p->plans[cur_dim], howmany,
                       out + k * ostride, n_after * ostride, odist,
                       work, p->nbuffers, work + n);
}

void fftwnd(fftwnd_plan p, int howmany,
          fftw_complex *in, int istride, int idist,
          fftw_complex *out, int ostride, int odist)
{
     fftw_complex *work;

#ifdef FFTW_DEBUG
     if (p->rank > 0 && (p->plans[0]->flags & FFTW_THREADSAFE)
       && p->nwork && p->work)
        fftw_die("bug with FFTW_THREADSAFE flag\n");
#endif

     if (p->nwork && !p->work)
        work = (fftw_complex *) fftw_malloc(p->nwork * sizeof(fftw_complex));
     else
        work = p->work;

     switch (p->rank) {
       case 0:
            break;
       case 1:
            if (p->is_in_place)     /* fft is in-place */
               fftw(p->plans[0], howmany, in, istride, idist,
                  work, 1, 0);
            else
               fftw(p->plans[0], howmany, in, istride, idist,
                  out, ostride, odist);
            break;
       default:         /* rank >= 2 */
            {
               if (p->is_in_place) {
                  out = in;
                  ostride = istride;
                  odist = idist;
               }
               if (howmany > 1 && odist < ostride)
                  fftwnd_aux_howmany(p, 0, howmany,
                                 in, istride, idist,
                                 out, ostride, odist,
                                 work);
               else {
                  int i;

                  for (i = 0; i < howmany; ++i)
                       fftwnd_aux(p, 0,
                              in + i * idist, istride,
                              out + i * odist, ostride,
                              work);
               }
            }
     }

     if (p->nwork && !p->work)
        fftw_free(work);

}

void fftwnd_one(fftwnd_plan p, fftw_complex *in, fftw_complex *out)
{
     fftwnd(p, 1, in, 1, 1, out, 1, 1);
}

Generated by  Doxygen 1.6.0   Back to index