/*
 * This C shared library source code is for interpolating
 * the 2D radially sampled FFT values of an image. It is used
 * in context with the gridRec algorithm which provides tomographic
 * reconstruction, but the subroutine could also be used in different context.
 * Signature: The function 'interpolateUsingPSWF' needs as input data:
 * - The 2D array 'fft1dOfSinogram' which contains the radially sampled data.
 * - The 1D convolvent 'lookupTableOfConvolventInFourierSpace' (The full 2D
 *   convolvent is the tensor product of two 1D convolvents).
 * - The size of the sinogram (numberOfAngularPositions, numberOfFrequencies)
 * - A constant 'C' related to the properties of the prolate spheroidal
 * wavefunctions. (The preferred choice of the convolvent).
 * - The array 'fft2dOfImage' (the FFT defined on the Cartesian grid) holds
 * the result in backward mode; in forward mode the result is accumulated
 * into 'fft1dOfSinogram'. Both arrays need to be allocated by the caller
 * and are passed by reference.
 * Return value is the exit status. (This should be improved, e.g.
 * there is no way to handle an array-out-of-bounds condition).
 * Note that the (logical) 2D arrays are implemented as (computational)
 * 1D arrays (so float* not float**), since this is the way numpy ndarrays are implemented.
 *
 * Compile with GNU C compiler:
 * gcc -fPIC -shared -Wall interpolate.c -o libinterpolate.so
 * With OpenMP support:
 * gcc -fPIC -O3 -fopenmp -shared -Wall interpolate.c -o libinterpolate.so
 * Type in shell 'export OMP_NUM_THREADS=<number of threads to use>' before running
 * If an Intel C compiler is available (e.g. 'module load intel/intel-13.0-1_intel64')
 * icc -fPIC -shared -Wall interpolate.c -o libinterpolate.so
 * On a TOMCAT compute node, this gives factor 2 performance gain.
 * (compared to standard (old) gcc, a newer gcc gives similar performance boost).
 * Numerically it is important that all three options (forward, backward, inverse)
 * share the same code base (e.g. if you combine them in an iterative reconstruction).
 * Author: alain.studer@psi.ch
 */
#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <complex.h>

int interpolateUsingPSWF(int doBackward, int doInverse, complex float* fft2dOfImage, complex float* fft1dOfSinogram, float* lookupTableOfConvolventInFourierSpace, int numberOfAngularPositions, float* angularArray, int numberOfSubsampledAngles, int* subsampledAngles, int numberOfFrequencies, float C)
{
//    printf("start running C code\n");
    const float pi = 3.14159265359;
    float supportLengthInPoints = 2.0*C/pi; //size of convolution support
    float halfSupportLengthInPoints = supportLengthInPoints/2.0;
    float tblspcg = (numberOfFrequencies/2)/halfSupportLengthInPoints; //lookup table length <= numberOfFrequencies/2
    int numberOfSupportNeighbours = ceil(halfSupportLengthInPoints);
    int cutOffFrequency = numberOfFrequencies/2 - 2*numberOfSupportNeighbours;
    int cutOffFrequencySquared = cutOffFrequency*cutOffFrequency;
    int thetaIndex, qIndex, subSampleIndex;
    float k, theta;
    float kx_p, ky_p;
    int kx_p_NN, ky_p_NN;
    int local_kx_range_min, local_ky_range_min;
    int local_kx_range_max, local_ky_range_max;
    int kx, ky;
    float delta_kx, delta_ky;
    float convolveArg_x, convolveArg_y;
    float weight_kx, weight_ky, weight;
    complex float contribution;
    int rowIndex, columnIndex;
    float ct,st;

    #pragma omp parallel for private(qIndex, k, theta, kx_p, ky_p, kx_p_NN, ky_p_NN, local_kx_range_min, local_ky_range_min, local_kx_range_max, local_ky_range_max, kx, ky, delta_kx, delta_ky, convolveArg_x, convolveArg_y, weight_kx, weight_ky, weight, contribution, rowIndex, columnIndex, ct,st)
    for (subSampleIndex = 0;  subSampleIndex < numberOfSubsampledAngles; subSampleIndex++){
        thetaIndex = subsampledAngles[subSampleIndex];
        theta = angularArray[thetaIndex];
        ct = cos(theta);
        st = sin(theta);
        for (qIndex = 0; qIndex < numberOfFrequencies; qIndex++){
            k = qIndex - numberOfFrequencies/2;
            kx_p = k*ct;
            ky_p = k*st;
            kx_p_NN = (int)rint(kx_p); //this is only to find the support neighborhood
            ky_p_NN = (int)rint(ky_p); //this is only to find the support neighborhood
            if (kx_p_NN*kx_p_NN + ky_p_NN*ky_p_NN > cutOffFrequencySquared){
                continue;
            }
            local_kx_range_min = kx_p_NN - numberOfSupportNeighbours;
            local_ky_range_min = ky_p_NN - numberOfSupportNeighbours;
            local_kx_range_max = kx_p_NN + numberOfSupportNeighbours + 1;
            local_ky_range_max = ky_p_NN + numberOfSupportNeighbours + 1;
            for (kx = local_kx_range_min; kx < local_kx_range_max; kx++){
                for (ky = local_ky_range_min; ky < local_ky_range_max; ky++){
                    delta_kx = kx_p - kx;
                    delta_ky = ky_p - ky;
                    convolveArg_x = tblspcg*fabs(delta_kx);
                    convolveArg_y = tblspcg*fabs(delta_ky);
                    weight_kx = lookupTableOfConvolventInFourierSpace[(int)rint(convolveArg_x)];
                    weight_ky = lookupTableOfConvolventInFourierSpace[(int)rint(convolveArg_y)];
                    weight = weight_ky*weight_kx; //convolution 2D-mask..
                    /*..tensor product of 1D convolution arrays, psi_fft(x,y) = psi_fft(x) psi_fft(y) */
                    rowIndex = ky + numberOfFrequencies/2;
                    columnIndex = kx + numberOfFrequencies/2;
                    if(doBackward){
                        contribution = fft1dOfSinogram[thetaIndex*numberOfFrequencies + qIndex]*weight;
                        if (doInverse){
                            contribution *= fabs(k);
                            /*multiplication with ramp filter fabs(k) is an idealization and does
                            not take discrete nature of FFT into account (See regridding paper by F. Marone) */
                        }
                    fft2dOfImage[rowIndex*numberOfFrequencies + columnIndex] += contribution;
                    }
                    else{
                        contribution = fft2dOfImage[rowIndex*numberOfFrequencies + columnIndex]*weight;
                        fft1dOfSinogram[thetaIndex*numberOfFrequencies + qIndex] += contribution;
                    }
                }
            }
        }
    }
//    printf("finished running C code\n");
    return 0;
}

