
#include "defines.hpp"
#include "math_helper.cuh"
#include "cuda_kernel_utils.cuh"

#include "compute_smooth_weights.cuh"

///////////////////////// GPU

/**
 *  GPU kernel to compute the smooth weights on the gpu.
 *  For each neighbor pair the kernel computes the length l of the 
 *  point-sample difference scaled by the inverse radii, and writes 
 *  w = 3*t^2 - 2*t^3 with t = 1 - max(4*(l - 0.75), 0). The weight is 1 
 *  for l <= 0.75 and 0 at l = 1.
 *  NOTE(review): t is not clamped from below, so l > 1 yields weights
 *  greater than zero again; presumably neighbors always satisfy 
 *  l <= 1 - confirm against the neighbor search.
 *  All arithmetic is carried out in single precision.
 *  @param  pNumSamples         Number of samples (not read by the kernel).
 *  @param  pNumNeighbors       Number of neighbors.
 *  @param  pInvRadii           Inverse radii used to select the 
 *      neighbors. Only the first element is read.
 *  @param  pPts                Array of points.
 *  @param  pSamples            Array of samples.
 *  @param  pNeighbors          Array of neighbors. Component x indexes 
 *      pPts and component y indexes pSamples.
 *  @param  pNeighIndexXSample  Indices of neighbors x sample (not read 
 *      by the kernel).
 *  @param  pOutSmoothW         Output array with the smooth weights, one
 *      value per neighbor.
 *  @paramt D                   Number of dimensions. 
 */
template<int D>
__global__ void compute_smooth_weights_gpu_kernel(
    const unsigned int pNumSamples,
    const unsigned int pNumNeighbors,
    const mccnn::fpoint<D>* __restrict__ pInvRadii,
    const mccnn::fpoint<D>* __restrict__ pPts,
    const mccnn::fpoint<D>* __restrict__ pSamples,
    const int2* __restrict__ pNeighbors,
    const int* __restrict__ pNeighIndexXSample,
    float* __restrict__ pOutSmoothW)
{
    //Get the global thread index.
    int iniPtIndex = mccnn::compute_global_index_gpu_funct();
    int totalThreads = mccnn::compute_total_threads_gpu_funct();

    //Each thread starts at its own index and advances by the total 
    //number of threads until all the neighbors are processed.
    for(unsigned int curIter = iniPtIndex; 
        curIter < pNumNeighbors; 
        curIter += totalThreads)
    {
        //Get the index coordinates of the point and the center.
        int2 ptIds = pNeighbors[curIter];

        //Get the distance between the points.
        mccnn::fpoint<D> ptDiff = (pPts[ptIds.x] - pSamples[ptIds.y])*pInvRadii[0];
        float length = mccnn::length(ptDiff);

        //Compute the weight value. Float literals are used to avoid the 
        //promotion of the whole expression to double precision.
        float t = 1.0f - fmaxf((length - 0.75f)*4.0f, 0.0f);
        float squareT = t*t;
        float smoothW = 3.0f*squareT - 2.0f*squareT*t;

        //Save the smooth weight.
        pOutSmoothW[curIter] = smoothW;
    }
}

/**
 *  GPU kernel to compute gradients of the point wrt the smooth weights
 *  on the gpu.
 *  The gradients are added to the output arrays with atomicAdd, so the
 *  previous content of the output arrays is part of the result.
 *  All arithmetic is carried out in single precision.
 *  @param  pNumSamples         Number of samples (not read by the kernel).
 *  @param  pNumNeighbors       Number of neighbors.
 *  @param  pInvRadii           Inverse radii used to select the 
 *      neighbors. Only the first element is read.
 *  @param  pPts                Array of points.
 *  @param  pSamples            Array of samples.
 *  @param  pNeighbors          Array of neighbors. Component x indexes 
 *      pPts/pOutPtGrads and component y indexes pSamples/pOutSampleGrads.
 *  @param  pNeighIndexXSample  Indices of neighbors x sample (not read 
 *      by the kernel).
 *  @param  pSmoothWGrads       Input smooth weights gradients, one value
 *      per neighbor.
 *  @param  pOutPtGrads         Output array with the point gradients.
 *  @param  pOutSampleGrads     Output array with the sample point gradients.
 *  @paramt D                   Number of dimensions. 
 */
template<int D>
__global__ void compute_smooth_weights_grads_gpu_kernel(
    const unsigned int pNumSamples,
    const unsigned int pNumNeighbors,
    const mccnn::fpoint<D>* __restrict__ pInvRadii,
    const mccnn::fpoint<D>* __restrict__ pPts,
    const mccnn::fpoint<D>* __restrict__ pSamples,
    const int2* __restrict__ pNeighbors,
    const int* __restrict__ pNeighIndexXSample,
    const float* __restrict__ pSmoothWGrads,
    mccnn::fpoint<D>* __restrict__ pOutPtGrads,
    mccnn::fpoint<D>* __restrict__ pOutSampleGrads)
{
    //Get the global thread index.
    int iniPtIndex = mccnn::compute_global_index_gpu_funct();
    int totalThreads = mccnn::compute_total_threads_gpu_funct();

    //Each thread starts at its own index and advances by the total 
    //number of threads until all the neighbors are processed.
    for(unsigned int curIter = iniPtIndex; 
        curIter < pNumNeighbors; 
        curIter += totalThreads)
    { 
        //Get the index coordinates of the point and the center.
        int2 ptIds = pNeighbors[curIter];

        //Get the distance between the points.
        mccnn::fpoint<D> ptDiff = (pPts[ptIds.x] - pSamples[ptIds.y])*pInvRadii[0];
        float length = mccnn::length(ptDiff);

        //Compute the weight value. Float literals are used to avoid the 
        //promotion of the whole expression to double precision.
        float lengthMax = (length - 0.75f)*4.0f;

        //For lengths below 0.75 the weight is constant and the gradient 
        //is zero, so the atomic operations would only add zeros. 
        //Skip them.
        if(lengthMax < 0.0f)
            continue;

        float lengthMax1 = 1.0f - fmaxf(lengthMax, 0.0f);

        //Get the current gradient: incoming gradient * dw/dt * dt/dl 
        //* 1/l, where dw/dt = 6*(t - t^2) and dt/dl = -4.
        //NOTE(review): the derivative of the length wrt a component of 
        //ptDiff is ptDiff[d]/length, so the extra 0.5 factor appears to 
        //halve the analytic gradient. Confirm if this is intended.
        float curGrad = pSmoothWGrads[curIter]
            *(6.0f*(lengthMax1 - lengthMax1*lengthMax1))
            *(-1.0f)*4.0f*0.5f*(1.0f/length);

        //Compute the gradients. The sample receives the opposite 
        //gradient of the point.
        #pragma unroll
        for(int d = 0; d < D; ++d){
            float curDimGrad = ptDiff[d]*curGrad*pInvRadii[0][d];
            atomicAdd(&pOutPtGrads[ptIds.x][d], curDimGrad);
            atomicAdd(&pOutSampleGrads[ptIds.y][d], -curDimGrad);
        }
    }
}

///////////////////////// CPU

/**
 *  Host function that launches the kernel computing the smooth weights 
 *  on the stream of the device. The float buffers are reinterpreted as
 *  arrays of fpoint<D> and the neighbor buffer as an array of int2.
 *  No kernel is launched when the number of neighbors is zero.
 *  @param  pDevice                 Device used to get the stream and 
 *      the properties, and to check the errors.
 *  @param  pNumSamples             Number of samples.
 *  @param  pNumNeighbors           Number of neighbors.
 *  @param  pInGPUPtrInvRadii       GPU pointer to the inverse radii.
 *  @param  pInGPUPtrPts            GPU pointer to the points.
 *  @param  pInGPUPtrSamples        GPU pointer to the samples.
 *  @param  pInGPUPtrNeighbors      GPU pointer to the neighbors.
 *  @param  pInGPUPtrSampleNeighI   GPU pointer to the indices of 
 *      neighbors x sample.
 *  @param  pOutGPUPtrSmoothW       GPU pointer to the output smooth 
 *      weights, one value per neighbor.
 *  @paramt D                       Number of dimensions.
 */
template<int D>
void mccnn::compute_smooth_weights_gpu(
    std::unique_ptr<IGPUDevice>& pDevice,
    const unsigned int pNumSamples,
    const unsigned int pNumNeighbors,
    const float* pInGPUPtrInvRadii,
    const float* pInGPUPtrPts,
    const float* pInGPUPtrSamples,
    const int* pInGPUPtrNeighbors,
    const int* pInGPUPtrSampleNeighI,
    float* pOutGPUPtrSmoothW)
{
    //Get the cuda stream.
    auto cudaStream = pDevice->getCUDAStream();

#ifdef DEBUG_INFO
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, cudaStream);
#endif

    //Get the device properties.
    const GpuDeviceProperties& gpuProps = pDevice->get_device_properties();

    //Calculate the ideal number of blocks for the selected block size.
    unsigned int numMP = gpuProps.numMPs_;
    unsigned int blockSize = gpuProps.warpSize_*2;
    unsigned int numBlocks = pDevice->get_max_active_block_x_sm(
        blockSize,(const void*)compute_smooth_weights_gpu_kernel<D>, 0);
    pDevice->check_error(__FILE__, __LINE__);

    //Calculate the total number of blocks to execute.
    unsigned int execBlocks = pNumNeighbors/blockSize;
    execBlocks += (pNumNeighbors%blockSize != 0)?1:0;
    unsigned int totalNumBlocks = numMP*numBlocks;
    totalNumBlocks = (totalNumBlocks > execBlocks)?execBlocks:totalNumBlocks;

    //Execute the appropriate cuda kernel. Without neighbors the grid 
    //would have zero blocks, which is an invalid launch configuration,
    //so the launch is skipped.
    if(pNumNeighbors > 0){
        compute_smooth_weights_gpu_kernel<D><<<totalNumBlocks, blockSize, 0, cudaStream>>>(
                pNumSamples, pNumNeighbors, 
                (const mccnn::fpoint<D>*)pInGPUPtrInvRadii, 
                (const mccnn::fpoint<D>*)pInGPUPtrPts,
                (const mccnn::fpoint<D>*)pInGPUPtrSamples,
                (const int2*)pInGPUPtrNeighbors,
                pInGPUPtrSampleNeighI, 
                pOutGPUPtrSmoothW);
        pDevice->check_error(__FILE__, __LINE__);
    }

#ifdef DEBUG_INFO
    cudaEventRecord(stop, cudaStream);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    //Release the events to avoid leaking them on every call.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    float gpuOccupancy = (float)(numBlocks*blockSize)/(float)gpuProps.maxThreadsXMP_;

    fprintf(stderr, "### COMPUTE SMOOTH WEIGHTS ###\n");
    fprintf(stderr, "Num samples: %u\n", pNumSamples);
    fprintf(stderr, "Num neighbors: %u\n", pNumNeighbors);
    fprintf(stderr, "Occupancy: %f\n", gpuOccupancy);
    fprintf(stderr, "Execution time: %f\n", milliseconds);
    fprintf(stderr, "\n");
#endif
}

/**
 *  Host function that launches the kernel computing the gradients of 
 *  the points and samples wrt the smooth weights on the stream of the 
 *  device. Both output buffers are set to zero before the kernel 
 *  accumulates the gradients. The float buffers are reinterpreted as
 *  arrays of fpoint<D> and the neighbor buffer as an array of int2.
 *  No kernel is launched when the number of neighbors is zero; the 
 *  outputs are still set to zero.
 *  @param  pDevice                 Device used to get the stream and 
 *      the properties, to set the memory, and to check the errors.
 *  @param  pNumPts                 Number of points.
 *  @param  pNumSamples             Number of samples.
 *  @param  pNumNeighbors           Number of neighbors.
 *  @param  pInGPUPtrInvRadii       GPU pointer to the inverse radii.
 *  @param  pInGPUPtrPts            GPU pointer to the points.
 *  @param  pInGPUPtrSamples        GPU pointer to the samples.
 *  @param  pInGPUPtrNeighbors      GPU pointer to the neighbors.
 *  @param  pInGPUPtrSampleNeighI   GPU pointer to the indices of 
 *      neighbors x sample.
 *  @param  pInGPUPtrSmoothWGrad    GPU pointer to the input gradients 
 *      of the smooth weights.
 *  @param  pOutGPUPtrPtGrads       GPU pointer to the output point 
 *      gradients (pNumPts*D floats).
 *  @param  pOutGPUPtrSampleGrads   GPU pointer to the output sample 
 *      gradients (pNumSamples*D floats).
 *  @paramt D                       Number of dimensions.
 */
template<int D>
void mccnn::compute_smooth_weights_grads_gpu(
    std::unique_ptr<IGPUDevice>& pDevice,
    const unsigned int pNumPts,
    const unsigned int pNumSamples,
    const unsigned int pNumNeighbors,
    const float* pInGPUPtrInvRadii,
    const float* pInGPUPtrPts,
    const float* pInGPUPtrSamples,
    const int* pInGPUPtrNeighbors,
    const int* pInGPUPtrSampleNeighI,
    const float* pInGPUPtrSmoothWGrad,
    float* pOutGPUPtrPtGrads,
    float* pOutGPUPtrSampleGrads)
{
    //Get the cuda stream.
    auto cudaStream = pDevice->getCUDAStream();

#ifdef DEBUG_INFO
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, cudaStream);
#endif

    //Initialize to zero the output array.
    pDevice->memset(pOutGPUPtrPtGrads, 0, sizeof(float)*pNumPts*D);
    pDevice->memset(pOutGPUPtrSampleGrads, 0, sizeof(float)*pNumSamples*D);
    pDevice->check_error(__FILE__, __LINE__);

    //Get the device properties.
    const GpuDeviceProperties& gpuProps = pDevice->get_device_properties();

    //Calculate the ideal number of blocks for the selected block size.
    unsigned int numMP = gpuProps.numMPs_;
    unsigned int blockSize = gpuProps.warpSize_*2;
    unsigned int numBlocks = pDevice->get_max_active_block_x_sm(
        blockSize,(const void*)compute_smooth_weights_grads_gpu_kernel<D>, 0);
    pDevice->check_error(__FILE__, __LINE__);

    //Calculate the total number of blocks to execute.
    unsigned int execBlocks = pNumNeighbors/blockSize;
    execBlocks += (pNumNeighbors%blockSize != 0)?1:0;
    unsigned int totalNumBlocks = numMP*numBlocks;
    totalNumBlocks = (totalNumBlocks > execBlocks)?execBlocks:totalNumBlocks;

    //Execute the appropriate cuda kernel. Without neighbors the grid 
    //would have zero blocks, which is an invalid launch configuration,
    //so the launch is skipped and the gradients remain zero.
    if(pNumNeighbors > 0){
        compute_smooth_weights_grads_gpu_kernel<D><<<totalNumBlocks, blockSize, 0, cudaStream>>>(
                pNumSamples, pNumNeighbors, 
                (const mccnn::fpoint<D>*)pInGPUPtrInvRadii,
                (const mccnn::fpoint<D>*)pInGPUPtrPts,
                (const mccnn::fpoint<D>*)pInGPUPtrSamples,
                (const int2*)pInGPUPtrNeighbors,
                pInGPUPtrSampleNeighI, 
                pInGPUPtrSmoothWGrad,
                (mccnn::fpoint<D>*)pOutGPUPtrPtGrads,
                (mccnn::fpoint<D>*)pOutGPUPtrSampleGrads);
        pDevice->check_error(__FILE__, __LINE__);
    }

#ifdef DEBUG_INFO
    cudaEventRecord(stop, cudaStream);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    //Release the events to avoid leaking them on every call.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    float gpuOccupancy = (float)(numBlocks*blockSize)/(float)gpuProps.maxThreadsXMP_;

    fprintf(stderr, "### COMPUTE SMOOTH WEIGHTS GRADS ###\n");
    fprintf(stderr, "Num samples: %u\n", pNumSamples);
    fprintf(stderr, "Num neighbors: %u\n", pNumNeighbors);
    fprintf(stderr, "Occupancy: %f\n", gpuOccupancy);
    fprintf(stderr, "Execution time: %f\n", milliseconds);
    fprintf(stderr, "\n");
#endif
}

///////////////////////// CPU Template instantiation

//Explicit instantiation of compute_smooth_weights_gpu for NumDims 
//dimensions. The parameter list has to match the function template.
#define COMPUTE_SMOOTHW_TEMP_DECL(NumDims)                          \
    template void mccnn::compute_smooth_weights_gpu<NumDims>(       \
        std::unique_ptr<IGPUDevice>&,                               \
        const unsigned int,                                         \
        const unsigned int,                                         \
        const float*,                                               \
        const float*,                                               \
        const float*,                                               \
        const int*,                                                 \
        const int*,                                                 \
        float*);

//Explicit instantiation of compute_smooth_weights_grads_gpu for NumDims 
//dimensions. The parameter list has to match the function template.
#define COMPUTE_SMOOTHW_GRADS_TEMP_DECL(NumDims)                        \
    template void mccnn::compute_smooth_weights_grads_gpu<NumDims>(     \
        std::unique_ptr<IGPUDevice>&,                                   \
        const unsigned int,                                             \
        const unsigned int,                                             \
        const unsigned int,                                             \
        const float*,                                                   \
        const float*,                                                   \
        const float*,                                                   \
        const int*,                                                     \
        const int*,                                                     \
        const float*,                                                   \
        float*,                                                         \
        float*);

//Emit the explicit instantiations of the two host functions.
//NOTE(review): DECLARE_TEMPLATE_DIMS is not defined in this file; 
//presumably it applies the given macro to every supported number of 
//dimensions - confirm in defines.hpp.
DECLARE_TEMPLATE_DIMS(COMPUTE_SMOOTHW_TEMP_DECL)
DECLARE_TEMPLATE_DIMS(COMPUTE_SMOOTHW_GRADS_TEMP_DECL)