/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Test of BlockHistogram utilities
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <stdio.h>
#include <limits>
#include <string>
#include <typeinfo>

#include <cub/block/block_histogram.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <cub/util_allocator.cuh>

#include "test_util.h"

using namespace cub;


//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

bool                    g_verbose           = false;
int                     g_timing_iterations = 0;
int                     g_repeat            = 0;
CachingDeviceAllocator  g_allocator(true);


//---------------------------------------------------------------------
// Test kernels
//---------------------------------------------------------------------

/**
 * BlockHistogram test kernel.
 */
template <
    int                     BINS,
    int                     BLOCK_THREADS,
    int                     ITEMS_PER_THREAD,
    BlockHistogramAlgorithm ALGORITHM,
    typename                T,
    typename                HistoCounter>
__global__ void BlockHistogramKernel(
    T                       *d_samples,
    HistoCounter            *d_histogram)
{
    // Parameterize BlockHistogram type for our thread block
    typedef BlockHistogram<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS, ALGORITHM> BlockHistogram;

    // Allocate temp storage in shared memory
    __shared__ typename BlockHistogram::TempStorage temp_storage;

    // Per-thread tile data
    T data[ITEMS_PER_THREAD];
    LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_samples, data);

    // Test histo (writing directly to histogram buffer in global)
    BlockHistogram(temp_storage).Histogram(data, d_histogram);
}


/**
 * Initialize problem (and solution)
 */
template <
    int             BINS,
    typename        SampleT>
void Initialize(
    GenMode         gen_mode,
    SampleT         *h_samples,
    int             *h_histograms_linear,
    int             num_samples)
{
    // Init bins
    for (int bin = 0; bin < BINS; ++bin)
    {
        h_histograms_linear[bin] = 0;
    }

    if (g_verbose) printf("Samples: \n");

    // Initialize interleaved channel samples and histogram them correspondingly
    for (int i = 0; i < num_samples; ++i)
    {
        InitValue(gen_mode, h_samples[i], i);
        h_samples[i] %= BINS;

        if (g_verbose) std::cout << CoutCast(h_samples[i]) << ", ";

        h_histograms_linear[h_samples[i]]++;
    }

    if (g_verbose) printf("\n\n");
}


/**
 * Test BlockHistogram
 */
template <
    typename                    SampleT,
    int                         BINS,
    int                         BLOCK_THREADS,
    int                         ITEMS_PER_THREAD,
    BlockHistogramAlgorithm     ALGORITHM>
void Test(
    GenMode                     gen_mode)
{
    int num_samples = BLOCK_THREADS * ITEMS_PER_THREAD;

    printf("cub::BlockHistogram %s %d %s samples (%dB), %d bins, %d threads, gen-mode %s\n",
        (ALGORITHM == BLOCK_HISTO_SORT) ? "BLOCK_HISTO_SORT" : "BLOCK_HISTO_ATOMIC",
        num_samples,
        typeid(SampleT).name(),
        (int) sizeof(SampleT),
        BINS,
        BLOCK_THREADS,
        (gen_mode == RANDOM) ? "RANDOM" : (gen_mode == INTEGER_SEED) ? "SEQUENTIAL" : "HOMOGENOUS");
    fflush(stdout);

    // Allocate host arrays
    SampleT         *h_samples          = new SampleT[num_samples];
    int   *h_reference = new int[BINS];

    // Initialize problem
    Initialize<BINS>(gen_mode, h_samples, h_reference, num_samples);

    // Allocate problem device arrays
    SampleT         *d_samples = NULL;
    int             *d_histogram = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples,             sizeof(SampleT) * num_samples));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram,   sizeof(int) * BINS));

    // Initialize/clear device arrays
    CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * num_samples, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemset(d_histogram, 0, sizeof(int) * BINS));

    // Run kernel
    BlockHistogramKernel<BINS, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<1, BLOCK_THREADS>>>(
        d_samples,
        d_histogram);

    // Check for correctness (and display results, if specified)
    int compare = CompareDeviceResults((int*) h_reference, d_histogram, BINS, g_verbose, g_verbose);
    printf("\t%s\n\n", compare ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());
    fflush(stdout);
    fflush(stderr);

    // Cleanup
    if (h_samples) delete[] h_samples;
    if (h_reference) delete[] h_reference;
    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
    if (d_histogram) CubDebugExit(g_allocator.DeviceFree(d_histogram));

    // Correctness asserts
    AssertEquals(0, compare);
}


/**
 * Test different sample distributions
 */
template <
    typename                    SampleT,
    int                         BINS,
    int                         BLOCK_THREADS,
    int                         ITEMS_PER_THREAD,
    BlockHistogramAlgorithm     ALGORITHM>
void Test()
{
    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>(UNIFORM);
    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>(INTEGER_SEED);
    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>(RANDOM);
}


/**
 * Test different ALGORITHM
 */
template <
    typename                    SampleT,
    int                         BINS,
    int                         BLOCK_THREADS,
    int                         ITEMS_PER_THREAD>
void Test()
{
    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_HISTO_SORT>();
    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_HISTO_ATOMIC>();
}


/**
 * Test different ITEMS_PER_THREAD
 */
template <
    typename                    SampleT,
    int                         BINS,
    int                         BLOCK_THREADS>
void Test()
{
    Test<SampleT, BINS, BLOCK_THREADS, 1>();
    Test<SampleT, BINS, BLOCK_THREADS, 5>();
}


/**
 * Test different BLOCK_THREADS
 */
template <
    typename                    SampleT,
    int                         BINS>
void Test()
{
    Test<SampleT, BINS, 32>();
    Test<SampleT, BINS, 96>();
    Test<SampleT, BINS, 128>();
}





//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------

/**
 * Main
 */
int main(int argc, char** argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("repeat", g_repeat);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<total input samples across all channels> "
            "[--device=<device-id>] "
            "[--repeat=<repetitions of entire test suite>]"
            "[--v] "
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

#ifdef CUB_TEST_BENCHMARK

    // Compile/run quick tests
    Test<unsigned char, 256, 128, 4, BLOCK_HISTO_SORT>(RANDOM);
    Test<unsigned char, 256, 128, 4, BLOCK_HISTO_ATOMIC>(RANDOM);

#else

    // Compile/run thorough tests
    for (int i = 0; i <= g_repeat; ++i)
    {
        Test<unsigned char, 32>();
        Test<unsigned char, 256>();
        Test<unsigned short, 1024>();
    }

#endif

    return 0;
}



