/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * The cub::BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
 */

#pragma once

#include "specializations/block_reduce_raking.cuh"
#include "specializations/block_reduce_raking_commutative_only.cuh"
#include "specializations/block_reduce_warp_reductions.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../thread/thread_operators.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {



/******************************************************************************
 * Algorithmic variants
 ******************************************************************************/

/**
 * BlockReduceAlgorithm enumerates alternative algorithms for parallel
 * reduction across a CUDA thread block.
 */
enum BlockReduceAlgorithm
{

    /**
     * \par Overview
     * An efficient "raking" reduction algorithm that only supports commutative
     * reduction operators (true for most operations, e.g., addition).
     *
     * \par
     * Execution is comprised of three phases:
     * -# Upsweep sequential reduction in registers (if threads contribute more
     *    than one input each).  Threads in warps other than the first warp place
     *    their partial reductions into shared memory.
     * -# Upsweep sequential reduction in shared memory.  Threads within the first
     *    warp continue to accumulate by raking across segments of shared partial reductions
     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
     *
     * \par
     * \image html block_reduce.png
     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
     *
     * \par Performance Considerations
     * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE
     *   and is preferable when the reduction operator is commutative.  This variant
     *   applies fewer reduction operators  than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
     *   when the GPU is under-occupied.
     */
    BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,


    /**
     * \par Overview
     * An efficient "raking" reduction algorithm that supports commutative
     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
     * operators. \blocked.
     *
     * \par
     * Execution is comprised of three phases:
     * -# Upsweep sequential reduction in registers (if threads contribute more
     *    than one input each).  Each thread then places the partial reduction
     *    of its item(s) into shared memory.
     * -# Upsweep sequential reduction in shared memory.  Threads within a
     *    single warp rake across segments of shared partial reductions.
     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
     *
     * \par
     * \image html block_reduce.png
     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
     *
     * \par Performance Considerations
     * - This variant performs more communication than BLOCK_REDUCE_RAKING
     *   and is only preferable when the reduction operator is non-commutative.  This variant
     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
     *   when the GPU is under-occupied.
     */
    BLOCK_REDUCE_RAKING,


    /**
     * \par Overview
     * A quick "tiled warp-reductions" reduction algorithm that supports commutative
     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
     * operators.
     *
     * \par
     * Execution is comprised of four phases:
     * -# Upsweep sequential reduction in registers (if threads contribute more
     *    than one input each).  Each thread then places the partial reduction
     *    of its item(s) into shared memory.
     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
     *    reduction within each warp.
     * -# A propagation phase where the warp reduction outputs in each warp are
     *    updated with the aggregate from each preceding warp.
     *
     * \par
     * \image html block_scan_warpscans.png
     * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
     *
     * \par Performance Considerations
     * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
     *   or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall
     *   throughput across the GPU.  However turn-around latency may be lower and
     *   thus useful when the GPU is under-occupied.
     */
    BLOCK_REDUCE_WARP_REDUCTIONS,
};


/******************************************************************************
 * Block reduce
 ******************************************************************************/

/**
 * \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png)
 * \ingroup BlockModule
 *
 * \tparam T                Data type being reduced
 * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
 * \tparam ALGORITHM        <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS)
 * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
 * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
 * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
 *
 * \par Overview
 * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
 *   uses a binary combining operator to compute a single aggregate from a list of input elements.
 * - \rowmajor
 * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
 *   -# <b>cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY</b>.  An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
 *   -# <b>cub::BLOCK_REDUCE_RAKING</b>.  An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
 *   -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>.  A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
 *
 * \par Performance Considerations
 * - \granularity
 * - Very efficient (only one synchronization barrier).
 * - Incurs zero bank conflicts for most types
 * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
 *   - Summation (<b><em>vs.</em></b> generic reduction)
 *   - \p BLOCK_THREADS is a multiple of the architecture's warp size
 *   - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
 * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
 *
 * \par A Simple Example
 * \blockcollective{BlockReduce}
 * \par
 * The code snippet below illustrates a sum reduction of 512 integer items that
 * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
 * where each thread owns 4 consecutive items.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize BlockReduce for a 1D block of 128 threads on type int
 *     typedef cub::BlockReduce<int, 128> BlockReduce;
 *
 *     // Allocate shared memory for BlockReduce
 *     __shared__ typename BlockReduce::TempStorage temp_storage;
 *
 *     // Obtain a segment of consecutive items that are blocked across threads
 *     int thread_data[4];
 *     ...
 *
 *     // Compute the block-wide sum for thread0
 *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
 *
 * \endcode
 *
 */
template <
    typename                T,
    int                     BLOCK_DIM_X,
    BlockReduceAlgorithm    ALGORITHM       = BLOCK_REDUCE_WARP_REDUCTIONS,
    int                     BLOCK_DIM_Y     = 1,
    int                     BLOCK_DIM_Z     = 1,
    int                     PTX_ARCH        = CUB_PTX_ARCH>
class BlockReduce
{
private:

    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };

    typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>           WarpReductions;
    typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>    RakingCommutativeOnly;
    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>                   Raking;

    /// Internal specialization type
    typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
        WarpReductions,
        typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY),
            RakingCommutativeOnly,
            Raking>::Type>::Type InternalBlockReduce;     // BlockReduceRaking

    /// Shared memory storage layout type for BlockReduce
    typedef typename InternalBlockReduce::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Shared storage reference
    _TempStorage &temp_storage;

    /// Linear thread-id
    unsigned int linear_tid;


public:

    /// \smemstorage{BlockReduce}
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /**
     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
     */
    __device__ __forceinline__ BlockReduce()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /**
     * \brief Collective constructor using the specified memory allocation as temporary storage.
     */
    __device__ __forceinline__ BlockReduce(
        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    //@}  end member group
    /******************************************************************//**
     * \name Generic reductions
     *********************************************************************/
    //@{


    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes one input element.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \rowmajor
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a max reduction of 128 integer items that
     * are partitioned across 128 threads.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Each thread obtains an input item
     *     int thread_data;
     *     ...
     *
     *     // Compute the block-wide max for thread0
     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
     *
     * \endcode
     *
     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
     */
    template <typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               input,                      ///< [in] Calling thread's input
        ReductionOp     reduction_op)               ///< [in] Binary reduction functor 
    {
        return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
    }


    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes an array of consecutive input elements.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a max reduction of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Compute the block-wide max for thread0
     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
     *
     * \endcode
     *
     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
     */
    template <
        int ITEMS_PER_THREAD,
        typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
        ReductionOp     reduction_op)                   ///< [in] Binary reduction functor 
    {
        // Reduce partials
        T partial = internal::ThreadReduce(inputs, reduction_op);
        return Reduce(partial, reduction_op);
    }


    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  The first \p num_valid threads each contribute one input element.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \rowmajor
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
     * are partitioned across 128 threads.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(int num_valid, ...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Each thread obtains an input item
     *     int thread_data;
     *     if (threadIdx.x < num_valid) thread_data = ...
     *
     *     // Compute the block-wide max for thread0
     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
     *
     * \endcode
     *
     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
     */
    template <typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T                   input,                  ///< [in] Calling thread's input
        ReductionOp         reduction_op,           ///< [in] Binary reduction functor 
        int                 num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
    {
        // Determine if we scan skip bounds checking
        if (num_valid >= BLOCK_THREADS)
        {
            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
        }
        else
        {
            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
        }
    }


    //@}  end member group
    /******************************************************************//**
     * \name Summation reductions
     *********************************************************************/
    //@{


    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes one input element.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \rowmajor
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a sum reduction of 128 integer items that
     * are partitioned across 128 threads.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Each thread obtains an input item
     *     int thread_data;
     *     ...
     *
     *     // Compute the block-wide sum for thread0
     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
     *
     * \endcode
     *
     */
    __device__ __forceinline__ T Sum(
        T   input)                      ///< [in] Calling thread's input
    {
        return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
    }

    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a sum reduction of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Compute the block-wide sum for thread0
     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
     *
     * \endcode
     *
     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
     */
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ T Sum(
        T   (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
    {
        // Reduce partials
        T partial = internal::ThreadReduce(inputs, cub::Sum());
        return Sum(partial);
    }


    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  The first \p num_valid threads each contribute one input element.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \rowmajor
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
     * are partitioned across 128 threads.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(int num_valid, ...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Each thread obtains an input item (up to num_items)
     *     int thread_data;
     *     if (threadIdx.x < num_valid)
     *         thread_data = ...
     *
     *     // Compute the block-wide sum for thread0
     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
     *
     * \endcode
     *
     */
    __device__ __forceinline__ T Sum(
        T   input,                  ///< [in] Calling thread's input
        int num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
    {
        // Determine if we scan skip bounds checking
        if (num_valid >= BLOCK_THREADS)
        {
            return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
        }
        else
        {
            return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
        }
    }


    //@}  end member group
};

/**
 * \example example_block_reduce.cu
 */

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

