#pragma once

#include <cstdint>

#include <cuda.h>
#include <cuda_runtime.h>

#include <cuda/util/editor_hack.h>


namespace sfrp {
namespace device {



template <typename scalar_t, uint32_t n_threads_per_block>
__device__ __forceinline__ scalar_t reduce_scalar_across_block(scalar_t thread_acc) {
    // This depends on the specific way that I constructed the blocks.
    const uint64_t thread_index = threadIdx.x;

    __shared__ scalar_t s_cache[n_threads_per_block];
    s_cache[thread_index] = thread_acc;
    __syncthreads();

    // We must have n_threads_per_block be a power of 2 due to the following code.
    int j = n_threads_per_block / 2;
    while (j != 0) {
        if (thread_index < j) {
            s_cache[thread_index] += s_cache[thread_index + j];
        }
        __syncthreads();
        j /= 2;
    }
    return s_cache[0];
}



template <typename scalar_t>
__device__ __forceinline__ scalar_t reduce_scalar_across_threads(
    scalar_t thread_acc,
    uint32_t thread_index,
    scalar_t* shared_buffer,
    // NOTE: n_threads must be a power of 2. This is NOT checked.
    uint32_t n_threads
) {

    shared_buffer[thread_index] = thread_acc;
    __syncthreads();

    // We must have n_threads be a power of 2 due to the following code.
    int j = n_threads / 2;
    while (j != 0) {
        if (thread_index < j) {
            shared_buffer[thread_index] += shared_buffer[thread_index + j];
        }
        __syncthreads();
        j /= 2;
    }
    return shared_buffer[0];
}


}  // device
}  // sfrp

