/***************************************************************************************************
 * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Unit tests for thread-level Reduction
*/

#pragma once

#include "cutlass/reduction/thread/reduce.h"

#include "cutlass/layout/vector.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"

#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_compare.h"

namespace test {
namespace reduction {
namespace thread {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Structure to compute the reduction
template <
  /// Data type of elements
  typename Element,
  /// Number of elements
  int N
>
struct Testbed_reduce_host {

  /// Thread-level reduction operator
  using Reduce = cutlass::reduction::thread::Reduce<
    cutlass::plus<Element>,
    cutlass::Array<Element, N>
  >;

  //
  // Data members
  //

  cutlass::Array<Element, N> tensor_in;
  cutlass::Array<Element, 1> reduced_tensor_computed;
  cutlass::Array<Element, 1> reduced_tensor_reference;

  //
  // Methods
  //

  /// Allocates workspace in device memory
  Testbed_reduce_host() {
    tensor_in.clear();
    reduced_tensor_computed.clear();
    reduced_tensor_reference.clear();
  }

  /// Runs the test
  bool run() {

    //
    // initialize memory
    //

    for(int i = 0; i < N; i++)
      tensor_in.at(i) = Element(i);

   
    Reduce reduce;

    cutlass::Array<Element, 1> *out_ptr = &reduced_tensor_computed;
    out_ptr[0] = reduce(tensor_in);

    //
    // Reference implementation
    //
    Element e(0);
    for (int i = 0; i < N; i++)
       e = e + Element(i);

    reduced_tensor_reference.at(0) = e;

    //
    // Verify equivalence
    //

    // compare
    bool passed = reduced_tensor_reference[0] == reduced_tensor_computed[0];

    EXPECT_TRUE(passed) 
    << "Expected = " << float(reduced_tensor_reference.at(0)) << "\n\n"
    << "Actual   = " << float(reduced_tensor_computed.at(0)) << "\n\n"
    << std::endl;
    
    return passed;
  }
};


/////////////////////////////////////////////////////////////////////////////////////////////////

/// Thread-level reduction kernel
template <typename Element, int N>
__global__ void kernel_reduce(Element const *array_in, Element *result) {

  /// Thread-level reduction operator
  using Reduce = cutlass::reduction::thread::Reduce<
    cutlass::plus<Element>,
    cutlass::Array<Element, N>
  >;

  Reduce reduce;

  auto ptr_in = reinterpret_cast<cutlass::Array<Element , N> const *>(array_in);
  auto result_ptr = reinterpret_cast<cutlass::Array<Element , 1> *>(result);
  auto in = *ptr_in;
  result_ptr[0] = reduce(in);
}


/// Structure to compute the reduction
template <
  /// Data type of elements
  typename Element,
  /// Number of elements
  int N
>
struct Testbed_reduce_device {

  using Layout = cutlass::layout::PackedVectorLayout;

  //
  // Data members
  //

  cutlass::HostTensor<Element, Layout> tensor_in;
  cutlass::HostTensor<Element, Layout> reduced_tensor_computed;
  cutlass::HostTensor<Element, Layout> reduced_tensor_reference;

  //
  // Methods
  //

  /// Allocates workspace in device memory
  Testbed_reduce_device() {

    tensor_in.reset(cutlass::make_Coord(N), true);
    reduced_tensor_computed.reset(cutlass::make_Coord(1), true);
    reduced_tensor_reference.reset(cutlass::make_Coord(1), true);
  }


  /// Runs the test
  bool run() {

    //
    // initialize memory
    //

    cutlass::reference::host::TensorFill(
      tensor_in.host_view(),
      Element(1)
    );

    cutlass::reference::host::TensorFill(
      reduced_tensor_computed.host_view(),
      Element(0)
    );

    cutlass::reference::host::TensorFill(
      reduced_tensor_reference.host_view(),
      Element(N)
    );

    tensor_in.sync_device();
    reduced_tensor_computed.sync_device();
    reduced_tensor_reference.sync_device();

    /// call the kernel
    kernel_reduce<Element, N><<< dim3(1, 1), dim3(1, 1, 1) >>> (
        tensor_in.device_data(), 
        reduced_tensor_computed.device_data()
        );
    
    // verify no errors
    cudaError_t result = cudaDeviceSynchronize();

    EXPECT_EQ(result, cudaSuccess) << "CUDA ERROR: " << cudaGetErrorString(result);
    if (result != cudaSuccess) {
      return false;
    }

    // Copy back results
    reduced_tensor_computed.sync_host();

    // Verify equivalence
    bool passed = cutlass::reference::host::TensorEquals(
      reduced_tensor_computed.host_view(),
      reduced_tensor_reference.host_view()
    );

    EXPECT_TRUE(passed) 
    << "Expected = " << reduced_tensor_reference.host_view() << "\n\n"
    << "Actual   = " << reduced_tensor_computed.host_view() << "\n\n"
    << std::endl;
    
    return passed;
  }
};

} // namespace thread
} // namespace reduction
} // namespace test
