// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// unpack.h: unpacking the result blocks computed by compute.h,
// storing them into the destination matrix.

#ifndef GEMMLOWP_INTERNAL_UNPACK_H_
#define GEMMLOWP_INTERNAL_UNPACK_H_

#include "allocator.h"
#include "block_params.h"
#include "output.h"
#include "pack.h"

#include <cmath>

namespace gemmlowp {

template<typename tScalar>
class PackedResultBase {
 public:
  PackedResultBase(Allocator* _allocator, const BlockParams& _block_params)
      : allocator_(_allocator), block_params_(_block_params) {
    matrix_handle_ = allocator_->Reserve<tScalar>(block_params_.l2_rows *
                                                       block_params_.l2_cols);
  }

  ~PackedResultBase() {}

  MatrixMap<tScalar, MapOrder::ColMajor> Map() {
    return MatrixMap<tScalar, MapOrder::ColMajor>(
        allocator_->GetPointer<tScalar>(matrix_handle_),
        block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows);
  }

  MatrixMap<const tScalar, MapOrder::ColMajor> Map() const {
    return MatrixMap<const tScalar, MapOrder::ColMajor>(
        allocator_->GetPointer<const tScalar>(matrix_handle_),
        block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows);
  }

 private:
  Allocator* allocator_;
  Allocator::Handle matrix_handle_;
  const BlockParams& block_params_;
};

#ifdef INT8_ACC
using PackedResult = PackedResultBase<std::int8_t>;
#else
using PackedResult = PackedResultBase<std::int32_t>;
#endif


struct MatrixBlockBounds {
  int start_row;
  int start_col;
  int rows;
  int cols;

  MatrixBlockBounds(int start_row_, int start_col_, int rows_, int cols_)
      : start_row(start_row_),
        start_col(start_col_),
        rows(rows_),
        cols(cols_) {}
};

template <int Rows, int Cols, typename SrcMapType>
void PrefetchResultBlock(const SrcMapType& src,
                         const VectorMap<const std::int32_t, VectorShape::Col>&
                             lhs_sums_of_each_slice,
                         int src_row, int src_col) {
  const std::int32_t* src_data = src.data(src_row, src_col);
  const int src_stride = src.stride();
  const std::int32_t* lhs_sums_data = lhs_sums_of_each_slice.data(src_row);
  for (int r = 0; r < Rows; r += 4) {
    Prefetch(lhs_sums_data + r);
  }
  for (int c = 0; c < Cols; c++) {
    for (int r = 0; r < Rows; r += 4) {
      Prefetch(src_data + r + c * src_stride);
    }
  }
}

template <typename KernelFormat, typename RegisterBlockType,
          typename SrcMapType, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineExecutorType, typename DstType>
void UnpackResultBlock(const SrcMapType& src,
                       const OutputPipelineExecutorType& executor, DstType* dst,
                       const VectorMap<const std::int32_t, VectorShape::Col>&
                           lhs_sums_of_each_slice,
                       const VectorMap<const std::int32_t, VectorShape::Row>&
                           rhs_sums_of_each_slice,
                       const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                       int depth, int src_row, int src_col, int src_global_row,
                       int src_global_col, int dst_row, int dst_col) {
  using KernelLhsInputScalar = typename KernelFormat::Lhs::InputScalar;
  using KernelLhsScalar = typename KernelFormat::Lhs::Scalar;
  using KernelRhsInputScalar = typename KernelFormat::Rhs::InputScalar;
  using KernelRhsScalar = typename KernelFormat::Rhs::Scalar;
  static constexpr int KernelLhsZeroPointInput =
      ZeroPointInputValue<KernelLhsInputScalar, KernelLhsScalar>::kValue;
  static constexpr int KernelRhsZeroPointInput =
      ZeroPointInputValue<KernelRhsInputScalar, KernelRhsScalar>::kValue;
  auto acc = Load<RegisterBlockType>(src, src_row, src_col);

  //for (int i = 0; i < 4; i++)
  //    std::cout << "acc.reg[" << i << "] = " << (int)acc.buf.reg[i] << "\n";

  const auto& lhs_sums_of_each_slice_block =
      LoadForBroadcasting<RegisterBlockType>(lhs_sums_of_each_slice, src_row);
  const auto& rhs_sums_of_each_slice_block =
      LoadForBroadcasting<RegisterBlockType>(rhs_sums_of_each_slice, src_col);
  auto lhs_offset_block =
      LoadForBroadcasting<RegisterBlockType>(lhs_offset, src_row);
  auto rhs_offset_block =
      LoadForBroadcasting<RegisterBlockType>(rhs_offset, src_col);
  AddConstant<KernelLhsZeroPointInput>(&lhs_offset_block);
  AddConstant<KernelRhsZeroPointInput>(&rhs_offset_block);
  BroadcastMulAdd(lhs_sums_of_each_slice_block, rhs_offset_block, &acc);
  for (int i = 0; i < decltype(rhs_offset_block)::kRegisterCount; i++) {
    rhs_offset_block.buf.reg[i] = Mul(rhs_offset_block.buf.reg[i], depth);
  }
  BroadcastMulAdd(BroadcastAdd(rhs_sums_of_each_slice_block, rhs_offset_block),
                  lhs_offset_block, &acc);
  executor.Execute(acc, dst, src_global_row, src_global_col, dst_row, dst_col);
}

template <typename KernelFormat, typename ResultBlockType,
          typename PackedResultType, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
void UnpackResult(ResultBlockType* dst, const MatrixBlockBounds& dst_block,
                  const PackedResultType& src, int depth,
                  const std::int32_t* lhs_sums_of_each_slice_ptr,
                  const std::int32_t* rhs_sums_of_each_slice_ptr,
                  const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                  const OutputPipelineType& output_pipeline) {
  ScopedProfilingLabel label(ResultBlockType::kOrder == MapOrder::ColMajor
                                 ? "unpack to column-major"
                                 : "unpack to row-major");
  assert(dst_block.start_row >= 0);
  assert(dst_block.start_row + dst_block.rows <= dst->rows());
  assert(dst_block.start_col >= 0);
  assert(dst_block.start_col + dst_block.cols <= dst->cols());
  const auto src_map = src.Map();
  const VectorMap<const std::int32_t, VectorShape::Col> lhs_sums_of_each_slice(
      lhs_sums_of_each_slice_ptr, dst_block.rows);
  const VectorMap<const std::int32_t, VectorShape::Row> rhs_sums_of_each_slice(
      rhs_sums_of_each_slice_ptr, dst_block.cols);
  using Int32x1x1 = RegisterBlock<std::int32_t, 1, 1>;
  using Int32x4x1 = RegisterBlock<std::int32_t, 4, 1>;
  using Int32x8x1 = RegisterBlock<std::int32_t, 8, 1>;
  using Int32x1x4 = RegisterBlock<std::int32_t, 1, 4>;
  using Int32x4x4 = RegisterBlock<std::int32_t, 4, 4>;
  using Int32x8x4 = RegisterBlock<std::int32_t, 8, 4>;

  using DstScalarType = typename ResultBlockType::Scalar;
  using DstScalarx8x8 = RegisterBlock<DstScalarType, 8, 8>;

  OutputPipelineExecutor<OutputPipelineType, Int32x1x1>
      output_pipeline_executor_1x1(output_pipeline);
  OutputPipelineExecutor<OutputPipelineType, Int32x4x1>
      output_pipeline_executor_4x1(output_pipeline);
  OutputPipelineExecutor<OutputPipelineType, Int32x8x1>
      output_pipeline_executor_8x1(output_pipeline);
  OutputPipelineExecutor<OutputPipelineType, Int32x1x4>
      output_pipeline_executor_1x4(output_pipeline);
  OutputPipelineExecutor<OutputPipelineType, Int32x4x4>
      output_pipeline_executor_4x4(output_pipeline);
  OutputPipelineExecutor<OutputPipelineType, Int32x8x4>
      output_pipeline_executor_8x4(output_pipeline);

  int c8 = 0;
  if (ResultBlockType::kOrder == MapOrder::RowMajor) {
    for (; c8 <= dst_block.cols - 8; c8 += 8) {
      PrefetchResultBlock<8, 8>(src_map, lhs_sums_of_each_slice, 0, c8);
      int r = 0;
      for (; r <= dst_block.rows - 8; r += 8) {
        const int global_row = r + dst_block.start_row;
        PrefetchResultBlock<8, 8>(src_map, lhs_sums_of_each_slice, r + 8, c8);
        DstScalarType dst_colmajor_buf[64];
        MatrixMap<DstScalarType, MapOrder::ColMajor> dst_colmajor_map(
            dst_colmajor_buf, 8, 8);
        for (int cx = 0; cx < 8; cx += 4) {
          const int c = c8 + cx;
          const int global_col = c + dst_block.start_col;
          UnpackResultBlock<KernelFormat, Int32x8x4>(
              src_map, output_pipeline_executor_8x4, &dst_colmajor_map,
              lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset,
              rhs_offset, depth, r, c, global_row, global_col, 0, cx);
        }
        StoreFinalOutput(LoadContiguous<DstScalarx8x8>(dst_colmajor_buf), dst,
                         r + dst_block.start_row, c8 + dst_block.start_col);
      }
      for (; r <= dst_block.rows - 4; r += 4) {
        const int global_row = r + dst_block.start_row;
        for (int cx = 0; cx < 8; cx += 4) {
          const int c = c8 + cx;
          const int global_col = c + dst_block.start_col;
          UnpackResultBlock<KernelFormat, Int32x4x4>(
              src_map, output_pipeline_executor_4x4, dst,
              lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset,
              rhs_offset, depth, r, c, global_row, global_col, global_row,
              global_col);
        }
      }
      for (; r < dst_block.rows; r++) {
        const int global_row = r + dst_block.start_row;
        for (int cx = 0; cx < 8; cx += 4) {
          const int c = c8 + cx;
          const int global_col = c + dst_block.start_col;
          UnpackResultBlock<KernelFormat, Int32x1x4>(
              src_map, output_pipeline_executor_1x4, dst,
              lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset,
              rhs_offset, depth, r, c, global_row, global_col, global_row,
              global_col);
        }
      }
    }
  }
  int c = c8;
  for (; c <= dst_block.cols - 4; c += 4) {
    const int global_col = c + dst_block.start_col;
    PrefetchResultBlock<8, 4>(src_map, lhs_sums_of_each_slice, 0, c);
    int r = 0;
    for (; r <= dst_block.rows - 8; r += 8) {
      const int global_row = r + dst_block.start_row;
      PrefetchResultBlock<8, 4>(src_map, lhs_sums_of_each_slice, r + 8, c);
      UnpackResultBlock<KernelFormat, Int32x8x4>(
          src_map, output_pipeline_executor_8x4, dst, lhs_sums_of_each_slice,
          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
          global_row, global_col, global_row, global_col);
    }
    for (; r <= dst_block.rows - 4; r += 4) {
      const int global_row = r + dst_block.start_row;
      UnpackResultBlock<KernelFormat, Int32x4x4>(
          src_map, output_pipeline_executor_4x4, dst, lhs_sums_of_each_slice,
          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
          global_row, global_col, global_row, global_col);
    }
    for (; r < dst_block.rows; r++) {
      const int global_row = r + dst_block.start_row;
      UnpackResultBlock<KernelFormat, Int32x1x4>(
          src_map, output_pipeline_executor_1x4, dst, lhs_sums_of_each_slice,
          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
          global_row, global_col, global_row, global_col);
    }
  }
  for (; c < dst_block.cols; c++) {
    const int global_col = c + dst_block.start_col;
    PrefetchResultBlock<8, 1>(src_map, lhs_sums_of_each_slice, 0, c);
    int r = 0;
    for (; r <= dst_block.rows - 8; r += 8) {
      const int global_row = r + dst_block.start_row;
      PrefetchResultBlock<8, 1>(src_map, lhs_sums_of_each_slice, r + 8, c);
      UnpackResultBlock<KernelFormat, Int32x8x1>(
          src_map, output_pipeline_executor_8x1, dst, lhs_sums_of_each_slice,
          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
          global_row, global_col, global_row, global_col);
    }
    for (; r <= dst_block.rows - 4; r += 4) {
      const int global_row = r + dst_block.start_row;
      UnpackResultBlock<KernelFormat, Int32x4x1>(
          src_map, output_pipeline_executor_4x1, dst, lhs_sums_of_each_slice,
          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
          global_row, global_col, global_row, global_col);
    }
    for (; r < dst_block.rows; r++) {
      const int global_row = r + dst_block.start_row;
      UnpackResultBlock<KernelFormat, Int32x1x1>(
          src_map, output_pipeline_executor_1x1, dst, lhs_sums_of_each_slice,
          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
          global_row, global_col, global_row, global_col);
    }
  }
}

}  // end namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_UNPACK_H_
