#pragma once
// Quality-of-life helpers for moving data to and from the GPU.

#include <util/macros.h>
#include <gpu/contexts/device_context.h>

#include <containers/dense_matrix.h>
#include <containers/sparse_matrix.h>

#include "./dense_matrix.h"
#include "./sparse_matrix.h"


namespace npeff {
namespace gpu {


// Declares the explicit specialization of DeviceContext::copy_to_device_async
// for the type pair (gpu::DenseMatrix, npeff::DenseMatrix<float>).
//
// Only the declaration appears here; the definition is not visible in this
// header. Declaring the specialization in the header lets every translation
// unit that includes it see the specialization before use, so the primary
// template is not implicitly instantiated for these argument types.
//
// dev_mat:  device-side dense matrix (presumably the copy destination, going
//           by the function name -- confirm against the definition).
// host_mat: host-side dense float matrix (presumably the copy source). It is
//           taken by non-const reference, as the signature shows.
//
// NOTE(review): the "_async" suffix suggests the transfer may still be in
// flight when this returns, in which case host_mat would have to outlive the
// copy -- confirm against the implementation.
template<>
void DeviceContext::copy_to_device_async<gpu::DenseMatrix, npeff::DenseMatrix<float>>(
    gpu::DenseMatrix& dev_mat,
    npeff::DenseMatrix<float>& host_mat
);

// Declares the explicit specialization of DeviceContext::copy_to_device_async
// for the type pair (gpu::DenseMatrix, npeff::DenseMatrixContiguousView<float>).
//
// Only the declaration appears here; the definition is not visible in this
// header. The declaration must be seen before any use with these argument
// types so that the primary template is not implicitly instantiated instead.
//
// dev_mat:  device-side dense matrix (presumably the copy destination --
//           confirm against the definition).
// host_mat: host-side view over float matrix data (presumably the copy
//           source; the type name suggests the viewed storage is contiguous
//           -- TODO confirm what the view guarantees).
//
// NOTE(review): the "_async" suffix suggests the transfer may not be complete
// on return, so the memory behind the view would need to stay valid until it
// finishes -- confirm against the implementation.
template<>
void DeviceContext::copy_to_device_async<gpu::DenseMatrix, npeff::DenseMatrixContiguousView<float>>(
    gpu::DenseMatrix& dev_mat,
    npeff::DenseMatrixContiguousView<float>& host_mat
);

// Declares the explicit specialization of DeviceContext::copy_to_device_async
// for the type pair (gpu::CsrMatrix<int32_t>, npeff::CsrMatrix<int32_t>).
//
// Only the declaration appears here; the definition is not visible in this
// header. The declaration must be seen before any use with these argument
// types so that the primary template is not implicitly instantiated instead.
//
// dev_mat:  device-side CSR matrix instantiated with int32_t (presumably the
//           copy destination -- confirm against the definition).
// host_mat: host-side CSR matrix instantiated with int32_t (presumably the
//           copy source).
//
// NOTE(review): int32_t is presumably the index type of the CSR structure;
// this header does not show what the template parameter controls -- verify in
// the CsrMatrix definitions. The "_async" suffix also suggests host_mat must
// stay alive until the transfer completes -- confirm.
template<>
void DeviceContext::copy_to_device_async<gpu::CsrMatrix<int32_t>, npeff::CsrMatrix<int32_t>>(
    gpu::CsrMatrix<int32_t>& dev_mat,
    npeff::CsrMatrix<int32_t>& host_mat
);

// Declares the explicit specialization of DeviceContext::copy_to_device_async
// for the type pair (gpu::CsrMatrix<int64_t>, npeff::CsrMatrix<int64_t>).
//
// Only the declaration appears here; the definition is not visible in this
// header. The declaration must be seen before any use with these argument
// types so that the primary template is not implicitly instantiated instead.
//
// dev_mat:  device-side CSR matrix instantiated with int64_t (presumably the
//           copy destination -- confirm against the definition).
// host_mat: host-side CSR matrix instantiated with int64_t (presumably the
//           copy source).
//
// NOTE(review): int64_t is presumably the index type of the CSR structure;
// this header does not show what the template parameter controls -- verify in
// the CsrMatrix definitions. The "_async" suffix also suggests host_mat must
// stay alive until the transfer completes -- confirm.
template<>
void DeviceContext::copy_to_device_async<gpu::CsrMatrix<int64_t>, npeff::CsrMatrix<int64_t>>(
    gpu::CsrMatrix<int64_t>& dev_mat,
    npeff::CsrMatrix<int64_t>& host_mat
);

// ///////////////////////////////////////////////////////////


// Free-function declaration; the definition is not visible in this header.
//
// Going by the name, this presumably copies the contents of dev_mat from the
// device into a sub-block of host_mat -- confirm against the definition.
//
// ctx:        device context passed to the operation (presumably supplies the
//             stream/queue the copy runs on -- TODO confirm).
// dev_mat:    device-side dense matrix (presumably the copy source).
// host_mat:   host-side dense float matrix (presumably the destination that
//             contains the target sub-block).
// row_offset: presumably the row of host_mat at which the sub-block starts.
// col_offset: presumably the column of host_mat at which the sub-block starts.
//
// NOTE(review): no bounds behaviour is visible here; whether the sub-block
// must fit inside host_mat is checked (or assumed) by the definition. The
// "_async" suffix suggests host_mat may not hold the result until the caller
// synchronizes -- confirm.
void copy_to_host_into_submatrix_async(
    DeviceContext& ctx, 
    gpu::DenseMatrix& dev_mat,
    npeff::DenseMatrix<float>& host_mat,
    int64_t row_offset,
    int64_t col_offset
);


}  // gpu
}  // npeff
