#pragma once

#include <omp.h>
#include <torch/extension.h>

// Declaration only; the definition is not in this header. Takes no
// arguments and returns nothing.
// NOTE(review): the name suggests it blocks until previously issued CUDA work
// has completed -- confirm in the implementation which stream(s)/device it
// actually waits on.
void synchronize_cuda();
// Declaration only; the definition is not in this header. Takes no
// arguments and returns nothing.
// NOTE(review): "d2h" presumably means device-to-host, i.e. this waits only
// for pending device-to-host transfers -- confirm against the implementation.
void d2h_synchronize_cuda();
// Declaration only; the definition is not in this header. Takes no
// arguments and returns nothing.
// NOTE(review): "h2d" presumably means host-to-device, i.e. this waits only
// for pending host-to-device transfers -- confirm against the implementation.
void h2d_synchronize_cuda();

// Declaration only; the definition is not in this header.
//   src - input tensor.
//   dst - output tensor.
// NOTE(review): the "_async" suffix suggests the call may return before the
// work has finished, so a later synchronize call would be needed before
// reading `dst` -- confirm. The exact fill semantics (how `src` maps onto
// `dst`, required devices/dtypes/shapes) are not visible here.
void fill_async_cuda(torch::Tensor src, torch::Tensor dst);

// Declaration only; the definition is not in this header.
//   src - input tensor.
//   dst - output tensor.
// NOTE(review): presumably an asynchronous host-to-device copy of `src` into
// `dst` -- confirm the expected devices, memory pinning and shape
// requirements in the implementation.
void upload_async_cuda(torch::Tensor src, torch::Tensor dst);

// Declaration only; the definition is not in this header.
//   pid     - integer id. NOTE(review): presumably the index of the calling
//             partition/process -- confirm.
//   srcs    - list of input tensors.
//   dst     - single output tensor.
//   bndries - list of tensors. NOTE(review): presumably boundary
//             offsets/indices describing which part of each entry of `srcs`
//             is gathered -- confirm layout and dtype.
// NOTE(review): the "_async" suffix suggests completion is not guaranteed on
// return -- confirm which synchronize call must follow.
void gather_async_cuda(int pid, std::vector<torch::Tensor> srcs,
                       torch::Tensor dst, std::vector<torch::Tensor> bndries);

// Declaration only; the definition is not in this header. The signature
// mirrors gather_async_cuda with the roles reversed: one input tensor and a
// list of output tensors.
//   pid     - integer id. NOTE(review): presumably the index of the calling
//             partition/process -- confirm.
//   src     - single input tensor.
//   dsts    - list of output tensors.
//   bndries - list of tensors. NOTE(review): presumably boundary
//             offsets/indices describing where data lands in each entry of
//             `dsts` -- confirm layout and dtype.
// NOTE(review): the "_async" suffix suggests completion is not guaranteed on
// return -- confirm which synchronize call must follow.
void scatter_async_cuda(int pid, torch::Tensor src,
                        std::vector<torch::Tensor> dsts,
                        std::vector<torch::Tensor> bndries);

// Declaration only; the definition is not in this header.
//   src             - input tensor.
//   optional_offset - optional tensor; callers may pass an empty optional.
//   optional_count  - optional tensor; callers may pass an empty optional.
//   index           - tensor. NOTE(review): presumably selects which elements
//                     or rows of `src` are read -- confirm dtype and meaning.
//   dst             - output tensor.
//   buffer          - tensor. NOTE(review): presumably scratch/staging space
//                     used during the transfer -- confirm size requirements.
// NOTE(review): how the optional offset/count interact with `index`, and what
// happens when they are absent, is not visible here -- check the
// implementation before relying on either form.
void read_async_cuda(torch::Tensor src,
                     torch::optional<torch::Tensor> optional_offset,
                     torch::optional<torch::Tensor> optional_count,
                     torch::Tensor index, torch::Tensor dst,
                     torch::Tensor buffer);
// Declaration only; the definition is not in this header. Unlike
// read_async_cuda, `offset` and `count` are mandatory tensors here.
//   src    - input tensor.
//   offset - tensor. NOTE(review): presumably per-segment start positions --
//            confirm dtype and units.
//   count  - tensor. NOTE(review): presumably per-segment lengths -- confirm
//            dtype and units.
//   dst    - output tensor.
// NOTE(review): the "_async" suffix suggests completion is not guaranteed on
// return -- confirm which synchronize call must follow.
void write_async_cuda(torch::Tensor src, torch::Tensor offset,
                      torch::Tensor count, torch::Tensor dst);
                      
// Declaration only; the definition is not in this header. Takes no
// offset/count/index arguments, in contrast to write_async_cuda.
//   src - input tensor.
//   dst - output tensor.
// NOTE(review): presumably copies `src` into `dst` as one contiguous range --
// confirm shape/contiguity requirements in the implementation.
void contiguous_write_async_cuda(torch::Tensor src, torch::Tensor dst);

// Declaration only; the definition is not in this header.
//   src - input tensor.
//   dst - output tensor.
// NOTE(review): "conti" presumably abbreviates "contiguous", and
// "with_reduction" suggests `src` is combined into the existing contents of
// `dst` rather than overwriting them -- the reduction operator (e.g. sum) is
// not visible here; confirm in the implementation.
void conti_write_with_reduction_async_cuda(torch::Tensor src, torch::Tensor dst);

// Declaration only; the definition is not in this header.
//   src   - input tensor.
//   dst   - output tensor.
//   index - tensor. NOTE(review): presumably maps entries of `src` to
//           positions in `dst` -- confirm dtype and whether duplicate indices
//           are allowed.
// NOTE(review): "with_reduction" suggests values are combined into the
// existing contents of `dst`; the reduction operator is not visible here --
// confirm in the implementation.
void write_with_reduction_async_cuda(torch::Tensor src, torch::Tensor dst, torch::Tensor index);