#pragma once
// Element-wise product of two matrices.


#include <cstdint>

#include <cuda_runtime.h>

#include <gpu/contexts/device_context.h>
#include <gpu/containers/dense_matrix.h>


namespace npeff {
namespace gpu {
namespace ops {
namespace custom {


// Element-wise product of two matrices, packaged as a reusable operation.
//
// An instance only stores references to the context and the three matrices;
// it owns none of them, so all four must outlive the instance. Because the
// members are references and const values, instances cannot be reassigned.
//
// The constructor verifies that `left`, `right` and `out` have matching
// n_rows / n_cols. The computation itself lives in call_async(), whose
// definition is not part of this header.
class HadamardProduct {
private:
    // Context supplied by the caller; its use is determined by call_async().
    DeviceContext& ctx;

    // NOTE(review): presumably `left` and `right` are the inputs and `out`
    // receives the result -- confirm against the call_async() definition.
    DenseMatrix& left;
    DenseMatrix& right;
    DenseMatrix& out;

    // out.n_rows * out.n_cols, captured once at construction time.
    const int64_t n_elements;

    // TODO: Figure out how to set this.
    // NOTE(review): presumably the per-block thread count used when
    // call_async() launches its work -- confirm.
    const int64_t block_size = 256;

public:
    // Binds the operation to a context and to its three matrices.
    //
    // `n_elements` is computed from `out` in the initializer list, i.e. before
    // the shape checks in the body run. Each check goes through the
    // project-defined THROW_IF_FALSE macro (presumably throwing when its
    // condition is false); rows are checked before columns, and `left`
    // before `right`.
    HadamardProduct(DeviceContext& ctx, DenseMatrix& left, DenseMatrix& right, DenseMatrix& out)
        : ctx(ctx),
          left(left),
          right(right),
          out(out),
          n_elements(out.n_rows * out.n_cols)
    {
        // Both operands must have exactly the shape of `out`.
        THROW_IF_FALSE(left.n_rows == out.n_rows);
        THROW_IF_FALSE(right.n_rows == out.n_rows);
        THROW_IF_FALSE(left.n_cols == out.n_cols);
        THROW_IF_FALSE(right.n_cols == out.n_cols);
    }

    // Runs the operation; declared here, defined elsewhere.
    // NOTE(review): the name suggests the call returns before the work has
    // finished -- confirm the synchronization contract with the definition.
    void call_async();
};


}  // custom
}  // ops
}  // gpu
}  // npeff
