#include "./elwise_square.h"

namespace npeff {
namespace gpu {
namespace ops {
namespace custom {


// Element-wise square: for each index i produced by INDEX_STRIDE_1D(n, i),
// stores in_data[i] * in_data[i] into out_data[i].
//
//   n        - element count handed to INDEX_STRIDE_1D.
//   in_data  - values to square (read only by this kernel).
//   out_data - destination; slot i receives the square of in_data[i].
//
// NOTE(review): INDEX_STRIDE_1D is defined outside this file; presumably it
// iterates i over [0, n) with a grid-wide stride -- confirm against the macro.
__global__ void ElwiseSquare_Kernel(int64_t n, float* in_data, float* out_data) {
    INDEX_STRIDE_1D(n, i) {
        // Load once, then square; element i is read before slot i is written.
        const float value = in_data[i];
        out_data[i] = value * value;
    }
}


// Enqueues ElwiseSquare_Kernel on ctx.stream for n_elements values, reading
// from in_data and writing to out_data (both reinterpreted as float*).
//
// The function only enqueues work: it does not synchronize the stream, so the
// results are not guaranteed to be ready when it returns.
//
// NOTE(review): no launch-error check (cudaGetLastError) is performed here;
// confirm whether the caller or the surrounding framework is expected to do it.
void ElwiseSquare::call_async() {
    ctx.set_device();

    // With nothing to process the ceil-div below would yield a zero-sized
    // grid, which the runtime rejects with cudaErrorInvalidConfiguration and
    // leaves as a pending error for some later, unrelated check. Skip the
    // launch instead.
    if (n_elements <= 0) {
        return;
    }

    // Ceiling division so a partially filled trailing block is still launched.
    // int64_t (rather than long) keeps the arithmetic 64-bit on every platform
    // and matches the kernel's element-count parameter.
    const int64_t n_blocks = (n_elements + block_size - 1) / block_size;

    ElwiseSquare_Kernel<<<n_blocks, block_size, 0, ctx.stream>>>(
        n_elements, (float*) in_data, (float*) out_data
    );
}


}  // custom
}  // ops
}  // gpu
}  // npeff
