#include <cuda_runtime.h>
#include <stdio.h>

extern "C++" {
    /**
     * Scatter-add of a flat value vector into a flat output buffer.
     *
     * Each thread handles one flat element `idx` in [0, N*k):
     *   - n = idx / k + 1 is the 1-based group index derived from the position,
     *   - m = m_indices[idx], truncated toward zero, is the 1-based target index,
     *   - quad_forms_vector[idx] is atomically added to
     *     output[(n - 1) * M + (m - 1)].
     * Elements whose m falls outside [1, M] (including NaN) are skipped.
     *
     * Launch: only the x dimension of the grid/block is used, and each thread
     * processes a single element, so the launch must provide at least N*k
     * threads along x.
     *
     * @param m_indices          1-based target indices stored as doubles; read at [0, N*k).
     * @param quad_forms_vector  values to accumulate; read at [0, N*k).
     * @param output             accumulation buffer; written at offsets up to N*M - 1.
     *                           The kernel only adds to it and never clears it.
     * @param M                  upper bound (inclusive) for valid m indices.
     * @param N                  number of groups.
     * @param k                  number of consecutive elements per group.
     */
    __global__ void accumArraySpecific(const double* m_indices, const float* quad_forms_vector,
                                      float* output,   int M, int N, int k) {
        // Degenerate shapes never produce a write; bail out before any arithmetic
        // so the 64-bit products below are always formed from positive operands.
        if (M <= 0 || N <= 0 || k <= 0) {
            return;
        }

        // 64-bit arithmetic: N*k, the global thread index, and the output offset
        // can each exceed the range of a 32-bit int.
        const long long total_size = (long long)N * k;
        const long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
        if (idx >= total_size) {
            return;
        }

        // idx < N*k with k > 0 guarantees 1 <= n_index <= N, so no range check is needed.
        const long long n_index = idx / k + 1;

        // Validate in the double domain *before* converting: casting NaN or an
        // out-of-range double to an integer is undefined. The half-open test
        // [1, M + 1) matches "truncate toward zero, then require 1 <= m <= M".
        const double m_raw = m_indices[idx];
        if (!(m_raw >= 1.0 && m_raw < (double)M + 1.0)) {
            return;
        }
        const long long m_index = (long long)m_raw;

        const long long linear_index = (n_index - 1) * M + (m_index - 1);

        // Several elements may map to the same output slot, hence the atomic add.
        atomicAdd(&output[linear_index], quad_forms_vector[idx]);
    }
    
    
    /**
     * Scatter-add of a flat value vector into a flat output buffer, with the
     * number of elements to process passed explicitly as `total_size`.
     *
     * Each thread handles one flat element `idx` in [0, total_size):
     *   - n = idx / k + 1 is the 1-based group index derived from the position,
     *   - m = m_indices[idx], truncated toward zero, is the 1-based target index,
     *   - quad_forms_vector[idx] is atomically added to
     *     output[(n - 1) * M + (m - 1)].
     * Elements with n outside [1, N] or m outside [1, M] (including NaN) are skipped.
     *
     * Launch: only the x dimension of the grid/block is used, and each thread
     * processes a single element, so the launch must provide at least
     * total_size threads along x.
     *
     * @param m_indices          1-based target indices stored as doubles; read at [0, total_size).
     * @param quad_forms_vector  values to accumulate; read at [0, total_size).
     * @param output             accumulation buffer; written at offsets up to N*M - 1.
     *                           The kernel only adds to it and never clears it.
     * @param total_size         number of flat elements to process.
     * @param M                  upper bound (inclusive) for valid m indices.
     * @param N                  upper bound (inclusive) for valid n indices.
     * @param k                  divisor mapping a flat position to its group.
     */
    __global__ void accumArrayDebug(const double* m_indices, const float* quad_forms_vector,
                                   float* output, int total_size, int M, int N, int k) {
        // total_size is independent of k here, so k == 0 would reach the integer
        // division below, which is undefined. There is nothing meaningful to do.
        if (k == 0) {
            return;
        }

        // 64-bit arithmetic: the global thread index and the output offset can
        // exceed the range of a 32-bit int.
        const long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
        if (idx >= total_size) {
            return;
        }

        // total_size is not tied to N*k, so n_index must be range-checked.
        const long long n_index = idx / k + 1;
        if (n_index < 1 || n_index > N) {
            return;
        }

        // Validate in the double domain *before* converting: casting NaN or an
        // out-of-range double to an integer is undefined. The half-open test
        // [1, M + 1) matches "truncate toward zero, then require 1 <= m <= M".
        const double m_raw = m_indices[idx];
        if (!(m_raw >= 1.0 && m_raw < (double)M + 1.0)) {
            return;
        }
        const long long m_index = (long long)m_raw;

        const long long linear_index = (n_index - 1) * M + (m_index - 1);

        // Several elements may map to the same output slot, hence the atomic add.
        atomicAdd(&output[linear_index], quad_forms_vector[idx]);
    }
}