


#include <stdio.h>
// Number of threads per block used for the 1-D kernel launch in this file.
// NOTE(review): 16 is smaller than a 32-thread warp; a multiple of 32 is the
// usual choice -- confirm nothing else depends on this value before changing it.
#define THREADS_PER_BLOCK 16
// Ceiling division: m / n rounded up whenever the remainder is positive
// (intended for non-negative m and positive n).
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

// Uncomment to make the launcher call cudaDeviceSynchronize() right after the
// kernel launch (needed to see device-side printf output).
// #define DEBUG
// Small epsilon constant.
// NOTE(review): not referenced anywhere in the visible part of this file; the
// dissonance code spells out its own 1e-8 epsilon -- confirm whether this is
// still needed.
const float EPS = 1e-8;


// Balance term between two belief values:
//
//     1 - |bj - bk| / (bj + bk)
//
// Returns exactly 0 when the smaller of the two values is exactly zero, which
// also keeps the 0/0 case (both inputs zero) out of the division.
// NOTE(review): the name presumably abbreviates "relative mass balance
// function" -- confirm against the reference formulation.
//
// The explicit single-precision fminf/fabsf and the float literals guarantee
// that no double-precision promotion or integer abs() overload is picked up.
// Marked __host__ __device__ so the same formula can be reused for CPU-side
// reference checks; existing device callers are unaffected.
__host__ __device__ inline float rmbf(const float bj, const float bk){
    if (fminf(bj, bk) == 0.0f)
        return 0.0f;

    return 1.0f - fabsf(bj - bk) / (bj + bk);
}


// Dissonance of a single belief vector of length K:
//
//     sum_k  bk * ( sum_{j != k} bj * rmbf(bj, bk) ) / ( sum_i b_i - bk + eps )
//
// belief : pointer to K floats; entries 0..K-1 are read, nothing is written.
// K      : number of entries. For K <= 0 no loop body runs and 0 is returned.
//
// Entries with bk exactly equal to 1 are skipped and contribute nothing
// (presumably because the remaining mass in the denominator is then ~0 --
// TODO confirm the intent).
//
// All arithmetic is kept in single precision: the float-suffixed literals
// avoid the silent promotion to double that unsuffixed 0.0 / 1.0 / 1e-8 cause.
__device__ inline float diss(const float *belief, const int K){

    // Guards the division when the remaining mass (sum - bk) is zero.
    const float eps = 1e-8f;

    // Total mass of the vector; loop-invariant for the per-k terms below.
    float sum_of_belief = 0.0f;
    for(int i = 0; i < K; i++){
        sum_of_belief += belief[i];
    }

    float dissonance = 0.0f;
    for(int k = 0; k < K; k++){
        const float bk = belief[k];
        if (bk == 1.0f)
            continue;

        // Balance-weighted sum of every other belief against bk.
        float numerator = 0.0f;
        for(int j = 0; j < K; j++){
            if(j == k)
                continue;

            const float bj = belief[j];
            numerator += bj * rmbf(bj, bk);
        }
        numerator *= bk;

        // Mass of all beliefs except bk, plus eps.
        const float denominator = sum_of_belief - bk + eps;
        dissonance += numerator / denominator;
    }

    return dissonance;

}

// Computes one dissonance value per belief vector, one thread per vector.
//
// num            : number of belief vectors.
// belief         : num consecutive vectors of K floats each; vector i starts
//                  at belief + i * K. Must be dereferenceable from device code.
// ans_dissonance : output, entry i receives the dissonance of vector i
//                  (entries 0..num-1 are written).
// K              : length of each belief vector.
//
// Launch layout: 1-D grid of 1-D blocks (only the .x components are used);
// the grid must provide at least num threads. Threads with index >= num do
// nothing. No shared memory is used.
__global__ void dissonance_kernel(const int num, const float *belief, float *ans_dissonance, const int K){
    // 64-bit index: the unsigned 32-bit blockIdx.x * blockDim.x product can
    // exceed INT_MAX in the last block, and narrowing it to int would turn it
    // negative and slip past the bounds check below.
    const long long tid = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if(tid < num)
    {
        // tid * K is evaluated in 64 bits so the element offset cannot
        // overflow when num * K exceeds 2^31.
        const float * cur_belief = belief + tid * K;
        ans_dissonance[tid] = diss(cur_belief, K);
    }
}


// Host entry point: launches dissonance_kernel over num belief vectors.
//
// num            : number of belief vectors; nothing is launched when num <= 0.
// belief         : input passed straight to the kernel (num * K floats).
// ans_dissonance : output passed straight to the kernel (num floats).
// K              : length of each belief vector.
//
// The kernel is launched on the default stream with a 1-D grid of
// DIVUP(num, THREADS_PER_BLOCK) blocks of THREADS_PER_BLOCK threads. The call
// returns without waiting for the kernel unless DEBUG is defined.
void dissonanceLauncher(const int num, const float *belief, float *ans_dissonance, const int K){

    // An empty batch would yield a zero-block grid, which the runtime rejects
    // as an invalid launch configuration; there is nothing to compute anyway.
    if (num <= 0)
        return;

    dim3 blocks(DIVUP(num, THREADS_PER_BLOCK));  // 1-D grid: one thread per belief vector
    dim3 threads(THREADS_PER_BLOCK);

    dissonance_kernel<<<blocks, threads>>>(num, belief, ans_dissonance, K);

    // Report launch problems instead of dropping them silently. Peek (rather
    // than Get) leaves the error state intact for callers that run their own
    // cudaGetLastError() check afterwards.
    const cudaError_t launch_err = cudaPeekAtLastError();
    if (launch_err != cudaSuccess)
        fprintf(stderr, "CUDA error after dissonance_kernel launch: %s\n", cudaGetErrorString(launch_err));
#ifdef DEBUG
    // for using printf in kernel function; also surfaces in-kernel faults here
    const cudaError_t sync_err = cudaDeviceSynchronize();
    if (sync_err != cudaSuccess)
        fprintf(stderr, "CUDA error while running dissonance_kernel: %s\n", cudaGetErrorString(sync_err));
#endif
}