
#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void kernel_run(int R, int D, const double *logits, const int32_t *target,
                double *loss) {

  for (int r = 0; r < R; r++) {
    const double *x = logits + r * D;
    int t = target[r];
    double m = x[0];
    for (int i = 1; i < D; i++)
      if (x[i] > m)
        m = x[i];
    double s = 0.0;
    for (int i = 0; i < D; i++)
      s += exp(x[i] - m);
    double logZ = log(s) + m;
    loss[r] = logZ - x[t];
  }
}
/*
 * Benchmark driver for kernel_run, generated by the BENCH_MAIN_ARRAY3_D macro
 * (not defined in this file -- presumably in bench_harness.h; the argument
 * roles described below are inferred from how each argument is written and
 * should be confirmed against the macro definition).
 *
 * Arguments, in order:
 *   - T002_Ops_024, OP24: identifiers passed through to the macro.
 *   - 4096, 8192, 16384: three numeric arguments, presumably the problem
 *     sizes bound to `n`.
 *   - declarations: D, R and the three heap buffers.
 *   - initialisation block: fills logits and tgt.
 *   - the kernel call.
 *   - loss, R: presumably the result array and its element count.
 *   - cleanup statements freeing the three buffers.
 *
 * NOTE(review): none of the malloc results are checked before use.
 */
BENCH_MAIN_ARRAY3_D(
    T002_Ops_024, OP24, 4096, 8192, 16384, int D = 64; int R = n / D;
    /* `n` is not declared in this file; presumably supplied by the macro.
     * R = n / D is integer division, so any remainder of n is dropped. */
    /* logits holds R * D doubles; tgt and loss hold one entry per row. */
    double *logits = malloc((size_t)R * D * sizeof(double));
    int32_t *tgt = malloc((size_t)R * sizeof(int32_t));
    double *loss = malloc((size_t)R * sizeof(double)),
    {
      bench_fill_array(logits, (size_t)R * D, bench_seed(24));
      for (int r = 0; r < R; r++) {
        /* The cast binds tighter than %, so this is
         * ((int32_t)(bench_seed(24) ^ r)) % D. */
        tgt[r] = (int32_t)(bench_seed(24) ^ r) % D;
        /* A negative remainder is clamped to 0 so that tgt[r] is never a
         * negative column index when kernel_run reads x[tgt[r]]. */
        if (tgt[r] < 0)
          tgt[r] = 0;
      }
    },
    kernel_run(R, D, logits, tgt, loss), loss, R, free(logits);
    free(tgt); free(loss))
