
#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
void kernel_run(int rows, int cols, const double *x, double eps, double *y) {
  double *mean = (double *)malloc((size_t)cols * sizeof(double));
  double *var = (double *)malloc((size_t)cols * sizeof(double));

  for (int c = 0; c < cols; c++) {
    double m = 0.0;
    for (int r = 0; r < rows; r++) {
      m += x[r * cols + c];
    }
    m /= (double)rows;
    mean[c] = m;
    double v = 0.0;
    for (int r = 0; r < rows; r++) {
      double d = x[r * cols + c] - m;
      v += d * d;
    }
    v /= (double)rows;
    var[c] = v;
  }
  for (int r = 0; r < rows; r++) {
    for (int c = 0; c < cols; c++) {
      int idx = r * cols + c;
      double inv_std = 1.0 / sqrt(var[c] + eps);
      y[idx] = (x[idx] - mean[c]) * inv_std;
    }
  }
  free(mean);
  free(var);
}
/*
 * Benchmark driver for kernel_run, produced by the BENCH_MAIN_ARRAY3_D macro
 * (not defined in this file; presumably from bench_harness.h -- confirm).
 *
 * The three sizes 16384, 32768 and 65536 equal rows * cols for the three row
 * counts selected from case_id below (128, 256, 512) with cols = 128.
 * `n` and `case_id` are not declared here; presumably the macro expansion
 * provides them -- confirm against the macro definition.
 *
 * NOTE(review): the top-level comma after `double eps = 1e-5` is treated by
 * the preprocessor as a macro-argument separator, so the
 * bench_fill_array(...) call begins a new argument rather than continuing the
 * declaration. Keep that comma placement when reformatting.
 *
 * x and y are obtained from malloc with no NULL check; the final argument
 * contains the matching free(x); free(y).
 */
BENCH_MAIN_ARRAY3_D(
    T002_Ops_050, OP80, 16384, 32768, 65536,
    int rows = (case_id == 1 ? 128 : (case_id == 2 ? 256 : 512));
    int cols = 128; double *x = malloc((size_t)n * sizeof(double));
    double *y = malloc((size_t)n * sizeof(double));
    double eps = 1e-5, bench_fill_array(x, (size_t)n, bench_seed(80));
    , kernel_run(rows, cols, x, eps, y), y, (size_t)n, free(x); free(y))
