
#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void kernel_run(int R, int D, int G, const double *x, const double *g,
                const double *b, double *y, double eps) {
  int group_sz = D / G;

  for (int r = 0; r < R; r++) {
    const double *xr = x + r * D;
    double *yr = y + r * D;
    for (int gidx = 0; gidx < G; gidx++) {
      int off = gidx * group_sz;
      double mean = 0.0;
      for (int i = 0; i < group_sz; i++)
        mean += xr[off + i];
      mean /= group_sz;
      double var = 0.0;
      for (int i = 0; i < group_sz; i++) {
        double d = xr[off + i] - mean;
        var += d * d;
      }
      var /= group_sz;
      double inv = 1.0 / sqrt(var + eps);
      for (int i = 0; i < group_sz; i++) {
        int j = off + i;
        yr[j] = (xr[j] - mean) * inv * g[j] + b[j];
      }
    }
  }
}
/*
 * Benchmark driver for kernel_run, produced by BENCH_MAIN_ARRAY3_D.
 *
 * The macro is defined outside this file (presumably in bench_harness.h), so
 * its expansion is not visible here. The role given to each argument below is
 * inferred from the argument's contents - TODO confirm against the macro.
 *
 * Arguments, in order:
 *   - T002_Ops_011, OP11: identifiers handed to the harness.
 *   - 4096, 8192, 16384: presumably the values that `n` takes.
 *   - Setup declarations: D = 64 elements per row, R = n / D rows, G = 8
 *     groups (so kernel_run works on groups of D / G = 8 elements). x and y
 *     each hold R * D doubles; gamma and beta each hold D doubles. `n` is not
 *     declared in this file and is presumably provided by the macro. The last
 *     declaration has no trailing `;`, so the macro presumably supplies it.
 *   - Input fill: x, gamma and beta are filled by bench_fill_array using
 *     bench_seed(11), bench_seed(11) ^ 0x55 and bench_seed(11) ^ 0x66
 *     respectively. y is not filled before the kernel call.
 *   - Kernel call: kernel_run(R, D, G, x, gamma, beta, y, 1e-5).
 *   - y, (size_t)R * D: presumably the result array and its element count.
 *   - Cleanup: frees x, y, gamma and beta.
 *
 * NOTE(review): the four malloc results are used without a NULL check.
 */
BENCH_MAIN_ARRAY3_D(T002_Ops_011, OP11, 4096, 8192, 16384, int D = 64;
                    int R = n / D; int G = 8;
                    double *x = malloc((size_t)R * D * sizeof(double));
                    double *y = malloc((size_t)R * D * sizeof(double));
                    double *gamma = malloc(D * sizeof(double));
                    double *beta = malloc(D * sizeof(double)),
                    bench_fill_array(x, (size_t)R *D, bench_seed(11));
                    bench_fill_array(gamma, D, bench_seed(11) ^ 0x55);
                    bench_fill_array(beta, D, bench_seed(11) ^ 0x66);
                    , kernel_run(R, D, G, x, gamma, beta, y, 1e-5), y,
                    (size_t)R *D, free(x);
                    free(y); free(gamma); free(beta))
