
#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void kernel_run(int R, int D, const double *a, const double *b, const double *g,
                const double *t, double *y, double eps) {

  for (int r = 0; r < R; r++) {
    double mean = 0.0;
    for (int i = 0; i < D; i++) {
      double z = a[r * D + i] + b[r * D + i];
      mean += z;
      y[r * D + i] = z;
    }
    mean /= D;
    double var = 0.0;
    for (int i = 0; i < D; i++) {
      double d = y[r * D + i] - mean;
      var += d * d;
    }
    var /= D;
    double inv = 1.0 / sqrt(var + eps);
    for (int i = 0; i < D; i++) {
      double z = (y[r * D + i] - mean) * inv;
      y[r * D + i] = z * g[i] + t[i];
    }
  }
}
/*
 * Benchmark entry for kernel_run, built by the BENCH_MAIN_ARRAY3_D macro.
 * The macro is not defined in this file (presumably bench_harness.h), so how
 * the arguments below are spliced together is an assumption to confirm there.
 *
 * Arguments as written here, in order:
 *   - T002_Ops_020, OP20: identifiers passed to the harness.
 *   - 4096, 8192, 16384: presumably the problem sizes that `n` takes;
 *     `n` is not declared here and is assumed to come from the macro.
 *   - declarations: D = 64 columns, R = n / D rows, and the heap buffers
 *     a, b, y (R * D doubles each) and g, t (D doubles each).
 *   - initialization: a, b, g and t are filled by bench_fill_array with seeds
 *     derived from bench_seed(20); y is left uninitialized and is written by
 *     the kernel.
 *   - the kernel call, with eps = 1e-5.
 *   - y and (size_t)R * D: presumably the result array and its element count.
 *   - cleanup: frees all five buffers.
 *
 * NOTE(review): the malloc results are not checked before use.
 */
BENCH_MAIN_ARRAY3_D(T002_Ops_020, OP20, 4096, 8192, 16384, int D = 64;
                    /* Integer division: each listed size is a multiple of 64. */
                    int R = n / D;
                    double *a = malloc((size_t)R * D * sizeof(double));
                    double *b = malloc((size_t)R * D * sizeof(double));
                    double *y = malloc((size_t)R * D * sizeof(double));
                    double *g = malloc(D * sizeof(double));
                    double *t = malloc(D * sizeof(double)),
                    /* A different XOR constant gives each buffer its own seed. */
                    bench_fill_array(a, (size_t)R *D, bench_seed(20) ^ 1);
                    bench_fill_array(b, (size_t)R *D, bench_seed(20) ^ 2);
                    bench_fill_array(g, D, bench_seed(20) ^ 3);
                    bench_fill_array(t, D, bench_seed(20) ^ 4);
                    , kernel_run(R, D, a, b, g, t, y, 1e-5), y, (size_t)R *D,
                    free(a);
                    free(b); free(y); free(g); free(t))
