
#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
void kernel_run(int C, int H, int W, const double *x, const double *gamma,
                const double *beta, double eps, double *y) {
  int HW = H * W;

  for (int c = 0; c < C; c++) {
    double mean = 0.0;
    for (int hw = 0; hw < HW; hw++) {
      mean += x[c * HW + hw];
    }
    mean /= (double)HW;
    double var = 0.0;
    for (int hw = 0; hw < HW; hw++) {
      double d = x[c * HW + hw] - mean;
      var += d * d;
    }
    var /= (double)HW;
    double inv_std = 1.0 / sqrt(var + eps);
    double g = gamma[c];
    double b = beta[c];
    for (int hw = 0; hw < HW; hw++) {
      double v = (x[c * HW + hw] - mean) * inv_std;
      y[c * HW + hw] = v * g + b;
    }
  }
}
/*
 * Benchmark entry point, generated by BENCH_MAIN_ARRAY3_D (defined in a
 * project header not visible here -- the role of each argument below is
 * inferred from its contents; confirm against the macro definition).
 *
 * Top-level arguments, in order:
 *   1-2.  T002_Ops_043, OP73      identifiers passed to the harness.
 *   3-5.  4096, 16384, 65536      equal C * H * W for C = 4, 16, 64 with
 *                                 H = W = 32, i.e. the three element counts
 *                                 selected below.
 *   6.    declarations            H, W, C (chosen from case_id, which is not
 *                                 declared here and is presumably supplied by
 *                                 the macro), the x / y / gamma / beta buffers
 *                                 and eps = 1e-5.
 *   7.    input initialisation    bench_fill_array on x, gamma and beta, each
 *                                 with a different value derived from
 *                                 bench_seed(73).
 *   8.    the call under test     kernel_run(...).
 *   9-10. y, (size_t)(C * H * W)  the output buffer and its element count.
 *   11.   cleanup                 free() of all four buffers.
 *
 * NOTE(review): the malloc() results are used without a NULL check.
 */
BENCH_MAIN_ARRAY3_D(T002_Ops_043, OP73, 4096, 16384, 65536, int H = 32;
                    int W = 32;
                    int C = (case_id == 1 ? 4 : (case_id == 2 ? 16 : 64));
                    double *x = malloc((size_t)(C * H * W) * sizeof(double));
                    double *y = malloc((size_t)(C * H * W) * sizeof(double));
                    double *gamma = malloc((size_t)C * sizeof(double));
                    double *beta = malloc((size_t)C * sizeof(double));
                    double eps = 1e-5,
                    bench_fill_array(x, (size_t)(C * H * W), bench_seed(73));
                    bench_fill_array(gamma, (size_t)C, bench_seed(73) ^ 0x1u);
                    bench_fill_array(beta, (size_t)C, bench_seed(73) ^ 0x2u);
                    , kernel_run(C, H, W, x, gamma, beta, eps, y), y,
                    (size_t)(C * H * W), free(x);
                    free(y); free(gamma); free(beta))
