
#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
void kernel_run(int rows, int cols, const double *a, const double *b,
                const double *gamma, double eps, double *y) {

  for (int r = 0; r < rows; r++) {
    double sqsum = 0.0;
    for (int c = 0; c < cols; c++) {
      int idx = r * cols + c;
      double tmp = a[idx] + b[idx];
      sqsum += tmp * tmp;
    }
    double mean_sq = sqsum / (double)cols;
    double inv_rms = 1.0 / sqrt(mean_sq + eps);
    for (int c = 0; c < cols; c++) {
      int idx = r * cols + c;
      double tmp = a[idx] + b[idx];
      double nr = tmp * inv_rms;
      y[idx] = nr * gamma[c];
    }
  }
}
/*
 * Benchmark driver for kernel_run, produced by the BENCH_MAIN_ARRAY3_D macro.
 * The macro is not defined in this file (presumably bench_harness.h -- TODO
 * confirm), so the role of each argument below is inferred from its content.
 *
 * NOTE: the line layout is misleading. Macro arguments are separated by
 * top-level commas only, so this invocation passes 11 arguments:
 *    1-2.  T002_Ops_041, OP71
 *    3-5.  16384, 32768, 65536 -- these equal rows * cols for the three row
 *          counts selected below (128, 256, 512 with cols = 128); presumably
 *          the per-case value of n -- confirm against the macro.
 *    6.    the declarations of rows, cols, a, b, gamma, y and eps
 *          (this argument ends at "eps = 1e-5", NOT at the end of that line)
 *    7.    the three bench_fill_array calls
 *    8.    the kernel_run call
 *    9-10. y and (size_t)n
 *    11.   the four free calls
 *
 * `n` and `case_id` are not declared here; presumably they are supplied by the
 * macro expansion -- confirm.
 *
 * NOTE(review): the malloc results are used without a NULL check in this
 * invocation.
 */
BENCH_MAIN_ARRAY3_D(
    T002_Ops_041, OP71, 16384, 32768, 65536,
    /* Argument 6 starts here: local declarations and buffer allocation. */
    int rows = (case_id == 1 ? 128 : (case_id == 2 ? 256 : 512));
    int cols = 128; double *a = malloc((size_t)n * sizeof(double));
    double *b = malloc((size_t)n * sizeof(double));
    double *gamma = malloc((size_t)cols * sizeof(double));
    double *y = malloc((size_t)n * sizeof(double));
    /* The comma after 1e-5 ends argument 6; the fill calls form argument 7. */
    double eps = 1e-5, bench_fill_array(a, (size_t)n, bench_seed(71));
    bench_fill_array(b, (size_t)n, bench_seed(71) ^ 0xF00u);
    bench_fill_array(gamma, (size_t)cols, bench_seed(71) ^ 0xBADu);
    /* Arguments 8-11: kernel call, output buffer, its length, then cleanup. */
    , kernel_run(rows, cols, a, b, gamma, eps, y), y, (size_t)n, free(a);
    free(b); free(gamma); free(y))
