
#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
void kernel_run(int rows, int cols, const double *a, const double *b,
                double eps, double *y) {

  for (int r = 0; r < rows; r++) {
    double dot = 0.0;
    double na2 = 0.0;
    double nb2 = 0.0;
    for (int c = 0; c < cols; c++) {
      int idx = r * cols + c;
      double va = a[idx];
      double vb = b[idx];
      dot += va * vb;
      na2 += va * va;
      nb2 += vb * vb;
    }
    double denom = sqrt(na2) * sqrt(nb2) + eps;
    y[r] = dot / denom;
  }
}
// Benchmark entry for kernel_run, produced by the BENCH_MAIN_ARRAY3_D macro.
// The macro is not defined in this file (presumably bench_harness.h - TODO
// confirm), so the role names below are inferred from the arguments and
// should be checked against its definition.
//
// Macro arguments are split at top-level commas, NOT at line breaks, and the
// current formatting does not line up with those boundaries. As written, the
// invocation passes 11 arguments:
//   1-2.  T002_Ops_051, OP81          identifiers
//   3-5.  16384, 32768, 65536         three sizes; each equals rows * cols for
//                                     the matching rows value (128/256/512 * 128)
//   6.    declarations                rows, cols, a, b, y_rows, eps
//   7.    input fill                  the two bench_fill_array calls
//   8.    kernel call                 kernel_run(rows, cols, a, b, eps, y_rows)
//   9-10. y_rows, (size_t)rows        result buffer and its element count
//   11.   cleanup                     free(a); free(b); free(y_rows)
//
// `case_id` and `n` are not declared here; presumably the macro expansion
// provides them - verify.
// NOTE(review): the malloc results are not checked for NULL in these
// arguments; confirm whether the macro handles allocation failure.
BENCH_MAIN_ARRAY3_D(
    T002_Ops_051, OP81, 16384, 32768, 65536,
    // Argument 6 starts here: rows is chosen from case_id, cols is fixed at 128.
    int rows = (case_id == 1 ? 128 : (case_id == 2 ? 256 : 512));
    int cols = 128; double *a = malloc((size_t)n * sizeof(double));
    double *b = malloc((size_t)n * sizeof(double));
    // One output value per row.
    double *y_rows = malloc((size_t)rows * sizeof(double));
    // The comma after 1e-9 ends argument 6; the fill calls form argument 7.
    // b is filled with the same base seed as a, XORed with 0x111u.
    double eps = 1e-9, bench_fill_array(a, (size_t)n, bench_seed(81));
    bench_fill_array(b, (size_t)n, bench_seed(81) ^ 0x111u);
    // The leading comma ends argument 7; arguments 8-11 follow.
    , kernel_run(rows, cols, a, b, eps, y_rows), y_rows, (size_t)rows, free(a);
    free(b); free(y_rows))
