
#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
void kernel_run(int S, int D, const double *Q, const double *K,
                double *Scores) {
  double scale = 1.0 / sqrt((double)D);

  for (int i = 0; i < S; i++) {
    for (int j = 0; j < S; j++) {
      double acc = 0.0;
      for (int d = 0; d < D; d++) {
        acc += Q[i * D + d] * K[j * D + d];
      }
      Scores[i * S + j] = acc * scale;
    }
  }
}
/*
 * Benchmark driver produced by the BENCH_MAIN_ARRAY3_D macro. The macro is
 * not defined in this file (presumably it comes from bench_harness.h), so the
 * exact role of each argument should be confirmed against its definition.
 *
 * As read from the arguments below:
 *   - T002_Ops_052, OP82: identifiers handed to the harness.
 *   - 4096, 16384, 65536: these equal S * S for S = 64, 128 and 256, so they
 *     appear to be the per-case output element counts -- TODO confirm.
 *   - Setup: D is fixed at 64; S is 64 when case_id == 1, 128 when
 *     case_id == 2, and 256 otherwise. case_id is not declared here, so it
 *     must be supplied by the macro expansion. Q and K (S * D doubles each)
 *     and Scores (S * S doubles) are allocated with malloc; the results are
 *     not checked for NULL at this call site.
 *   - Init: Q is filled by bench_fill_array with bench_seed(82), K with
 *     bench_seed(82) ^ 0x222u. Scores is not pre-filled.
 *   - Kernel: kernel_run(S, D, Q, K, Scores).
 *   - Result: Scores and its element count (size_t)(S * S) are passed to the
 *     harness.
 *   - Teardown: Q, K and Scores are freed.
 */
BENCH_MAIN_ARRAY3_D(T002_Ops_052, OP82, 4096, 16384, 65536, int D = 64;
                    int S = (case_id == 1 ? 64 : (case_id == 2 ? 128 : 256));
                    double *Q = malloc((size_t)(S * D) * sizeof(double));
                    double *K = malloc((size_t)(S * D) * sizeof(double));
                    double *Scores = malloc((size_t)(S * S) * sizeof(double)),
                    bench_fill_array(Q, (size_t)(S * D), bench_seed(82));
                    bench_fill_array(K, (size_t)(S * D),
                                     bench_seed(82) ^ 0x222u);
                    , kernel_run(S, D, Q, K, Scores), Scores, (size_t)(S * S),
                    free(Q);
                    free(K); free(Scores))
