#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Row-major flat index of element (i, j) in a matrix with n columns. Every
 * operand is converted to size_t before the multiply, so the offset is not
 * computed in int arithmetic. */
#define IDX(i, j, n) ((size_t)(i) * (size_t)(n) + (size_t)(j))

/* Compute row `i` of the n-by-n row-major product lhs * rhs into
 * out_row[0..n). out_row must not overlap lhs or rhs. */
static void kernel_product_row(int n, int i, double *out_row,
                               const double *lhs, const double *rhs) {
  for (int j = 0; j < n; j++) {
    double s = 0.0;
    for (int k = 0; k < n; k++) {
      s += lhs[IDX(i, k, n)] * rhs[IDX(k, j, n)];
    }
    out_row[j] = s;
  }
}

/*
 * Chain of three n-by-n row-major matrix products:
 *
 *   E = A * B
 *   F = C * D
 *   E = E * F        (final result is left in E; F still holds C * D)
 *
 * Parameters:
 *   n           matrix dimension; nothing is done when n <= 0.
 *   E, F        output buffers of n*n doubles, fully overwritten. They must
 *               not overlap each other or any of the inputs.
 *   A, B, C, D  input matrices of n*n doubles, read only.
 *
 * The last product is computed one row at a time through a scratch row of n
 * doubles. Storing E[i][j] directly would overwrite entries E[i][0..j) that
 * the remaining columns of the same row still have to read, which would make
 * the result differ from (A * B) * (C * D).
 *
 * The function returns void, so a failed scratch allocation is reported on
 * stderr and the process is aborted rather than leaving E half-updated.
 */
void kernel_run(int n, double *E, double *F, const double *A, const double *B,
                const double *C, const double *D) {
  if (n <= 0) {
    return;
  }

  /* E = A * B and F = C * D: outputs are distinct from inputs, so each row
   * can be written straight into its destination. */
  for (int i = 0; i < n; i++) {
    kernel_product_row(n, i, &E[IDX(i, 0, n)], A, B);
  }
  for (int i = 0; i < n; i++) {
    kernel_product_row(n, i, &F[IDX(i, 0, n)], C, D);
  }

  /* E = E * F: row i of the result depends only on row i of E and on F, so
   * one scratch row is enough to break the read/write overlap. */
  double *row = malloc((size_t)n * sizeof *row);
  if (row == NULL) {
    fprintf(stderr, "kernel_run: cannot allocate %d-element scratch row\n", n);
    abort();
  }
  for (int i = 0; i < n; i++) {
    kernel_product_row(n, i, row, E, F);
    memcpy(&E[IDX(i, 0, n)], row, (size_t)n * sizeof *row);
  }
  free(row);
}
/*
 * Benchmark entry point, produced by invoking BENCH_MAIN_ARRAY3_D. The macro
 * is defined outside this file (presumably in bench_harness.h), so the role
 * of each argument below is inferred from its contents -- TODO confirm
 * against the macro definition.
 *
 *   T001_Loops_017, 12, 32, 48, 64
 *       Benchmark identifier followed by four integer constants
 *       (presumably the problem sizes used for n).
 *   malloc(...) statements
 *       Allocate six buffers of n*n doubles: A, B, C, D, E and F.
 *   bench_fill_array(...) statements
 *       Fill A, B, C and D using bench_seed(101) .. bench_seed(104).
 *       E and F are not filled; kernel_run writes every element of both
 *       before reading them.
 *   kernel_run(n, E, F, A, B, C, D)
 *       The kernel call.
 *   E, (size_t)n * n
 *       Buffer and element count handed to the harness (presumably the
 *       result array it inspects after the run).
 *   free(...) statements
 *       Release all six buffers.
 *
 * NOTE(review): the malloc results are not checked in this invocation;
 * verify whether the harness handles allocation failure.
 */
BENCH_MAIN_ARRAY3_D(T001_Loops_017, 12, 32, 48, 64,
                    double *A = malloc((size_t)n * n * sizeof(double));
                    double *B = malloc((size_t)n * n * sizeof(double));
                    double *C = malloc((size_t)n * n * sizeof(double));
                    double *D = malloc((size_t)n * n * sizeof(double));
                    double *E = malloc((size_t)n * n * sizeof(double));
                    double *F = malloc((size_t)n * n * sizeof(double));
                    , bench_fill_array(A, (size_t)n *n, bench_seed(101));
                    bench_fill_array(B, (size_t)n *n, bench_seed(102));
                    bench_fill_array(C, (size_t)n *n, bench_seed(103));
                    bench_fill_array(D, (size_t)n *n, bench_seed(104));
                    , kernel_run(n, E, F, A, B, C, D);
                    , E, (size_t)n *n, free(A); free(B); free(C); free(D);
                    free(E); free(F);)
