#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Flat offset of element (i, j) in a row-major matrix with n columns. */
#define IDX(i, j, n) ((size_t)(i) * (size_t)(n) + (size_t)(j))

/*
 * Multiplies, in place, every length-n fibre A[i][j][0..n-1] of the n*n*n
 * array A by the n*n row-major matrix C4:
 *
 *     A[i][j][k] <- sum over r of A[i][j][r] * C4[r][k]
 *
 * n     extent of every dimension; nothing is done when n <= 0.
 * A     n*n*n doubles, row-major, updated in place.
 * C4    n*n doubles, row-major, read only.
 * sumv  n*n doubles of scratch space; row j holds the products for the fibre
 *       currently being processed and is left holding the values computed for
 *       the last i.
 */
void kernel_run(int n, double *A, const double *C4, double *sumv) {
  for (int plane = 0; plane < n; plane++) {
    for (int row = 0; row < n; row++) {
      /* First element of the fibre A[plane][row][*]. */
      double *fibre = A + ((size_t)plane * n + row) * n;
      /* Scratch row associated with this `row` index. */
      double *scratch = sumv + IDX(row, 0, n);

      for (int col = 0; col < n; col++) {
        double acc = 0.0;
        int r = 0;
        while (r < n) {
          acc += fibre[r] * C4[IDX(r, col, n)];
          r++;
        }
        scratch[col] = acc;
      }

      /* Copy back only after every product for this fibre has been formed,
         because the products above all read the old fibre values. */
      for (int col = 0; col < n; col++) {
        fibre[col] = scratch[col];
      }
    }
  }
}
/*
 * Benchmark driver for kernel_run, produced by the BENCH_MAIN_ARRAY3_D macro.
 * The macro is not defined in this file (presumably it comes from
 * bench_harness.h -- confirm), so the role of each argument below is read off
 * the argument text itself:
 *
 *   - T001_Loops_021: name token passed to the macro for this benchmark.
 *   - 10, 16, 24, 32: numeric parameters interpreted by the macro.
 *     NOTE(review): likely a repeat count and/or the problem sizes used for
 *     n -- confirm against the macro definition.
 *   - Allocation: A gets n*n*n doubles; C4 and sumv get n*n doubles each.
 *     The malloc results are not checked here.
 *   - Initialisation: A and C4 are filled by bench_fill_array using
 *     bench_seed(18) and bench_seed(19). sumv is left unfilled; kernel_run
 *     stores to each sumv entry before it loads from it.
 *   - Kernel: a single call kernel_run(n, A, C4, sumv).
 *   - A, (size_t)n * n * n: an array and its element count handed to the
 *     macro (presumably the output to checksum or validate -- confirm).
 *   - Cleanup: frees A, C4 and sumv.
 */
BENCH_MAIN_ARRAY3_D(T001_Loops_021, 10, 16, 24, 32,
                    double *A = malloc((size_t)n * n * n * sizeof(double));
                    double *C4 = malloc((size_t)n * n * sizeof(double));
                    double *sumv = malloc((size_t)n * n * sizeof(double));
                    , bench_fill_array(A, (size_t)n * n * n, bench_seed(18));
                    bench_fill_array(C4, (size_t)n *n, bench_seed(19));
                    , kernel_run(n, A, C4, sumv);
                    , A, (size_t)n * n * n, free(A); free(C4); free(sumv);)
