
#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * One sweep of a coupled three-field stencil update.
 *
 * Each of u1, u2 and u3 is addressed as [plane][row][column] with N rows and
 * 3 columns per plane. Plane 0 is read and plane 1 is written, for rows
 * 1 .. N-2 and columns 1 .. 2. For every updated cell:
 *   - du1/du2/du3[row] receive the centred row difference
 *     (row + 1 minus row - 1) of the matching field in the current column;
 *   - the new value is the old value, plus a fixed linear combination of the
 *     three differences, plus sig times the second difference taken along
 *     the column axis.
 *
 * Parameters:
 *   N            number of rows per plane; nothing is done when N < 3.
 *   u1, u2, u3   field buffers; planes 0 and 1 must both be addressable and
 *                each buffer must be contiguous (2 * N * 3 doubles).
 *   du1..du3     scratch outputs; indices 1 .. N-2 are written. Because the
 *                column loop overwrites them, on return they hold the
 *                differences computed for column 2.
 *
 * Column-axis neighbour at column 2: the second difference reads
 * column + 1, which for column 2 is index 3 -- one past the declared row
 * width of 3. Subscripting the 3-element row with 3 is undefined behaviour,
 * and an optimiser may use that to assume the column-2 pass never runs.
 * That neighbour is therefore fetched through a flat view of the buffer,
 * where the same offset is simply column 0 of row + 1 in plane 0. Since
 * row <= N-2, the offset always stays inside plane 0.
 */
void kernel_run(int N, double (*u1)[N][3], double (*u2)[N][3],
                double (*u3)[N][3], double *du1, double *du2, double *du3) {
  const double a11 = 0.1, a12 = 0.2, a13 = 0.3;
  const double a21 = 0.4, a22 = 0.5, a23 = 0.6;
  const double a31 = 0.7, a32 = 0.8, a33 = 0.9;
  const double sig = 0.01;

  const int nl1 = 0; /* plane that is read */
  const int nl2 = 1; /* plane that is written */

  /* Flat element views, used only for the column + 1 neighbour. */
  const double *flat1 = (const double *)u1;
  const double *flat2 = (const double *)u2;
  const double *flat3 = (const double *)u3;

  for (int kx = 1; kx < 3; kx++) {
    for (int ky = 1; ky < N - 1; ky++) {
      /* Flat offset of element [nl1][ky][kx + 1]. */
      const size_t next =
          ((size_t)nl1 * (size_t)N + (size_t)ky) * 3 + (size_t)kx + 1;

      du1[ky] = u1[nl1][ky + 1][kx] - u1[nl1][ky - 1][kx];
      du2[ky] = u2[nl1][ky + 1][kx] - u2[nl1][ky - 1][kx];
      du3[ky] = u3[nl1][ky + 1][kx] - u3[nl1][ky - 1][kx];

      /* Operand order is kept as-is so rounding matches the original. */
      u1[nl2][ky][kx] = u1[nl1][ky][kx] + a11 * du1[ky] + a12 * du2[ky] +
                        a13 * du3[ky] +
                        sig * (flat1[next] - 2.0 * u1[nl1][ky][kx] +
                               u1[nl1][ky][kx - 1]);
      u2[nl2][ky][kx] = u2[nl1][ky][kx] + a21 * du1[ky] + a22 * du2[ky] +
                        a23 * du3[ky] +
                        sig * (flat2[next] - 2.0 * u2[nl1][ky][kx] +
                               u2[nl1][ky][kx - 1]);
      u3[nl2][ky][kx] = u3[nl1][ky][kx] + a31 * du1[ky] + a32 * du2[ky] +
                        a33 * du3[ky] +
                        sig * (flat3[next] - 2.0 * u3[nl1][ky][kx] +
                               u3[nl1][ky][kx - 1]);
    }
  }
}
/*
 * Benchmark driver for kernel_run, generated by BENCH_MAIN_ARRAY3_D.
 *
 * The macro is defined outside this file (presumably in bench_harness.h), so
 * the meaning of each argument slot below is inferred from the tokens passed
 * here -- TODO confirm against the macro definition:
 *   - T001_Loops_007, 08        identifier-like tag and a numeric token;
 *   - 128, 256, 512             three integer constants (presumably the
 *                               problem sizes that end up in `n`);
 *   - declarations              N = n; u1/u2/u3 each malloc'ed as
 *                               2 * N * 3 doubles; du1/du2/du3 each malloc'ed
 *                               as N doubles;
 *   - initialisation            u1/u2/u3 filled via bench_fill_array with
 *                               seeds bench_seed(9) ^ 0xabc1 / 0xabc2 / 0xabc3;
 *   - kernel call               kernel_run(n, u1, u2, u3, du1, du2, du3);
 *   - (const double *)u1 and
 *     2 * n * 3                 pointer/element-count pair covering all of u1;
 *   - cleanup                   free() of all six buffers.
 *
 * `n` is not declared here; it is presumably supplied by the macro expansion.
 * du1/du2/du3 are not filled before the call; kernel_run writes du*[ky]
 * before reading it for every row it visits.
 * NOTE(review): none of the malloc results are checked before use.
 * The commas and semicolons separate macro arguments and statements, so
 * their exact placement must be preserved.
 */
BENCH_MAIN_ARRAY3_D(
    T001_Loops_007, 08, 128, 256, 512, int N = n;
    double (*u1)[N][3] = malloc(2 * (size_t)N * 3 * sizeof(double));
    double (*u2)[N][3] = malloc(2 * (size_t)N * 3 * sizeof(double));
    double (*u3)[N][3] = malloc(2 * (size_t)N * 3 * sizeof(double));
    double *du1 = malloc((size_t)N * sizeof(double));
    double *du2 = malloc((size_t)N * sizeof(double));
    double *du3 = malloc((size_t)N * sizeof(double)),
    bench_fill_array((double *)u1, 2 * (size_t)N * 3, bench_seed(9) ^ 0xabc1);
    bench_fill_array((double *)u2, 2 * (size_t)N * 3, bench_seed(9) ^ 0xabc2);
    bench_fill_array((double *)u3, 2 * (size_t)N * 3, bench_seed(9) ^ 0xabc3),
    kernel_run(n, u1, u2, u3, du1, du2, du3), (const double *)u1,
    2 * (size_t)n * 3, free(du1);
    free(du2); free(du3); free(u1); free(u2); free(u3))
