#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
/*
 * Apply `iters` Jacobi sweeps of the 5-point stencil to an N x N grid stored
 * row-major in `u`.
 *
 *   N     - grid side length; every array holds N*N doubles.
 *   u     - in/out: current iterate on entry, final iterate on return.
 *   tmp   - scratch buffer of N*N doubles; its contents are overwritten.
 *   f     - right-hand side; only interior entries are read.
 *   iters - number of sweeps to perform.
 *
 * Each sweep first copies the whole source grid into the destination, so the
 * outermost rows and columns are carried over unchanged, and then overwrites
 * every interior cell with 0.25 * (sum of the four neighbours - f).
 */
static inline void smooth_jacobi(int N, double *u, double *tmp, const double *f,
                                 int iters) {
  double *src = u;
  double *dst = tmp;
  int sweep = 0;

  while (sweep < iters) {
    const int cells = N * N;

    /* Start from a full copy so untouched (boundary) cells keep their value. */
    for (int k = 0; k < cells; ++k) {
      dst[k] = src[k];
    }

    for (int r = 1; r < N - 1; ++r) {
      const double *above = src + (r - 1) * N;
      const double *here = above + N;
      const double *below = here + N;
      const double *rhs = f + r * N;
      double *out = dst + r * N;

      for (int c = 1; c < N - 1; ++c) {
        out[c] = 0.25 * (above[c] + below[c] + here[c - 1] + here[c + 1] -
                         rhs[c]);
      }
    }

    /* Ping-pong: the buffer just written becomes the next sweep's input. */
    double *written = dst;
    dst = src;
    src = written;
    ++sweep;
  }

  /* After an odd number of sweeps the newest iterate lives in tmp. */
  if (src != u) {
    memcpy(u, src, (size_t)N * (size_t)N * sizeof(double));
  }
}
/*
 * Compute the residual res = f - A*u on an N x N row-major grid, where A is
 * the unscaled 5-point operator (-4 at the centre, +1 at each of the four
 * neighbours).
 *
 *   N   - grid side length; every array holds N*N doubles.
 *   u   - current iterate (read only).
 *   f   - right-hand side (read only).
 *   res - output; all N*N entries are written. Cells in the outermost rows
 *         and columns are set to 0.0.
 */
static inline void compute_residual(int N, const double *u, const double *f,
                                    double *res) {
  const int cells = N * N;
  for (int k = 0; k < cells; ++k) {
    res[k] = 0.0;
  }

  for (int r = 1; r < N - 1; ++r) {
    const double *above = u + (r - 1) * N;
    const double *here = above + N;
    const double *below = here + N;
    const double *rhs = f + r * N;
    double *out = res + r * N;

    for (int c = 1; c < N - 1; ++c) {
      double Au = (-4.0 * here[c] + above[c] + below[c] + here[c - 1] +
                   here[c + 1]);
      out[c] = rhs[c] - Au;
    }
  }
}
/*
 * Restrict a fine-grid field to a coarse grid by averaging 2x2 fine cells.
 *
 *   Nf - fine grid side length (row stride of rf).
 *   Nc - coarse grid side length (row stride of rc).
 *   rf - fine-grid input (read only).
 *   rc - coarse-grid output; all Nc*Nc entries are written.
 *
 * Coarse interior cell (i, j) receives the mean of the fine cells
 * (2i, 2j), (2i+1, 2j), (2i, 2j+1) and (2i+1, 2j+1). Cells in the outermost
 * coarse rows and columns are set to 0.0.
 */
static inline void restrict_fullweight(int Nf, int Nc, const double *rf,
                                       double *rc) {
  const int coarse_cells = Nc * Nc;
  for (int k = 0; k < coarse_cells; ++k) {
    rc[k] = 0.0;
  }

  for (int ci = 1; ci < Nc - 1; ++ci) {
    const double *top = rf + (2 * ci) * Nf; /* fine row 2*ci     */
    const double *bot = top + Nf;           /* fine row 2*ci + 1 */
    double *out = rc + ci * Nc;

    for (int cj = 1; cj < Nc - 1; ++cj) {
      const int fj = 2 * cj;
      out[cj] = 0.25 * (top[fj] + bot[fj] + top[fj + 1] + bot[fj + 1]);
    }
  }
}
/*
 * Add a coarse-grid field onto a fine grid by piecewise-constant injection.
 *
 *   Nf - fine grid side length (row stride of uf).
 *   Nc - coarse grid side length (row stride of uc).
 *   uc - coarse-grid values (read only).
 *   uf - fine-grid field, updated in place.
 *
 * The value of each interior coarse cell (i, j) is added to the four fine
 * cells (2i, 2j), (2i+1, 2j), (2i, 2j+1) and (2i+1, 2j+1). Fine cells not
 * covered by an interior coarse cell are left unchanged.
 */
static inline void prolong_inject(int Nf, int Nc, const double *uc,
                                  double *uf) {
  for (int ci = 1; ci < Nc - 1; ++ci) {
    const double *coarse_row = uc + ci * Nc;
    double *top = uf + (2 * ci) * Nf; /* fine row 2*ci     */
    double *bot = top + Nf;           /* fine row 2*ci + 1 */

    for (int cj = 1; cj < Nc - 1; ++cj) {
      const double corr = coarse_row[cj];
      const int fj = 2 * cj;

      top[fj] += corr;
      bot[fj] += corr;
      top[fj + 1] += corr;
      bot[fj + 1] += corr;
    }
  }
}
/*
 * Return the Euclidean (l2) norm of the residual of u with respect to f.
 *
 *   N    - grid side length; every array holds N*N doubles.
 *   u    - current iterate (read only).
 *   f    - right-hand side (read only).
 *   rbuf - scratch buffer of N*N doubles; on return it holds the residual
 *          written by compute_residual().
 *
 * The squares are accumulated in ascending index order over all N*N entries.
 */
static inline double residual_norm(int N, const double *u, const double *f,
                                   double *rbuf) {
  compute_residual(N, u, f, rbuf);

  const int cells = N * N;
  double sum_sq = 0.0;
  for (int k = 0; k < cells; ++k) {
    const double r = rbuf[k];
    sum_sq += r * r;
  }
  return sqrt(sum_sq);
}
/*
 * Run one V-cycle over three grid levels and return the l2 norm of the final
 * level-0 residual.
 *
 *   N0, N1, N2 - side lengths of the finest, middle and coarsest grids.
 *   uK         - solution / correction on level K (u0 is updated in place;
 *                u1 and u2 are reset to zero on entry).
 *   tmpK       - Jacobi scratch buffer for level K.
 *   fK         - right-hand side on level K (f0 is an input; f1 and f2 are
 *                overwritten with restricted residuals).
 *   rK         - residual buffer for level K (r2 is only zeroed).
 *
 * Sequence: pre-smooth on level 0, restrict its residual to level 1,
 * pre-smooth there, restrict again to level 2, smooth on level 2, then add
 * each coarse result onto the next finer level and post-smooth.
 */
static double run_vcycle(int N0, int N1, int N2, double *u0, double *tmp0,
                         double *f0, double *r0, double *u1, double *tmp1,
                         double *f1, double *r1, double *u2, double *tmp2,
                         double *f2, double *r2) {
  enum { EDGE_SWEEPS = 3, COARSEST_SWEEPS = 10 };

  const int mid_cells = N1 * N1;
  const int low_cells = N2 * N2;

  /* Coarse levels carry corrections, so every cycle starts them from zero. */
  for (int k = 0; k < mid_cells; ++k) {
    r1[k] = f1[k] = tmp1[k] = u1[k] = 0.0;
  }
  for (int k = 0; k < low_cells; ++k) {
    r2[k] = f2[k] = tmp2[k] = u2[k] = 0.0;
  }

  /* Downward leg: smooth, then hand the residual to the next coarser grid. */
  smooth_jacobi(N0, u0, tmp0, f0, EDGE_SWEEPS);
  compute_residual(N0, u0, f0, r0);
  restrict_fullweight(N0, N1, r0, f1);

  smooth_jacobi(N1, u1, tmp1, f1, EDGE_SWEEPS);
  compute_residual(N1, u1, f1, r1);
  restrict_fullweight(N1, N2, r1, f2);

  /* Coarsest level: extra sweeps stand in for a solve. */
  smooth_jacobi(N2, u2, tmp2, f2, COARSEST_SWEEPS);

  /* Upward leg: add the coarse correction, then post-smooth. */
  prolong_inject(N1, N2, u2, u1);
  smooth_jacobi(N1, u1, tmp1, f1, EDGE_SWEEPS);

  prolong_inject(N0, N1, u1, u0);
  smooth_jacobi(N0, u0, tmp0, f0, EDGE_SWEEPS);

  return residual_norm(N0, u0, f0, r0);
}
/*
 * Benchmark entry point, produced by expanding BENCH_MAIN_SCALAR3.
 *
 * NOTE(review): the macro is not defined in this file (presumably it comes
 * from bench_harness.h). The role attributed to each argument below is
 * inferred from the argument text only; confirm against the macro definition.
 * The identifiers `n` and `seed` are not declared here either and are
 * presumably provided by the macro expansion.
 *
 * Each code argument must stay free of top-level commas: the preprocessor
 * splits macro arguments on any comma that is not inside parentheses, and
 * braces do not protect commas. That is why every declaration below is its
 * own statement.
 */
BENCH_MAIN_SCALAR3(
    /* Identifier tokens followed by three sizes (64^2, 128^2, 256^2);
       presumably the problem sizes substituted for n -- TODO confirm. */
    T004_Module_043, MG, 4096, 16384, 65536,
    /* Declarations: n is treated as the total cell count of a square grid,
       so the side length is the rounded square root of n. */
    int Ndim = (int)(sqrt((double)n) + 0.5);
    /* Each coarser level halves the side length (integer division). */
    int N1 = Ndim / 2; int N2 = N1 / 2; int n1 = N1 * N1; int n2 = N2 * N2;
    /* Four buffers per level: solution, Jacobi scratch, right-hand side and
       residual. No NULL check on the malloc results is visible here. */
    double *u0 = (double *)malloc((size_t)n * sizeof(double));
    double *tmp0 = (double *)malloc((size_t)n * sizeof(double));
    double *f0 = (double *)malloc((size_t)n * sizeof(double));
    double *r0 = (double *)malloc((size_t)n * sizeof(double));
    double *u1 = (double *)malloc((size_t)n1 * sizeof(double));
    double *tmp1 = (double *)malloc((size_t)n1 * sizeof(double));
    double *f1 = (double *)malloc((size_t)n1 * sizeof(double));
    double *r1 = (double *)malloc((size_t)n1 * sizeof(double));
    double *u2 = (double *)malloc((size_t)n2 * sizeof(double));
    double *tmp2 = (double *)malloc((size_t)n2 * sizeof(double));
    double *f2 = (double *)malloc((size_t)n2 * sizeof(double));
    double *r2 = (double *)malloc((size_t)n2 * sizeof(double));
    double ans_scalar = 0.0;
    ,
    /* Input initialisation: the finest right-hand side f0 is filled from a
       generator seeded with `seed`; every other buffer is zeroed. */
    {
      bench_rng64_t rng = bench_rng_init(seed);
      for (int i = 0; i < n; i++) {
        u0[i] = 0.0;
        tmp0[i] = 0.0;
        f0[i] = bench_rng_double_signed(&rng);
        r0[i] = 0.0;
      }
      for (int i = 0; i < n1; i++) {
        u1[i] = 0.0;
        tmp1[i] = 0.0;
        f1[i] = 0.0;
        r1[i] = 0.0;
      }
      for (int i = 0; i < n2; i++) {
        u2[i] = 0.0;
        tmp2[i] = 0.0;
        f2[i] = 0.0;
        r2[i] = 0.0;
      }
    },
    /* Work expression: one V-cycle whose return value (the final residual
       norm on the finest grid) is stored in ans_scalar. Presumably this is
       the region the harness times -- TODO confirm. */
    ans_scalar = run_vcycle(Ndim, N1, N2, u0, tmp0, f0, r0, u1, tmp1, f1, r1,
                            u2, tmp2, f2, r2),
    /* Result expression, then the cleanup statements releasing all twelve
       buffers allocated above. */
    ans_scalar, free(u0);
    free(tmp0); free(f0); free(r0); free(u1); free(tmp1); free(f1); free(r1);
    free(u2); free(tmp2); free(f2); free(r2);)
