#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Flat offset of element (i, j) in a row-major array holding n entries per
 * row. Every operand is cast to size_t before the multiply, so the product is
 * evaluated in size_t rather than in the (possibly narrower) argument type. */
#define IDX(i, j, n) ((size_t)(i) * (size_t)(n) + (size_t)(j))
static inline void deriche_line(int n, const double *in, double *out, double a1,
                                double a2, double a3, double a4, double b1,
                                double b2) {
  double ym1 = 0.0, ym2 = 0.0, xm1 = 0.0;
  for (int i = 0; i < n; i++) {
    double x = in[i];
    double y = a1 * x + a2 * xm1 + b1 * ym1 + b2 * ym2;
    out[i] = y;
    xm1 = x;
    ym2 = ym1;
    ym1 = y;
  }
  double yp1 = 0.0, yp2 = 0.0, xp1 = 0.0;
  for (int i = n - 1; i >= 0; i--) {
    double x = in[i];
    double y = a3 * xp1 + a4 * x + b1 * yp1 + b2 * yp2;
    out[i] += y;
    xp1 = x;
    yp2 = yp1;
    yp1 = y;
  }
}
/*
 * Filters every row of an n x n row-major image with deriche_line and leaves
 * the filtered image in both out and img.
 *
 *   n    number of rows and of samples per row
 *   img  input image of n * n doubles; overwritten with the filtered result
 *   out  scratch/output buffer of n * n doubles
 *
 * The filter coefficients are derived from a fixed alpha of 0.25 using the
 * formulas below.
 */
void kernel_run(int n, double *img, double *out) {
  const double alpha = 0.25;
  const double ea = exp(-alpha);
  const double k =
      (1.0 - ea) * (1.0 - ea) / (1.0 + 2.0 * alpha * ea - ea * ea);
  const double a1 = k;
  const double a2 = k * ea * (alpha - 1.0);
  const double a3 = k * ea * (alpha + 1.0);
  const double a4 = -k * ea * ea;
  const double b1 = 2.0 * ea;
  const double b2 = -ea * ea;

  /* Each row is filtered independently from img into the same row of out. */
  for (int r = 0; r < n; r++) {
    deriche_line(n, &img[IDX(r, 0, n)], &out[IDX(r, 0, n)], a1, a2, a3, a4, b1,
                 b2);
  }

  /* Copy the result back into img. The element count is formed in size_t:
   * an int product n * n overflows (undefined behaviour) for large n, whereas
   * the row offsets above are already computed in size_t via IDX. */
  const size_t total = (size_t)n * (size_t)n;
  for (size_t i = 0; i < total; i++) {
    img[i] = out[i];
  }
}
/*
 * Benchmark driver, produced by expanding BENCH_MAIN_ARRAY3_D (the macro is
 * not defined in this file; presumably it comes from bench_harness.h).
 *
 * The code fragments passed below, in order:
 *   - allocate img and out, each n * n doubles;
 *   - fill img with bench_fill_array(..., bench_seed(13)) and zero out;
 *   - call kernel_run(n, img, out);
 *   - name img and its element count (size_t)n * n;
 *   - free both buffers.
 *
 * NOTE(review): the leading arguments (T001_Loops_020, 07, 128, 256, 512) look
 * like a benchmark name/id followed by three problem sizes, and the
 * `img, (size_t)n *n` pair looks like the array the harness inspects after the
 * run -- confirm against the macro definition.
 * NOTE(review): the malloc results are not checked in these fragments; verify
 * whether the harness handles allocation failure.
 */
BENCH_MAIN_ARRAY3_D(T001_Loops_020, 07, 128, 256, 512,
                    double *img = malloc((size_t)n * n * sizeof(double));
                    double *out = malloc((size_t)n * n * sizeof(double));
                    , bench_fill_array(img, (size_t)n *n, bench_seed(13));
                    memset(out, 0, (size_t)n * n * sizeof(double));
                    , kernel_run(n, img, out);, img, (size_t)n *n, free(img);
                    free(out);)
