#include "bench_harness.h"
#include "bench_utils.h"
#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
/* Filter radius in pixels: kernel offsets run from -BRAD to +BRAD on each
 * axis, so the filter window is (2*BRAD+1) x (2*BRAD+1) samples. */
#define BRAD 2
/*
 * Restricts v to the closed interval [lo, hi].
 *
 * The lower bound is tested first, so when lo > hi any v below lo yields lo
 * and any other v above hi yields hi.
 */
static inline int clampi_int6(int v, int lo, int hi) {
  return (v < lo) ? lo : ((v > hi) ? hi : v);
}
/*
 * Fills sk with the spatial (distance-based) Gaussian weights of the filter
 * window.
 *
 * sk must have room for (2*BRAD+1)*(2*BRAD+1) doubles. Entries are written in
 * row-major order (dy outer, dx inner), each equal to
 * exp(-(dx^2 + dy^2) / (2 * sigma_s^2)) with sigma_s = 1.5.
 */
static void init_spatial_kernel(double *sk) {
  const double sigma_s = 1.5;
  const double two_sigma_sq = 2.0 * sigma_s * sigma_s;
  double *dst = sk;
  for (int oy = -BRAD; oy <= BRAD; ++oy) {
    const int oy_sq = oy * oy;
    for (int ox = -BRAD; ox <= BRAD; ++ox) {
      const double dist_sq = (double)(ox * ox + oy_sq);
      *dst = exp(-dist_sq / two_sigma_sq);
      ++dst;
    }
  }
}
static void init_range_lut(double *rlut) {
  double sigma_r = 25.5;
  for (int d = 0; d < 256; d++) {
    double v = (double)d;
    rlut[d] = exp(-(v * v) / (2.0 * sigma_r * sigma_r));
  }
}
/*
 * Applies one bilateral-filter pass to an 8-bit, row-major w x h image.
 *
 * img  - input pixels, w*h bytes.
 * sk   - spatial weights, (2*BRAD+1)*(2*BRAD+1) entries in row-major order
 *        (dy outer, dx inner).
 * rlut - range weights indexed by absolute intensity difference (0..255).
 * out  - destination, w*h bytes; every pixel is written.
 *
 * Each neighbour's weight is its spatial weight times the range weight of its
 * absolute difference from the centre pixel. Neighbours that fall outside the
 * image are replaced by the nearest edge pixel (coordinates are clamped). The
 * normalised weighted mean is rounded with lrint() and clamped to 0..255.
 */
static void bilateral_once(const uint8_t *img, int w, int h, const double *sk,
                           const double *rlut, uint8_t *out) {
  const int last_col = w - 1;
  const int last_row = h - 1;

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      const int here = row * w + col;
      const int ref = img[here];
      double weighted_sum = 0.0;
      double weight_total = 0.0;
      const double *tap = sk; /* walks the kernel in row-major order */

      for (int oy = -BRAD; oy <= BRAD; ++oy) {
        /* The clamped source row is the same for every tap in this row. */
        const int src_row = clampi_int6(row + oy, 0, last_row);
        for (int ox = -BRAD; ox <= BRAD; ++ox) {
          const int src_col = clampi_int6(col + ox, 0, last_col);
          const int sample = img[src_row * w + src_col];
          const int delta = (ref >= sample) ? (ref - sample) : (sample - ref);
          const double weight = *tap * rlut[delta];
          ++tap;
          weighted_sum += (double)sample * weight;
          weight_total += weight;
        }
      }

      /* Fall back to 0 if no weight was accumulated, avoiding a 0/0. */
      int result = 0;
      if (weight_total > 0.0)
        result = (int)lrint(weighted_sum / weight_total);
      out[here] = (uint8_t)clampi_int6(result, 0, 255);
    }
  }
}
/*
 * Runs one bilateral-filter pass over a w x h 8-bit image and returns the sum
 * of all filtered pixel values as a checksum.
 *
 * w, h - image dimensions in pixels.
 * img  - input pixels, row-major, w*h bytes; not modified.
 *
 * Returns:
 *   - the checksum (sum of output pixels) on success;
 *   - 0.0 when either dimension is non-positive (nothing to filter);
 *   - NAN when img is NULL, when w*h does not fit in an int (the filter
 *     indexes pixels with int arithmetic), or when the scratch buffer cannot
 *     be allocated.
 *
 * The scratch output buffer is allocated and released inside this function.
 */
static double pipeline_run(int w, int h, const uint8_t *img) {
  /* An empty image has an empty checksum; this also rejects negative sizes,
   * whose product could otherwise be positive and size a bogus buffer. */
  if (w <= 0 || h <= 0)
    return 0.0;
  if (img == NULL)
    return NAN;
  /* Guard the int multiplication below against signed overflow (UB). */
  if (w > INT_MAX / h)
    return NAN;
  const int n = w * h;

  double spatial[(2 * BRAD + 1) * (2 * BRAD + 1)];
  double rangeLUT[256];
  init_spatial_kernel(spatial);
  init_range_lut(rangeLUT);

  uint8_t *out = malloc((size_t)n * sizeof *out);
  if (out == NULL)
    return NAN; /* report failure instead of writing through NULL */

  bilateral_once(img, w, h, spatial, rangeLUT, out);

  double acc = 0.0;
  for (int i = 0; i < n; i++)
    acc += (double)out[i];

  free(out);
  return acc;
}
/*
 * Benchmark entry point, produced by the BENCH_MAIN_SCALAR3 macro. The macro
 * is not defined in this file (presumably it comes from bench_harness.h --
 * confirm there for the exact meaning of each argument). As written here the
 * arguments are:
 *   - the identifiers T004_Module_008 and IMG08, followed by three sizes
 *     4096, 16384 and 65536;
 *   - a declaration fragment: w is sqrt(n) rounded to nearest, h equals w,
 *     img is an n-byte heap buffer, and ans_scalar holds the result. `n` is
 *     not declared in this file; presumably the macro supplies it;
 *   - an initialisation block that fills img with the low 8 bits of
 *     successive bench_rng_next() values from a generator seeded with `seed`
 *     (also not declared in this file);
 *   - the expression ans_scalar = pipeline_run(w, h, img), the result
 *     expression ans_scalar, and the cleanup statement free(img).
 *
 * The three listed sizes are perfect squares (64^2, 128^2, 256^2), so if n
 * takes exactly those values then w * h == n and the filter covers the whole
 * buffer.
 *
 * NOTE(review): the malloc() result for img is not checked before the
 * initialisation block writes to it.
 */
BENCH_MAIN_SCALAR3(
    T004_Module_008, IMG08, 4096, 16384, 65536,
    int w = (int)(sqrt((double)n) + 0.5);
    int h = w; uint8_t *img = (uint8_t *)malloc((size_t)(n) * sizeof(uint8_t));
    double ans_scalar = 0.0;
    ,
    {
      bench_rng64_t rng = bench_rng_init(seed);
      for (int i = 0; i < n; i++)
        img[i] = (uint8_t)(bench_rng_next(&rng) & 255ULL);
    },
    ans_scalar = pipeline_run(w, h, img), ans_scalar, free(img);)
