#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#define TILE_W 32
#define TILE_H 32
#define CLIP_LIMIT 40
/*
 * Build the 256-bin intensity histogram of one TILE_W x TILE_H tile.
 *
 *   img    row-major 8-bit image; consecutive rows are w bytes apart
 *   w      image width in pixels (used as the row stride)
 *   h      image height; not read by this function
 *   tx,ty  tile column / row index; the tile's top-left pixel is
 *          (tx * TILE_W, ty * TILE_H)
 *   hist   output array of 256 counters, fully overwritten
 *
 * No bounds checking is done: the caller must make sure the whole tile
 * lies inside the image.
 */
static void tile_hist(const uint8_t *img, int w, int h, int tx, int ty,
                      int *hist) {
  memset(hist, 0, 256 * sizeof *hist);

  const int left = tx * TILE_W;
  const int top = ty * TILE_H;

  for (int row = 0; row < TILE_H; row++) {
    /* First pixel of this tile row inside the full image. */
    const uint8_t *line = img + ((top + row) * w + left);
    for (int col = 0; col < TILE_W; col++) {
      hist[line[col]] += 1;
    }
  }
}
/*
 * Convert one tile histogram into a contrast-limited equalisation LUT.
 *
 *   hist_in    256 bin counts for the tile (not modified)
 *   tile_area  number of pixels the histogram was built from; used as the
 *              CDF normaliser
 *   lut_out    receives 256 output levels in [0, 255]
 *
 * Steps:
 *   1. Every bin is capped at CLIP_LIMIT; the amount cut off is summed
 *      into `excess`.
 *   2. The excess is spread back over all 256 bins: each bin gets
 *      excess / 256, and the first excess % 256 bins get one extra count.
 *   3. The running sum (CDF) of the adjusted bins is scaled to 0..255.
 *
 * The CDF is accumulated in 64 bits because cdf * 255 does not fit in an
 * int once the cumulative count exceeds roughly 8.4 million.
 * A non-positive tile_area cannot be used as a divisor; in that case an
 * identity mapping (lut_out[i] == i) is written instead.
 */
static void clip_redistribute(const int *hist_in, int tile_area,
                              uint8_t *lut_out) {
  enum { BINS = 256 };

  if (tile_area <= 0) {
    for (int i = 0; i < BINS; i++) {
      lut_out[i] = (uint8_t)i;
    }
    return;
  }

  int clipped[BINS];
  int excess = 0;
  for (int i = 0; i < BINS; i++) {
    int count = hist_in[i];
    if (count > CLIP_LIMIT) {
      excess += count - CLIP_LIMIT;
      count = CLIP_LIMIT;
    }
    clipped[i] = count;
  }

  const int base = excess / BINS;
  const int rem = excess % BINS;

  int64_t cdf = 0;
  for (int i = 0; i < BINS; i++) {
    /* Redistribution is folded into the CDF pass: uniform share plus one
     * extra count for the first `rem` bins. */
    cdf += (int64_t)clipped[i] + base + (i < rem ? 1 : 0);

    int64_t level = (cdf * 255) / tile_area;
    if (level < 0) {
      level = 0;
    }
    if (level > 255) {
      level = 255;
    }
    lut_out[i] = (uint8_t)level;
  }
}
/*
 * Map every pixel through a bilinear blend of four neighbouring tile LUTs
 * and return the sum of the blended values.
 *
 *   img      row-major 8-bit image, w bytes per row, h rows
 *   allLUT   tilesX * tilesY tables of 256 bytes each, stored row-major by
 *            tile: the table for tile (gx, gy) starts at
 *            (gy * tilesX + gx) * 256
 *   tilesX   number of tile columns in allLUT
 *   tilesY   number of tile rows in allLUT
 *
 * For a pixel (x, y) the home tile is (x / TILE_W, y / TILE_H), clamped to
 * the last tile; the neighbour is the next tile to the right / below, or
 * the home tile itself at the grid edge. The blend weights fx, fy are the
 * pixel's offset from the home tile's top-left corner divided by the tile
 * size.
 *
 * Returns 0.0 when the image or the tile grid is empty. Without that guard
 * a zero-sized grid would clamp the tile index to -1 and read in front of
 * allLUT.
 */
static double interp_tiles(const uint8_t *img, int w, int h,
                           const uint8_t *allLUT, int tilesX, int tilesY) {
  if (w <= 0 || h <= 0 || tilesX <= 0 || tilesY <= 0) {
    return 0.0;
  }

  /* Bytes occupied by one row of tile LUTs. */
  const size_t lut_row_stride = (size_t)tilesX * 256;
  double acc = 0.0;

  for (int y = 0; y < h; y++) {
    int gy = y / TILE_H;
    if (gy >= tilesY) {
      gy = tilesY - 1;
    }
    const int gy1 = (gy + 1 < tilesY) ? gy + 1 : gy;
    const double fy = (double)(y - gy * TILE_H) / (double)TILE_H;

    /* Row-invariant bases, hoisted out of the pixel loop. */
    const uint8_t *row = img + (size_t)y * (size_t)w;
    const uint8_t *luts_top = allLUT + (size_t)gy * lut_row_stride;
    const uint8_t *luts_bot = allLUT + (size_t)gy1 * lut_row_stride;

    for (int x = 0; x < w; x++) {
      int gx = x / TILE_W;
      if (gx >= tilesX) {
        gx = tilesX - 1;
      }
      const int gx1 = (gx + 1 < tilesX) ? gx + 1 : gx;
      const double fx = (double)(x - gx * TILE_W) / (double)TILE_W;

      const size_t at_left = (size_t)gx * 256 + row[x];
      const size_t at_right = (size_t)gx1 * 256 + row[x];

      const double a00 = (double)luts_top[at_left];
      const double a10 = (double)luts_top[at_right];
      const double a01 = (double)luts_bot[at_left];
      const double a11 = (double)luts_bot[at_right];

      /* Blend horizontally on both tile rows, then vertically. */
      const double v0 = a00 + fx * (a10 - a00);
      const double v1 = a01 + fx * (a11 - a01);
      acc += v0 + fy * (v1 - v0);
    }
  }
  return acc;
}
/*
 * Run the full tile pipeline on a w x h 8-bit image and return the sum
 * produced by interp_tiles().
 *
 *   1. The image is divided into w / TILE_W by h / TILE_H whole tiles;
 *      pixels in a partial tile at the right / bottom edge do not
 *      contribute to any histogram.
 *   2. For each tile, tile_hist() builds its histogram and
 *      clip_redistribute() writes the tile's 256-entry LUT straight into
 *      its slot of the LUT table.
 *   3. interp_tiles() blends the LUTs over every pixel of the image.
 *
 * Returns 0.0 without touching the image when it is smaller than one tile
 * in either dimension, or when the LUT table cannot be allocated.
 */
static double pipeline_run(int w, int h, const uint8_t *img) {
  const int tilesX = w / TILE_W;
  const int tilesY = h / TILE_H;
  if (tilesX <= 0 || tilesY <= 0) {
    /* No complete tile: there is no LUT to build or to interpolate. */
    return 0.0;
  }

  const int tile_area = TILE_W * TILE_H;

  /* Size the table in size_t so the product cannot overflow int. */
  const size_t tile_count = (size_t)tilesX * (size_t)tilesY;
  if (tile_count > SIZE_MAX / 256) {
    return 0.0;
  }
  uint8_t *allLUT = malloc(tile_count * 256);
  if (allLUT == NULL) {
    return 0.0;
  }

  int hist[256];
  for (int ty = 0; ty < tilesY; ty++) {
    for (int tx = 0; tx < tilesX; tx++) {
      uint8_t *slot = allLUT + ((size_t)ty * (size_t)tilesX + (size_t)tx) * 256;
      tile_hist(img, w, h, tx, ty, hist);
      clip_redistribute(hist, tile_area, slot);
    }
  }

  const double result = interp_tiles(img, w, h, allLUT, tilesX, tilesY);
  free(allLUT);
  return result;
}
/*
 * Benchmark entry, generated by the BENCH_MAIN_SCALAR3 macro. The macro is
 * not defined in this file (presumably it comes from bench_harness.h --
 * TODO confirm), so the role of each argument is inferred from how it is
 * written here:
 *
 *   - T004_Module_003, IMG03: identifiers handed to the macro.
 *   - 4096, 16384, 65536: three numeric arguments; presumably the problem
 *     sizes that `n` takes -- confirm against the macro definition.
 *   - Declarations: w is sqrt(n) rounded to nearest, h equals w, img is an
 *     n-byte buffer, ans_scalar starts at 0.0.
 *   - Braced block: fills img with the low 8 bits of successive
 *     bench_rng_next() values from a generator created by
 *     bench_rng_init(seed).
 *   - ans_scalar = pipeline_run(w, h, img): the call being benchmarked.
 *   - ans_scalar: the result expression passed back to the macro.
 *   - free(img): releases the image buffer.
 *
 * `n` and `seed` are not declared here; they are expected to be provided
 * by the macro expansion.
 *
 * NOTE(review): pipeline_run reads w * h bytes while img holds n bytes.
 * That is safe when n is a perfect square (4096, 16384 and 65536 all are);
 * for other n the rounded w could make w * h exceed n.
 * NOTE(review): the malloc result is not checked before img is written.
 */
BENCH_MAIN_SCALAR3(
    T004_Module_003, IMG03, 4096, 16384, 65536,
    int w = (int)(sqrt((double)n) + 0.5);
    int h = w; uint8_t *img = (uint8_t *)malloc((size_t)(n) * sizeof(uint8_t));
    double ans_scalar = 0.0;
    ,
    {
      bench_rng64_t rng = bench_rng_init(seed);
      for (int i = 0; i < n; i++)
        img[i] = (uint8_t)(bench_rng_next(&rng) & 255ULL);
    },
    ans_scalar = pipeline_run(w, h, img), ans_scalar, free(img);)
