#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
/*
 * Restrict v to the closed interval [lo, hi].
 *
 * The lower bound is tested first, so when lo > hi any v below lo yields lo.
 */
static inline int clampi_int3(int v, int lo, int hi) {
  return (v < lo) ? lo : ((v > hi) ? hi : v);
}
/*
 * Grayscale dilation with a square (2r+1) x (2r+1) window.
 *
 * For every pixel of the w x h row-major image `src`, writes to `dst` the
 * maximum value found in the window centred on that pixel. Window taps that
 * fall outside the image replicate the nearest edge pixel.
 *
 * src  input image, w * h bytes, not modified
 * dst  output image, w * h bytes, must not overlap src
 * w,h  image width and height in pixels; nothing is written if either is <= 0
 * r    window radius; r == 0 copies the image, r < 0 yields an all-zero output
 */
static void dilate_r(const uint8_t *src, uint8_t *dst, int w, int h, int r) {
  for (int y = 0; y < h; y++) {
    /*
     * Edge replication can only re-read pixels that already lie inside the
     * window clipped to the image, so taking the maximum over the clipped
     * window gives the same result without clamping each tap. The row
     * bounds depend only on y and are computed once per output row.
     */
    const int y_lo = (y - r > 0) ? y - r : 0;
    const int y_hi = (y + r < h - 1) ? y + r : h - 1;
    uint8_t *out_row = dst + (size_t)y * (size_t)w;

    for (int x = 0; x < w; x++) {
      const int x_lo = (x - r > 0) ? x - r : 0;
      const int x_hi = (x + r < w - 1) ? x + r : w - 1;
      int best = 0;

      /* For r < 0 the bounds cross and both loops are skipped. */
      for (int yy = y_lo; yy <= y_hi; yy++) {
        const uint8_t *in_row = src + (size_t)yy * (size_t)w;
        for (int xx = x_lo; xx <= x_hi; xx++) {
          if (in_row[xx] > best) {
            best = in_row[xx];
          }
        }
      }
      out_row[x] = (uint8_t)best;
    }
  }
}
/*
 * Grayscale erosion with a square (2r+1) x (2r+1) window.
 *
 * For every pixel of the w x h row-major image `src`, writes to `dst` the
 * minimum value found in the window centred on that pixel. Window taps that
 * fall outside the image replicate the nearest edge pixel.
 *
 * src  input image, w * h bytes, not modified
 * dst  output image, w * h bytes, must not overlap src
 * w,h  image width and height in pixels; nothing is written if either is <= 0
 * r    window radius; r == 0 copies the image, r < 0 yields an all-255 output
 */
static void erode_r(const uint8_t *src, uint8_t *dst, int w, int h, int r) {
  for (int y = 0; y < h; y++) {
    /*
     * Edge replication can only re-read pixels that already lie inside the
     * window clipped to the image, so taking the minimum over the clipped
     * window gives the same result without clamping each tap. The row
     * bounds depend only on y and are computed once per output row.
     */
    const int y_lo = (y - r > 0) ? y - r : 0;
    const int y_hi = (y + r < h - 1) ? y + r : h - 1;
    uint8_t *out_row = dst + (size_t)y * (size_t)w;

    for (int x = 0; x < w; x++) {
      const int x_lo = (x - r > 0) ? x - r : 0;
      const int x_hi = (x + r < w - 1) ? x + r : w - 1;
      int best = 255;

      /* For r < 0 the bounds cross and both loops are skipped. */
      for (int yy = y_lo; yy <= y_hi; yy++) {
        const uint8_t *in_row = src + (size_t)yy * (size_t)w;
        for (int xx = x_lo; xx <= x_hi; xx++) {
          if (in_row[xx] < best) {
            best = in_row[xx];
          }
        }
      }
      out_row[x] = (uint8_t)best;
    }
  }
}
/* 3x3 dilation of a w x h image: dilate_r with a window radius of 1. */
static void dilate3x3(const uint8_t *src, uint8_t *dst, int w, int h) {
  const int radius = 1; /* 2 * 1 + 1 = 3 taps per axis */
  dilate_r(src, dst, w, h, radius);
}
/* 3x3 erosion of a w x h image: erode_r with a window radius of 1. */
static void erode3x3(const uint8_t *src, uint8_t *dst, int w, int h) {
  const int radius = 1; /* 2 * 1 + 1 = 3 taps per axis */
  erode_r(src, dst, w, h, radius);
}
/* 5x5 dilation of a w x h image: dilate_r with a window radius of 2. */
static void dilate5x5(const uint8_t *src, uint8_t *dst, int w, int h) {
  const int radius = 2; /* 2 * 2 + 1 = 5 taps per axis */
  dilate_r(src, dst, w, h, radius);
}
/* 5x5 erosion of a w x h image: erode_r with a window radius of 2. */
static void erode5x5(const uint8_t *src, uint8_t *dst, int w, int h) {
  const int radius = 2; /* 2 * 2 + 1 = 5 taps per axis */
  erode_r(src, dst, w, h, radius);
}
/*
 * Run the morphology chain on a w x h 8-bit image and return a checksum.
 *
 * Stages, in order: dilate 3x3 -> erode 3x3 -> dilate 5x5 -> erode 5x5.
 * The return value is the sum of all bytes of the final image, as a double.
 *
 * w, h   image dimensions in pixels
 * img0   row-major input image of w * h bytes; only read, never modified
 *
 * Returns 0.0 when w or h is not positive (an empty image sums to zero), and
 * NAN when the pixel count overflows size_t or a scratch buffer cannot be
 * allocated.
 */
static double pipeline_run(int w, int h, const uint8_t *img0) {
  if (w <= 0 || h <= 0) {
    return 0.0;
  }
  /* Compute the pixel count in size_t so that w * h cannot overflow int. */
  if ((size_t)w > SIZE_MAX / (size_t)h) {
    return NAN;
  }
  const size_t n = (size_t)w * (size_t)h;

  uint8_t *buf1 = (uint8_t *)malloc(n * sizeof *buf1);
  uint8_t *buf2 = (uint8_t *)malloc(n * sizeof *buf2);
  double acc = NAN; /* reported as-is if either allocation failed */

  if (buf1 != NULL && buf2 != NULL) {
    /*
     * The first stage only reads its source, so it consumes img0 directly;
     * buf1 is fully written by the following erode before it is ever read.
     */
    dilate3x3(img0, buf2, w, h);
    erode3x3(buf2, buf1, w, h);
    dilate5x5(buf1, buf2, w, h);
    erode5x5(buf2, buf1, w, h);

    acc = 0.0;
    for (size_t i = 0; i < n; i++) {
      acc += (double)buf1[i];
    }
  }

  /* free(NULL) is a no-op, so one cleanup path serves success and failure. */
  free(buf1);
  free(buf2);
  return acc;
}
/*
 * Benchmark entry point, produced by expanding BENCH_MAIN_SCALAR3.
 *
 * The macro is defined outside this file (presumably in bench_harness.h), so
 * the roles of its arguments below are read off this call site only —
 * TODO confirm against the macro definition.
 *
 *  - T004_Module_005, IMG05: identifiers passed to the harness.
 *  - 4096, 16384, 65536: presumably the problem sizes bound to `n`. All three
 *    are perfect squares (64^2, 128^2, 256^2), so w * h == n for each of them.
 *  - Setup fragment: w = sqrt(n) rounded to nearest, h = w, `img` is a heap
 *    buffer of n bytes, ans_scalar starts at 0.0. `n` is not declared here;
 *    presumably it is supplied by the macro.
 *  - Init block: creates an RNG from `seed` (also not declared here) and sets
 *    each img byte to 255 when bit 7 of the low RNG byte is set, else to 0.
 *  - Remaining arguments: the expression that runs pipeline_run on img, the
 *    result expression ans_scalar, and the cleanup statement free(img).
 *
 * NOTE(review): the malloc result for img is not checked before it is
 * written. Also, for an n that is not a perfect square w * h can differ from
 * n, in which case pipeline_run may read beyond the n bytes allocated here.
 */
BENCH_MAIN_SCALAR3(
    T004_Module_005, IMG05, 4096, 16384, 65536,
    int w = (int)(sqrt((double)n) + 0.5);
    int h = w; uint8_t *img = (uint8_t *)malloc((size_t)(n) * sizeof(uint8_t));
    double ans_scalar = 0.0;
    ,
    {
      bench_rng64_t rng = bench_rng_init(seed);
      for (int i = 0; i < n; i++) {
        uint8_t v = (uint8_t)(bench_rng_next(&rng) & 255ULL);
        img[i] = (v & 128) ? 255 : 0;
      }
    },
    ans_scalar = pipeline_run(w, h, img), ans_scalar, free(img);)
