
#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
/* Compile-time filter selector: when non-zero, pipeline_run() calls
 * strong_filter_luma(); when zero, it calls weak_filter_luma(). */
#define DO_STRONG 1
/*
 * Clamp an integer to the range of an unsigned 8-bit sample.
 *
 * Returns 0 for negative input, 255 for input above 255, and the input
 * itself otherwise.
 */
static inline int clip_u8_int(int v) {
  return (v < 0) ? 0 : ((v > 255) ? 255 : v);
}
/*
 * Horizontal 3-tap box filter over a plane of 8-bit samples.
 *
 * The plane is read as H rows of W bytes, stored row-major with a stride of
 * W. For every row the first and last samples are copied unchanged; each
 * interior sample becomes the truncated mean of itself and its left and
 * right neighbours, all read from src.
 *
 *   src  input plane, at least W * H bytes
 *   dst  output plane, at least W * H bytes
 *   W    row width in samples; nothing is written when W <= 0
 *   H    number of rows; nothing is written when H <= 0
 */
static inline void strong_filter_luma(const uint8_t *src, uint8_t *dst, int W,
                                      int H) {
  if (W <= 0 || H <= 0)
    return;

  const size_t width = (size_t)W;

  for (int y = 0; y < H; y++) {
    /* Offsets are computed in size_t: the product y * W overflows int
     * (undefined behaviour) once the plane is larger than INT_MAX bytes. */
    const size_t off = (size_t)y * width;
    const uint8_t *in = src + off;
    uint8_t *out = dst + off;

    /* Left edge has no left neighbour: pass through. */
    out[0] = in[0];

    for (size_t x = 1; x + 1 < width; x++) {
      const int sum = in[x - 1] + in[x] + in[x + 1];
      /* sum is within 0..765, so sum / 3 is within 0..255 and needs no
       * clamping before the narrowing store. */
      out[x] = (uint8_t)(sum / 3);
    }

    /* Right edge has no right neighbour: pass through (for W == 1 the
     * single sample was already copied above). */
    if (width > 1)
      out[width - 1] = in[width - 1];
  }
}
/*
 * Pass-through "filter": copies a plane of 8-bit samples from src to dst.
 *
 * The plane is treated as H rows of W bytes stored row-major with a stride
 * of W; every sample is copied unchanged, one at a time.
 *
 *   src  input plane, at least W * H bytes
 *   dst  output plane, at least W * H bytes
 *   W    row width in samples
 *   H    number of rows
 */
static inline void weak_filter_luma(const uint8_t *src, uint8_t *dst, int W,
                                    int H) {
  int row = 0;
  while (row < H) {
    const int base = row * W;
    int col = 0;
    while (col < W) {
      const int idx = base + col;
      dst[idx] = src[idx];
      ++col;
    }
    ++row;
  }
}
/*
 * Run one filter pass and reduce the output to a checksum.
 *
 * Filters the W x H plane y_in into y_out using strong_filter_luma() when
 * DO_STRONG is non-zero and weak_filter_luma() otherwise, then sums the
 * first n bytes of y_out.
 *
 *   n      number of output bytes folded into the checksum
 *   y_in   input plane handed to the filter
 *   y_out  output plane written by the filter and read by the checksum
 *   uv     accepted but not used by this function
 *   W, H   plane dimensions forwarded to the filter
 *
 * Returns the byte sum converted to double.
 */
static double pipeline_run(int n, uint8_t *y_in, uint8_t *y_out, uint8_t *uv,
                           int W, int H) {
  (void)uv;

#if DO_STRONG
  strong_filter_luma(y_in, y_out, W, H);
#else
  weak_filter_luma(y_in, y_out, W, H);
#endif

  /* Accumulate in a 64-bit integer and convert once at the end. */
  long long checksum = 0;
  int idx = 0;
  while (idx < n) {
    checksum += y_out[idx];
    ++idx;
  }
  return (double)checksum;
}
/*
 * Benchmark entry point, produced by expanding BENCH_MAIN_SCALAR3.
 *
 * NOTE(review): the macro is not defined in this file (presumably it comes
 * from bench_harness.h), so the roles below are inferred from the contents
 * of each argument -- confirm against the macro definition. `n` and `seed`
 * are not declared here and are assumed to be supplied by the macro.
 *
 * Arguments as written:
 *   - T004_Module_014, NV12DB: two identifiers passed through to the macro.
 *   - 4096, 16384, 65536: three sizes (all multiples of the fixed W = 128).
 *   - declarations: y_in and y_out of n bytes each, uv of n / 2 + 1 bytes,
 *     and the result variable ans_scalar.
 *   - fill block: writes random bytes to y_in and uv, zeroes y_out.
 *   - run block: calls pipeline_run() on a 128 x (n / 128) plane.
 *   - ans_scalar: the value handed back to the macro.
 *   - cleanup: frees the three buffers.
 *
 * NOTE(review): the malloc results are not checked in the code visible
 * here, and H in the fill block is computed but not used in that block.
 */
BENCH_MAIN_SCALAR3(
    T004_Module_014, NV12DB, 4096, 16384, 65536,
    uint8_t *y_in = (uint8_t *)malloc((size_t)n * sizeof(uint8_t));
    uint8_t *y_out = (uint8_t *)malloc((size_t)n * sizeof(uint8_t));
    uint8_t *uv = (uint8_t *)malloc((size_t)(n / 2 + 1) * sizeof(uint8_t));
    double ans_scalar = 0.0;
    ,
    {
      int W = 128;
      int H = n / W;
      bench_rng64_t rng = bench_rng_init(seed);
      /* Low 8 bits of each generator output become one input sample. */
      for (int i = 0; i < n; i++) {
        y_in[i] = (uint8_t)(bench_rng_next(&rng) & 0xFFu);
        y_out[i] = 0;
      }
      /* Only the first n / 2 bytes of uv are filled; the last allocated byte
       * is left unwritten. */
      for (int i = 0; i < n / 2; i++) {
        uv[i] = (uint8_t)(bench_rng_next(&rng) & 0xFFu);
      }
    },
    {
      int W = 128;
      int H = n / W;
      ans_scalar = pipeline_run(n, y_in, y_out, uv, W, H);
    },
    ans_scalar, free(y_in);
    free(y_out); free(uv);)
