
#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
/* Edge length, in samples, of the square block compared by sad_block8() and
 * the step used by process_frame() when walking the frame. */
#define BLK 8
/* Largest absolute x/y offset emitted by gen_candidates_full(). */
#define RADIUS 2
/* Non-zero: sad_block8() accumulates absolute differences.
 * Zero: it accumulates squared differences instead. */
#define USE_SAD 1
/* Non-zero: pipeline_run() searches with gen_candidates_diamond().
 * Zero: it searches with gen_candidates_full(). */
#define USE_DIAMOND 1
/* Candidate generator callback. Receives the block origin (bx, by), writes
 * displacement offsets into dx[] / dy[] and returns how many it wrote.
 * search_block() passes arrays of 64 ints for dx and dy. */
typedef int (*cand_gen_fn)(int bx, int by, int *dx, int *dy);
/*
 * Cost of matching the BLK x BLK block of `cur` whose top-left sample is
 * (bx, by) against the BLK x BLK block of `ref` whose top-left sample is
 * (rx, ry). Both planes are addressed row-major with a row pitch of W.
 *
 * The per-sample metric is chosen at compile time by USE_SAD: the absolute
 * difference when it is non-zero, the squared difference otherwise.
 * Returns the sum of the metric over all BLK * BLK sample pairs.
 */
static inline int sad_block8(const uint8_t *cur, const uint8_t *ref, int W,
                             int bx, int by, int rx, int ry) {
  int cost = 0;
  for (int row = 0; row < BLK; row++) {
    /* Start of this row inside each plane. */
    const uint8_t *cur_row = cur + ((by + row) * W + bx);
    const uint8_t *ref_row = ref + ((ry + row) * W + rx);
    for (int col = 0; col < BLK; col++) {
      const int diff = (int)cur_row[col] - (int)ref_row[col];
#if USE_SAD
      cost += (diff < 0) ? -diff : diff;
#else
      cost += diff * diff;
#endif
    }
  }
  return cost;
}
/*
 * Limit v to the range [lo, hi].
 * The lower bound is tested first, so when v < lo the result is lo even if
 * lo happens to be greater than hi.
 */
static inline int clampi(int v, int lo, int hi) {
  return (v < lo) ? lo : ((v > hi) ? hi : v);
}
/*
 * search_block() hands candidate generators two fixed arrays of 64 ints.
 * The full window emits (2 * RADIUS + 1)^2 offsets, so refuse to build when
 * RADIUS is raised far enough to overrun those arrays.
 */
#if (2 * RADIUS + 1) * (2 * RADIUS + 1) > 64
#error "RADIUS too large: full search window exceeds the 64-entry candidate buffers"
#endif

/*
 * Exhaustive candidate generator (matches cand_gen_fn).
 *
 * Writes every displacement (xx, yy) with -RADIUS <= xx, yy <= RADIUS into
 * dx[] / dy[], ordered row by row (yy outer, xx inner), and returns the
 * number of entries written, i.e. (2 * RADIUS + 1)^2.
 *
 * bx / by are not used: the offsets do not depend on the block position.
 */
static int gen_candidates_full(int bx, int by, int *dx, int *dy) {
  /* Same convention as gen_candidates_diamond(): mark the unused
   * callback parameters explicitly so -Wunused-parameter stays quiet. */
  (void)bx;
  (void)by;
  int m = 0;
  for (int yy = -RADIUS; yy <= RADIUS; yy++) {
    for (int xx = -RADIUS; xx <= RADIUS; xx++) {
      dx[m] = xx;
      dy[m] = yy;
      m++;
    }
  }
  return m;
}
/*
 * Five-point candidate generator (matches cand_gen_fn).
 *
 * Emits the zero displacement followed by the four unit steps along the
 * axes, in the order (0,0), (1,0), (-1,0), (0,1), (0,-1), and returns 5.
 * bx / by are ignored: the pattern is the same for every block.
 */
static int gen_candidates_diamond(int bx, int by, int *dx, int *dy) {
  static const int pattern[][2] = {
      {0, 0}, {1, 0}, {-1, 0}, {0, 1}, {0, -1},
  };
  const int count = (int)(sizeof pattern / sizeof pattern[0]);

  (void)bx;
  (void)by;
  for (int k = 0; k < count; k++) {
    dx[k] = pattern[k][0];
    dy[k] = pattern[k][1];
  }
  return count;
}
/*
 * Find the cheapest match for the block of `cur` at (bx, by).
 *
 * `gen` fills in a list of displacement offsets. Each offset is added to the
 * block origin, the resulting position is clamped so that a whole BLK x BLK
 * block stays inside the W x H reference plane, and sad_block8() scores it.
 *
 * Returns the smallest score seen, or 0x7fffffff if `gen` produced no
 * candidates. The offset arrays hold 64 entries each, so `gen` must not
 * write more than that.
 */
static inline int search_block(const uint8_t *cur, const uint8_t *ref, int W,
                               int H, int bx, int by, cand_gen_fn gen) {
  int off_x[64];
  int off_y[64];
  const int count = gen(bx, by, off_x, off_y);

  /* Largest top-left coordinates that still leave room for a full block. */
  const int max_x = W - BLK;
  const int max_y = H - BLK;

  int lowest = 0x7fffffff;
  int idx = 0;
  while (idx < count) {
    const int px = clampi(bx + off_x[idx], 0, max_x);
    const int py = clampi(by + off_y[idx], 0, max_y);
    const int cost = sad_block8(cur, ref, W, bx, by, px, py);
    lowest = (cost < lowest) ? cost : lowest;
    idx++;
  }
  return lowest;
}
/*
 * Walk the W x H frame in steps of BLK along both axes, visiting only
 * positions where a complete BLK x BLK block fits, run search_block() with
 * candidate generator `gen` on each one, and return the sum of the per-block
 * best costs.
 */
static long long process_frame(const uint8_t *cur, const uint8_t *ref, int W,
                               int H, cand_gen_fn gen) {
  const int last_x = W - BLK; /* right-most block origin that still fits */
  const int last_y = H - BLK; /* bottom-most block origin that still fits */
  long long sum = 0;

  int y = 0;
  while (y <= last_y) {
    int x = 0;
    while (x <= last_x) {
      const int best = search_block(cur, ref, W, H, x, y, gen);
      sum += (long long)best;
      x += BLK;
    }
    y += BLK;
  }
  return sum;
}
/*
 * Score one frame pair and return the total best-match cost as a double.
 *
 * cur / ref : sample planes, row-major with a row pitch of W.
 * W / H     : frame dimensions handed to process_frame().
 * n         : not used here; the frame extent is fully described by W and H.
 *
 * The candidate generator is fixed at compile time by USE_DIAMOND:
 * gen_candidates_diamond() when non-zero, gen_candidates_full() otherwise.
 */
static double pipeline_run(int n, const uint8_t *cur, const uint8_t *ref, int W,
                           int H) {
  /* Mark the unused parameter explicitly, as gen_candidates_diamond() does
   * for its own, so -Wunused-parameter stays quiet. */
  (void)n;
#if USE_DIAMOND
  cand_gen_fn gen = gen_candidates_diamond;
#else
  cand_gen_fn gen = gen_candidates_full;
#endif
  const long long acc = process_frame(cur, ref, W, H, gen);

  return (double)acc;
}
/*
 * Benchmark entry point, produced by the BENCH_MAIN_SCALAR3 macro.
 * NOTE(review): the macro is not defined in this file (presumably it comes
 * from bench_harness.h), so the role of each argument described below is
 * inferred from its contents. `n` and `seed` are not declared here and are
 * presumably supplied by the macro expansion -- confirm against the harness.
 */
BENCH_MAIN_SCALAR3(
    T004_Module_015, MOTEST, 4096, 16384, 65536,
    /* Declarations: two n-byte sample buffers and the result slot.
     * NOTE(review): the malloc() results are used without a NULL check. */
    uint8_t *cur = (uint8_t *)malloc((size_t)n * sizeof(uint8_t));
    uint8_t *ref = (uint8_t *)malloc((size_t)n * sizeof(uint8_t));
    double ans_scalar = 0.0;
    ,
    {
      /* Fill both buffers with the low byte of successive RNG outputs,
       * alternating cur[i] then ref[i]. */
      int W = 64;
      int H = n / W;
      bench_rng64_t rng = bench_rng_init(seed);
      for (int i = 0; i < n; i++) {
        cur[i] = (uint8_t)(bench_rng_next(&rng) & 0xFFu);
        ref[i] = (uint8_t)(bench_rng_next(&rng) & 0xFFu);
      }
      /* H is computed but not needed in this block. */
      (void)H;
    },
    {
      /* Treat the n samples as a frame 64 wide and n / 64 high, then score it. */
      int W = 64;
      int H = n / W;
      ans_scalar = pipeline_run(n, cur, ref, W, H);
    },
    /* Result expression, followed by the statements that release the buffers. */
    ans_scalar, free(cur);
    free(ref);)
