#include "bench_harness.h"
#include "bench_utils.h"
#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
/*
 * 256-entry byte-to-byte lookup table, indexed by an 8-bit channel value.
 * Entries are non-decreasing, starting at 0 and ending at 255.
 * NOTE(review): spot checks (indices 1, 2, 128, 255) agree with
 * round(255 * pow(i / 255.0, 1 / 2.2)), i.e. a gamma-2.2 encoding curve;
 * confirm against whatever generated the table before relying on that.
 */
static const unsigned char gamma_lut[256] = {
    0,   21,  28,  34,  39,  43,  46,  50,  53,  56,  59,  61,  64,  66,  68,
    70,  72,  74,  76,  78,  80,  82,  84,  85,  87,  89,  90,  92,  93,  95,
    96,  98,  99,  101, 102, 103, 105, 106, 107, 109, 110, 111, 112, 114, 115,
    116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
    132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 144, 145,
    146, 147, 148, 149, 150, 151, 151, 152, 153, 154, 155, 156, 156, 157, 158,
    159, 160, 160, 161, 162, 163, 164, 164, 165, 166, 167, 167, 168, 169, 170,
    170, 171, 172, 173, 173, 174, 175, 175, 176, 177, 178, 178, 179, 180, 180,
    181, 182, 182, 183, 184, 184, 185, 186, 186, 187, 188, 188, 189, 190, 190,
    191, 192, 192, 193, 194, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200,
    200, 201, 202, 202, 203, 203, 204, 205, 205, 206, 206, 207, 207, 208, 209,
    209, 210, 210, 211, 212, 212, 213, 213, 214, 214, 215, 215, 216, 217, 217,
    218, 218, 219, 219, 220, 220, 221, 221, 222, 223, 223, 224, 224, 225, 225,
    226, 226, 227, 227, 228, 228, 229, 229, 230, 230, 231, 231, 232, 232, 233,
    233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, 239, 239, 240, 240,
    241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248,
    248, 249, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255,
    255};
/*
 * Saturate an integer to the closed range [0, 255].
 * Values below 0 become 0, values above 255 become 255, and everything in
 * between is returned unchanged.
 */
static inline int clamp255_int(int v) {
  const int floored = (v < 0) ? 0 : v;
  return (floored > 255) ? 255 : floored;
}
/*
 * Convert a planar YUV image with 2x2-subsampled chroma to packed 8-bit RGB.
 *
 * y     luma plane, w * h bytes, row-major.
 * u, v  chroma planes, read with a row stride of (w >> 1) bytes; pixel
 *       (xx, yy) uses the chroma sample at (xx >> 1, yy >> 1).
 * w, h  image size in pixels. Non-positive sizes convert nothing.
 * rgb   output buffer of 3 * w * h bytes, written as R, G, B per pixel.
 *
 * Each channel is computed in double precision, rounded with lrint() (current
 * rounding mode) and saturated to [0, 255]. The coefficients are the ones used
 * by the full-range JFIF YCbCr -> RGB conversion.
 *
 * All offsets are computed in size_t: the byte offset 3 * (yy * w + xx) does
 * not fit in int once the image exceeds INT_MAX / 3 pixels, and signed
 * overflow is undefined behaviour.
 *
 * NOTE(review): because the chroma stride is (w >> 1), the caller must make
 * sure u and v cover index ((h - 1) >> 1) * (w >> 1) + ((w - 1) >> 1); for odd
 * w or h that is not the same as (w * h) / 4 elements.
 */
static void yuv420_to_rgb(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                          int w, int h, uint8_t *rgb) {
  if (w <= 0 || h <= 0) {
    return; /* matches the empty loops of an int-indexed walk */
  }

  const size_t width = (size_t)w;
  const size_t height = (size_t)h;
  const size_t chroma_stride = width >> 1;

  for (size_t row = 0; row < height; row++) {
    /* Per-row base pointers keep the inner loop free of multiplications. */
    const uint8_t *luma_row = y + row * width;
    const uint8_t *u_row = u + (row >> 1) * chroma_stride;
    const uint8_t *v_row = v + (row >> 1) * chroma_stride;
    uint8_t *out_row = rgb + 3 * row * width;

    for (size_t col = 0; col < width; col++) {
      const double Yd = (double)luma_row[col];
      /* Chroma is stored with a +128 bias; remove it before mixing. */
      const double Ud = (double)u_row[col >> 1] - 128.0;
      const double Vd = (double)v_row[col >> 1] - 128.0;

      const double Rd = Yd + 1.402 * Vd;
      const double Gd = Yd - 0.344136 * Ud - 0.714136 * Vd;
      const double Bd = Yd + 1.772 * Ud;

      uint8_t *px = out_row + 3 * col;
      px[0] = (uint8_t)clamp255_int((int)lrint(Rd));
      px[1] = (uint8_t)clamp255_int((int)lrint(Gd));
      px[2] = (uint8_t)clamp255_int((int)lrint(Bd));
    }
  }
}
/*
 * Map every channel byte of a packed RGB buffer through gamma_lut and return
 * the sum of the mapped values.
 *
 * rgb  buffer holding 3 * n bytes (three channel bytes per pixel).
 * n    number of pixels. Non-positive values yield 0.0.
 *
 * Returns the sum as a double. Every term is an integer in [0, 255], so the
 * sum stays exactly representable for any n that fits in int.
 *
 * The byte count is computed in size_t because 3 * n overflows int (undefined
 * behaviour) for n > INT_MAX / 3.
 */
static double apply_gamma_lut_and_sum(const uint8_t *rgb, int n) {
  if (n <= 0) {
    return 0.0;
  }

  const size_t total = (size_t)n * 3u;
  double acc = 0.0;
  for (size_t i = 0; i < total; i++) {
    acc += (double)gamma_lut[rgb[i]];
  }
  return acc;
}
/*
 * Run the full pipeline on one image: convert the YUV planes to a temporary
 * packed RGB buffer, then return the gamma-LUT checksum of that buffer.
 *
 * w, h     image size in pixels.
 * y, u, v  input planes, passed through unchanged to yuv420_to_rgb().
 *
 * Returns the checksum from apply_gamma_lut_and_sum(). Returns 0.0 when w or
 * h is not positive (there are no pixels to sum). Returns NAN when 3 * w * h
 * does not fit in int or when the RGB buffer cannot be allocated, so a failed
 * run shows up as an invalid result instead of a NULL dereference.
 *
 * The temporary buffer is owned by this function and freed before returning.
 */
static double pipeline_run(int w, int h, const uint8_t *y, const uint8_t *u,
                           const uint8_t *v) {
  if (w <= 0 || h <= 0) {
    return 0.0;
  }

  /* Reject sizes where w * h, or the 3 * n computed downstream, overflows. */
  if (w > INT_MAX / 3 / h) {
    return NAN;
  }

  const int n = w * h;
  const size_t rgb_bytes = (size_t)n * 3u;

  uint8_t *rgb = (uint8_t *)malloc(rgb_bytes);
  if (rgb == NULL) {
    return NAN;
  }

  yuv420_to_rgb(y, u, v, w, h, rgb);
  const double checksum = apply_gamma_lut_and_sum(rgb, n);
  free(rgb);

  return checksum;
}
/*
 * Benchmark entry point, produced by the BENCH_MAIN_SCALAR3 harness macro.
 *
 * NOTE(review): the macro is defined outside this file. Judging only by how
 * the arguments read, they appear to be: a name, a tag, three problem sizes,
 * setup declarations, an input-fill block, the run expression, the result
 * value, and cleanup -- confirm against bench_harness.h.
 *
 * The arguments use `n` and `seed` without declaring them, so both are
 * presumably provided by the macro expansion.
 */
BENCH_MAIN_SCALAR3(
    T004_Module_002, IMG02, 4096, 16384, 65536,
    /* Square image: side length is sqrt(n) + 0.5 truncated to int. The luma
       plane gets n bytes and each chroma plane n / 4 bytes.
       NOTE(review): none of the three allocations is checked for NULL. */
    int w = (int)(sqrt((double)n) + 0.5);
    int h = w; uint8_t *Yp = (uint8_t *)malloc((size_t)(n) * sizeof(uint8_t));
    uint8_t *Up = (uint8_t *)malloc((size_t)(n / 4) * sizeof(uint8_t));
    uint8_t *Vp = (uint8_t *)malloc((size_t)(n / 4) * sizeof(uint8_t));
    double ans_scalar = 0.0;
    ,
    {
      /* Fill Y, then U, then V with the low 8 bits of successive values from
         one generator initialised from `seed`. */
      bench_rng64_t rng = bench_rng_init(seed);
      for (int i = 0; i < n; i++)
        Yp[i] = (uint8_t)(bench_rng_next(&rng) & 255ULL);
      for (int i = 0; i < n / 4; i++)
        Up[i] = (uint8_t)(bench_rng_next(&rng) & 255ULL);
      for (int i = 0; i < n / 4; i++)
        Vp[i] = (uint8_t)(bench_rng_next(&rng) & 255ULL);
    },
    /* Run the pipeline, hand ans_scalar to the macro, then release the
       three input planes. */
    ans_scalar = pipeline_run(w, h, Yp, Up, Vp), ans_scalar, free(Yp);
    free(Up); free(Vp);)
