#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
/* Compile-time switch read by pipeline_run: non-zero selects
 * convert_line_bt709, zero selects convert_line_bt601. */
#define USE_BT709 1
/* Row width, in elements, that the benchmark driver passes to pipeline_run
 * as W (rows are n / WIDTH_CS). */
#define WIDTH_CS 128
/*
 * Saturate a signed integer into the 0..255 range.
 *
 * Values below 0 become 0, values above 255 become 255, and anything in
 * between is returned unchanged as a byte.
 */
static inline uint8_t clip_u8_int(int v) {
  const int lower_bounded = (v < 0) ? 0 : v;
  const int saturated = (lower_bounded > 255) ? 255 : lower_bounded;
  return (uint8_t)saturated;
}
/*
 * Convert one line of W samples from separate Y/U/V planes to separate
 * R/G/B planes using the "bt601" coefficient set below.
 *
 * y, u, v : input planes, one byte per sample, read at indices 0..W-1.
 * r, g, b : output planes, one byte per sample, written at indices 0..W-1.
 * W       : number of samples; nothing is touched when W <= 0.
 *
 * Y is used as-is; U and V are re-centred by subtracting 128. Each channel
 * is computed in double precision, rounded with lrint() (current rounding
 * mode), and saturated to 0..255.
 */
static inline void convert_line_bt601(const uint8_t *y, const uint8_t *u,
                                      const uint8_t *v, uint8_t *r, uint8_t *g,
                                      uint8_t *b, int W) {
  int px = 0;
  while (px < W) {
    /* Load all three inputs before storing any output. */
    const double y_val = (double)y[px];
    const double u_off = (double)u[px] - 128.0;
    const double v_off = (double)v[px] - 128.0;

    const long red = lrint(y_val + 1.402 * v_off);
    const long green = lrint(y_val - 0.344136 * u_off - 0.714136 * v_off);
    const long blue = lrint(y_val + 1.772 * u_off);

    r[px] = clip_u8_int((int)red);
    g[px] = clip_u8_int((int)green);
    b[px] = clip_u8_int((int)blue);
    px++;
  }
}
/*
 * Convert one line of W samples from separate Y/U/V planes to separate
 * R/G/B planes using the "bt709" coefficient set below.
 *
 * y, u, v : input planes, one byte per sample, read at indices 0..W-1.
 * r, g, b : output planes, one byte per sample, written at indices 0..W-1.
 * W       : number of samples; nothing is touched when W <= 0.
 *
 * Y is used as-is; U and V are re-centred by subtracting 128. Each channel
 * is computed in double precision, rounded with lrint() (current rounding
 * mode), and saturated to 0..255.
 */
static inline void convert_line_bt709(const uint8_t *y, const uint8_t *u,
                                      const uint8_t *v, uint8_t *r, uint8_t *g,
                                      uint8_t *b, int W) {
  for (int col = 0; col < W; ++col) {
    /* Read the whole input triple first, then emit the three outputs. */
    const double base = (double)y[col];
    const double du = (double)u[col] - 128.0;
    const double dv = (double)v[col] - 128.0;

    const double red_f = base + 1.5748 * dv;
    const double green_f = base - 0.187324 * du - 0.468124 * dv;
    const double blue_f = base + 1.8556 * du;

    const int red_i = (int)lrint(red_f);
    const int green_i = (int)lrint(green_f);
    const int blue_i = (int)lrint(blue_f);

    r[col] = clip_u8_int(red_i);
    g[col] = clip_u8_int(green_i);
    b[col] = clip_u8_int(blue_i);
  }
}
/*
 * Convert one line of W samples from separate Y/U/V planes to separate
 * R/G/B planes using the "bt2020" coefficient set below.
 *
 * y, u, v : input planes, one byte per sample, read at indices 0..W-1.
 * r, g, b : output planes, one byte per sample, written at indices 0..W-1.
 * W       : number of samples; nothing is touched when W <= 0.
 *
 * Y is used as-is; U and V are re-centred by subtracting 128. Each channel
 * is computed in double precision, rounded with lrint() (current rounding
 * mode), and saturated to 0..255.
 */
static inline void convert_line_bt2020(const uint8_t *y, const uint8_t *u,
                                       const uint8_t *v, uint8_t *r, uint8_t *g,
                                       uint8_t *b, int W) {
  int k;
  for (k = 0; k < W; k += 1) {
    /* Inputs are fetched before any output byte is stored. */
    const double yy = (double)y[k];
    const double uu = (double)u[k] - 128.0;
    const double vv = (double)v[k] - 128.0;

    const long rounded_r = lrint(yy + 1.4746 * vv);
    const long rounded_g = lrint(yy - 0.16455 * uu - 0.57135 * vv);
    const long rounded_b = lrint(yy + 1.8814 * uu);

    r[k] = clip_u8_int((int)rounded_r);
    g[k] = clip_u8_int((int)rounded_g);
    b[k] = clip_u8_int((int)rounded_b);
  }
}
/*
 * Convert an H-row by W-column planar YUV image to planar RGB, then return
 * a checksum of the output.
 *
 * n       : number of leading elements of R, G and B folded into the checksum.
 * Y, U, V : input planes; rows are stored back to back with a stride of W.
 * R, G, B : output planes with the same layout; rows 0..H-1 are overwritten.
 * W, H    : row width and row count. Only W * H elements are converted, so
 *           when W * H < n the remaining outputs keep whatever the caller
 *           left in them and are still included in the sum.
 *
 * The conversion matrix is chosen at compile time: convert_line_bt709 when
 * USE_BT709 is non-zero, convert_line_bt601 otherwise.
 *
 * Returns the sum of R[i] + G[i] + B[i] over i in [0, n).
 */
static double pipeline_run(int n, const uint8_t *Y, const uint8_t *U,
                           const uint8_t *V, uint8_t *R, uint8_t *G, uint8_t *B,
                           int W, int H) {
  /* A non-positive width converts nothing; skipping the loop also avoids
   * forming pointers at negative offsets. */
  if (W > 0) {
    const size_t stride = (size_t)W;
    for (int row = 0; row < H; row++) {
      /* Compute the offset in size_t: row * W in int overflows (undefined
       * behaviour) once the image holds more than INT_MAX samples. */
      const size_t off = (size_t)row * stride;
      const uint8_t *yrow = Y + off;
      const uint8_t *urow = U + off;
      const uint8_t *vrow = V + off;
      uint8_t *rrow = R + off;
      uint8_t *grow = G + off;
      uint8_t *brow = B + off;
#if USE_BT709
      convert_line_bt709(yrow, urow, vrow, rrow, grow, brow, W);
#else
      convert_line_bt601(yrow, urow, vrow, rrow, grow, brow, W);
#endif
    }
  }

  /* Fold every output byte into one value so the result can be compared
   * and the conversion cannot be optimised away. */
  double acc = 0.0;
  for (int i = 0; i < n; i++) {
    acc += (double)R[i] + (double)G[i] + (double)B[i];
  }
  return acc;
}
/*
 * Benchmark entry point, generated by BENCH_MAIN_SCALAR3 (declared outside
 * this file, presumably in bench_harness.h).
 *
 * NOTE(review): the argument roles below are inferred from their contents;
 * confirm against the macro definition. They appear to be: benchmark name,
 * tag, three problem sizes, setup declarations, an input-initialisation
 * block, the measured expression, the result expression, and cleanup.
 * `n` and `seed` are not declared here and are presumably provided by the
 * macro.
 *
 * The six malloc results are used without a NULL check. Hc = n / Wc is an
 * integer division; the three listed sizes are all multiples of WIDTH_CS
 * (128), so no elements are left over for them.
 */
BENCH_MAIN_SCALAR3(
    T004_Module_016, CSCONV, 4096, 16384, 65536,
    /* Six n-byte planes: Y/U/V inputs and R/G/B outputs. */
    uint8_t *Y = (uint8_t *)malloc((size_t)n * sizeof(uint8_t));
    uint8_t *U = (uint8_t *)malloc((size_t)n * sizeof(uint8_t));
    uint8_t *V = (uint8_t *)malloc((size_t)n * sizeof(uint8_t));
    uint8_t *R = (uint8_t *)malloc((size_t)n * sizeof(uint8_t));
    uint8_t *G = (uint8_t *)malloc((size_t)n * sizeof(uint8_t));
    uint8_t *B = (uint8_t *)malloc((size_t)n * sizeof(uint8_t));
    double ans_scalar = 0.0; int Wc = WIDTH_CS; int Hc = n / Wc;
    ,
    {
      /* Fill the inputs with the low byte of successive RNG outputs and
       * zero the outputs. */
      bench_rng64_t rng = bench_rng_init(seed);
      for (int i = 0; i < n; i++) {
        Y[i] = (uint8_t)(bench_rng_next(&rng) & 0xFFu);
        U[i] = (uint8_t)(bench_rng_next(&rng) & 0xFFu);
        V[i] = (uint8_t)(bench_rng_next(&rng) & 0xFFu);
        R[i] = G[i] = B[i] = 0;
      }
    },
    /* Run the conversion and keep its checksum, then release all buffers. */
    ans_scalar = pipeline_run(n, Y, U, V, R, G, B, Wc, Hc), ans_scalar, free(Y);
    free(U); free(V); free(R); free(G); free(B);)
