
#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
/**
 * Round a real value to the nearest integer (ties to even) and saturate the
 * result to the uint8_t range [0, 255].
 *
 * All range checks are done in the floating-point domain before any
 * conversion to an integer type: converting a NaN, an infinity, or a value
 * outside the target integer's range is undefined behaviour in C, so the
 * saturation must happen first.
 *
 * @param r  value to quantise; any double is accepted, including NaN and
 *           +/-infinity.
 * @return   0 for NaN and for every r <= 0, 255 for every r >= 255,
 *           otherwise r rounded half-to-even.
 */
static inline uint8_t round_clamp_channel(double r) {
  /* Written as !(r > 0.0) so that NaN, which fails every comparison, is
   * caught here as well. Every r <= 0 rounds to a value <= 0 and clamps to 0. */
  if (!(r > 0.0)) {
    return 0;
  }
  /* Every r >= 255 rounds to a value >= 255 and therefore saturates. */
  if (r >= 255.0) {
    return 255;
  }

  /* Here 0 < r < 255, so truncation equals floor and always fits in an int. */
  int base = (int)r;
  double frac = r - (double)base;

  if (frac > 0.5) {
    base += 1;
  } else if (frac == 0.5) {
    /* Exact tie: move up only from an odd integer so the result is even. */
    base += base & 1;
  }
  return (uint8_t)base;
}
/**
 * Per-channel affine quantisation of a C x H x W tensor stored channel-major.
 *
 * For every channel c and every position i within that channel's H * W
 * plane:
 *
 *   q[c * H * W + i] =
 *       round_clamp_channel(x[c * H * W + i] * (1.0 / scale[c]) + zp[c])
 *
 * Index arithmetic is carried out in size_t so that tensors with more than
 * INT_MAX elements do not trigger signed-integer overflow.
 *
 * @param C      number of channels
 * @param H      plane height
 * @param W      plane width
 * @param x      input values, C * H * W doubles
 * @param scale  per-channel scale, C doubles; the reciprocal is taken once per
 *               channel
 * @param zp     per-channel zero point, C values, added after scaling
 * @param q      output buffer, C * H * W bytes
 *
 * If any of C, H or W is not positive, nothing is written.
 */
void kernel_run(int C, int H, int W, const double *x, const double *scale,
                const int32_t *zp, uint8_t *q) {
  /* Reject non-positive dimensions before they are converted to size_t. */
  if (C <= 0 || H <= 0 || W <= 0) {
    return;
  }

  const size_t plane = (size_t)H * (size_t)W;

  for (int c = 0; c < C; c++) {
    /* Channel-invariant terms are hoisted out of the inner loop. */
    const double inv_sc = 1.0 / scale[c];
    const double offset = (double)zp[c];
    const size_t base = (size_t)c * plane;

    for (size_t i = 0; i < plane; i++) {
      q[base + i] = round_clamp_channel(x[base + i] * inv_sc + offset);
    }
  }
}
/*
 * Benchmark driver for kernel_run, produced by BENCH_MAIN_ARRAY3_BYTES.
 *
 * The macro is not defined in this file (presumably it comes from
 * bench_harness.h); the role of each argument below is inferred from its
 * contents and should be confirmed against the macro definition.
 *
 * Arguments, in order of appearance:
 *   - T002_Ops_059, OP89: identifiers handed to the harness.
 *   - 4096, 16384, 65536: these equal C * H * W for the three cases set up
 *     below (C = 4, 16, 64 with H = W = 32).
 *   - a declaration section: fixes H and W, picks C from case_id, and
 *     allocates x, scale, zp and q. case_id is not declared here; presumably
 *     the macro expansion provides it.
 *   - an initialisation section: fills x and scale through the bench_* helpers
 *     and draws each zp[c] from a seeded generator.
 *   - the kernel_run call itself.
 *   - q and its element count, C * HW.
 *   - a cleanup section that frees all four buffers.
 *
 * NOTE(review): none of the malloc results are checked before use.
 */
BENCH_MAIN_ARRAY3_BYTES(
    T002_Ops_059, OP89, 4096, 16384, 65536, int H = 32; int W = 32;
    /* case_id 1 -> 4 channels, case_id 2 -> 16, any other value -> 64. */
    int C = (case_id == 1 ? 4 : (case_id == 2 ? 16 : 64)); int HW = H * W;
    double *x = malloc((size_t)(C * HW) * sizeof(double));
    double *scale = malloc((size_t)C * sizeof(double));
    int32_t *zp = malloc((size_t)C * sizeof(int32_t));
    uint8_t *q = malloc((size_t)(C * HW) * sizeof(uint8_t));
    , bench_fill_array(x, (size_t)(C * HW), bench_seed(89));
    /* scale and zp use seeds derived from bench_seed(89) by XOR with 1 and 2. */
    bench_fill_array_pos(scale, (size_t)C, bench_seed(89) ^ 0x1u);
    {
      bench_rng64_t rng = bench_rng_init(bench_seed(89) ^ 0x2u);
      for (int c = 0; c < C; c++) {
        /* The 0xFF mask keeps every zero point within 0..255. */
        zp[c] = (int32_t)(bench_rng_next(&rng) & 0xFFu);
      }
    },
    kernel_run(C, H, W, x, scale, zp, q), q, (size_t)(C * HW), free(x);
    free(scale); free(zp); free(q))
