
#include "bench_harness.h"
#include "bench_utils.h"
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
/**
 * 3x3 convolution, stride 1, with out-of-image taps skipped (equivalent to
 * zero padding), so the output has the same H x W extent as the input.
 *
 * Array layouts, as implied by the indexing:
 *   x : [Cin][H][W]         input planes
 *   w : [Cout][Cin][3][3]   filter taps, row-major within each 3x3 window
 *   y : [Cout][H][W]        output planes; every element is overwritten
 *
 * All flat offsets are computed in size_t: the products Cout*H*W, Cin*H*W
 * and Cout*Cin*9 can exceed INT_MAX for large tensors, and signed overflow
 * is undefined behaviour.
 *
 * If Cout, H or W is not positive nothing is written. If Cin is not
 * positive every output element is set to 0.0.
 *
 * @param Cin   number of input channels
 * @param Cout  number of output channels
 * @param H     image height
 * @param W     image width
 * @param x     input tensor, Cin*H*W doubles
 * @param w     filter tensor, Cout*Cin*9 doubles
 * @param y     output tensor, Cout*H*W doubles
 */
void kernel_run(int Cin, int Cout, int H, int W, const double *x,
                const double *w, double *y) {
  if (Cout <= 0 || H <= 0 || W <= 0) {
    return; /* empty output: nothing to compute */
  }

  const size_t n_in = Cin > 0 ? (size_t)Cin : 0;
  const size_t n_out = (size_t)Cout;
  const size_t rows = (size_t)H;
  const size_t cols = (size_t)W;
  const size_t plane = rows * cols;

  for (size_t oc = 0; oc < n_out; oc++) {
    for (size_t iy = 0; iy < rows; iy++) {
      /* Tap rows are numbered 0..2 (tap 1 is the centre). Clip the range
       * once per output row instead of testing every tap. */
      const size_t ky_first = (iy == 0) ? 1 : 0;
      const size_t ky_last = (iy + 1 == rows) ? 1 : 2;

      for (size_t ix = 0; ix < cols; ix++) {
        const size_t kx_first = (ix == 0) ? 1 : 0;
        const size_t kx_last = (ix + 1 == cols) ? 1 : 2;

        /* Accumulate in (ic, ky, kx) order so the floating-point sum is
         * formed in the same sequence for every output element. */
        double acc = 0.0;
        for (size_t ic = 0; ic < n_in; ic++) {
          const size_t x_plane = ic * plane;
          const size_t w_window = (oc * n_in + ic) * 9;

          for (size_t ky = ky_first; ky <= ky_last; ky++) {
            /* iy + ky >= 1 here, so subtracting 1 never wraps. */
            const size_t x_row = x_plane + (iy + ky - 1) * cols;
            const size_t w_row = w_window + ky * 3;

            for (size_t kx = kx_first; kx <= kx_last; kx++) {
              acc += x[x_row + (ix + kx - 1)] * w[w_row + kx];
            }
          }
        }
        y[oc * plane + iy * cols + ix] = acc;
      }
    }
  }
}
/*
 * Benchmark entry point, generated by the BENCH_MAIN_ARRAY3_D macro (defined
 * outside this file, presumably in bench_harness.h).
 *
 * The top-level commas split the invocation into these arguments:
 *   1-2.  identifiers T002_Ops_044 and OP74
 *   3-5.  the constants 4096, 16384, 65536 (meaning not visible here --
 *         TODO confirm against the macro definition)
 *   6.    declarations: a fixed 32x32 image, channel counts selected by
 *         case_id, and the x / w / y buffers
 *   7.    statements that fill x and w through bench_fill_array
 *   8.    the kernel_run call
 *   9-10. the result array y and its element count Cout * HW
 *   11.   statements that free the three buffers
 *
 * case_id is not declared in this file; presumably the macro expansion
 * provides it -- verify against the harness.
 *
 * NOTE(review): the malloc results are used without a NULL check here;
 * confirm whether the harness handles allocation failure.
 */
BENCH_MAIN_ARRAY3_D(
    T002_Ops_044, OP74, 4096, 16384, 65536, int H = 32; int W = 32;
    /* case 1 -> 8 in / 16 out, case 2 -> 16 in / 32 out, any other value
     * -> 32 in / 64 out. */
    int Cin = (case_id == 1 ? 8 : (case_id == 2 ? 16 : 32));
    int Cout = (case_id == 1 ? 16 : (case_id == 2 ? 32 : 64)); int HW = H * W;
    /* Buffer sizes match what kernel_run indexes: Cin*HW inputs,
     * Cout*Cin*9 filter taps, Cout*HW outputs. */
    double *x = malloc((size_t)(Cin * HW) * sizeof(double));
    double *w = malloc((size_t)(Cout * Cin * 9) * sizeof(double));
    double *y = malloc((size_t)(Cout * HW) * sizeof(double)),
    /* x and w are filled with different seed values (the second is XORed
     * with 0xAA); y is not filled here. */
    bench_fill_array(x, (size_t)(Cin * HW), bench_seed(74));
    bench_fill_array(w, (size_t)(Cout * Cin * 9), bench_seed(74) ^ 0xAAu);
    , kernel_run(Cin, Cout, H, W, x, w, y), y, (size_t)(Cout * HW), free(x);
    free(w); free(y))
