#include "bench_harness.h"
#include "bench_utils.h"
#include <stdint.h>
#include <stdlib.h>
/* Number of consecutive elements handled as one scan block. */
#define BS 256

/*
 * Exclusive end index of the block that begins at `start`: a full BS-sized
 * block when it fits, otherwise clamped to `n`. When `start` is already past
 * `n` the result is smaller than `start`, so callers' loops do not execute.
 */
static int scan_block_end(int n, int start) {
  return (start + BS > n) ? n : start + BS;
}

/*
 * Two-pass blocked exclusive prefix sum of `in` into `out`.
 *
 * Pass 1 scans each BS-sized block independently: out[k] receives the sum of
 * the elements that precede k inside its own block, and block_sums[blk]
 * receives the block's total.
 * Pass 2 adds to every element of a block the combined total of all earlier
 * blocks, turning the per-block results into a scan over the whole array.
 *
 * n          - number of elements in `in` / `out`
 * in         - input values (read only)
 * out        - receives the exclusive prefix sums, accumulated in 64 bits
 * numBlocks  - number of BS-sized blocks to process
 * block_sums - scratch array with at least numBlocks entries
 *
 * Returns the sum of all block totals, converted to double.
 */
static double run_scan(int n, const uint32_t *in, uint64_t *out, int numBlocks,
                       uint64_t *block_sums) {
  /* Pass 1: local exclusive scan per block, recording each block's total. */
  for (int blk = 0; blk < numBlocks; ++blk) {
    const int first = blk * BS;
    const int last = scan_block_end(n, first);
    uint64_t running = 0;
    for (int k = first; k < last; ++k) {
      out[k] = running;
      running += (uint64_t)in[k];
    }
    block_sums[blk] = running;
  }

  /* Pass 2: shift every block by the total of the blocks before it. */
  uint64_t total = 0;
  for (int blk = 0; blk < numBlocks; ++blk) {
    const int first = blk * BS;
    const int last = scan_block_end(n, first);
    const uint64_t base = total;
    total += block_sums[blk];
    for (int k = first; k < last; ++k) {
      out[k] += base;
    }
  }

  return (double)total;
}
/*
 * Benchmark driver for run_scan, produced by the BENCH_MAIN_SCALAR3 macro
 * (not defined in this file; presumably from bench_harness.h).
 *
 * The comma-separated arguments are code fragments pasted into the macro
 * expansion. `n` and `seed` are not declared here, so they are assumed to be
 * supplied by the expansion -- TODO confirm against the macro definition.
 */
BENCH_MAIN_SCALAR3(
    /* Benchmark identifiers followed by three numeric arguments; these look
       like the problem sizes used for `n` -- verify against the macro. */
    T004_Module_044, SCAN, 4096, 16384, 65536,
    /* Setup fragment: one block per BS elements (rounded up), an n-element
       input buffer, an n-element output buffer, and one total per block.
       NOTE(review): the malloc results are not checked in this fragment. */
    int numBlocks = (n + BS - 1) / BS;
    uint32_t *in = (uint32_t *)malloc((size_t)n * sizeof(uint32_t));
    uint64_t *out = (uint64_t *)malloc((size_t)n * sizeof(uint64_t));
    uint64_t *block_sums = (uint64_t *)malloc((size_t)numBlocks *
                                              sizeof(uint64_t));
    double ans_scalar = 0.0;
    ,
    /* Input-initialisation fragment: fill `in` from the harness RNG seeded
       with `seed`, keeping only the low 16 bits of each generated value. */
    {
      bench_rng64_t rng = bench_rng_init(seed);
      for (int i = 0; i < n; i++) {
        in[i] = (uint32_t)(bench_rng_next(&rng) & 0xFFFFu);
      }
    },
    /* Kernel-call fragment, then the expression holding its result
       (presumably what the harness reports or checks). */
    ans_scalar = run_scan(n, in, out, numBlocks, block_sums), ans_scalar,
    /* Cleanup fragment: release the three buffers allocated in setup. */
    free(in);
    free(out); free(block_sums);)
