#include "bench_harness.h"
#include "bench_utils.h"
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Read four bytes starting at p and assemble them into a 32-bit word,
 * with p[0] as the least significant byte (little-endian order).
 */
static inline uint32_t load32_le(const uint8_t *p) {
  uint32_t word = 0;
  /* Fold from the most significant byte down so p[0] ends up lowest. */
  for (int k = 3; k >= 0; k--)
    word = (word << 8) | (uint32_t)p[k];
  return word;
}
/*
 * Eight-word initial state used by compress_chunk (both as the starting
 * state and as the final XOR mask).
 * The values are the same eight words as the SHA-256 initial hash value
 * H(0) from FIPS 180-4. NOTE(review): this file does not say why they were
 * chosen here.
 */
static const uint32_t IV8[8] = {
    0x6a09e667u,
    0xbb67ae85u,
    0x3c6ef372u,
    0xa54ff53au,
    0x510e527fu,
    0x9b05688cu,
    0x1f83d9abu,
    0x5be0cd19u,
};
/*
 * Compress one chunk of up to 32 bytes into an 8-word digest.
 *
 * chunk: input bytes; only the first min(len, 32) bytes are read, and the
 *        pointer is never dereferenced when len is 0.
 * len:   number of valid bytes at chunk. Missing bytes (len < 32) count as
 *        zero; bytes beyond the first 32 are ignored.
 * out:   receives the 8 result words.
 */
static void compress_chunk(const uint8_t *chunk, size_t len, uint32_t out[8]) {
  uint32_t state[8];
  uint32_t words[8] = {0};

  /* Start every chunk from the same initial state. */
  memcpy(state, IV8, sizeof state);

  /* Pack the available bytes little-endian into the message words; any
   * word (or part of a word) past the end of the input stays zero. */
  const size_t used = len < sizeof words ? len : sizeof words;
  for (size_t k = 0; k < used; k++)
    words[k / 4] |= (uint32_t)chunk[k] << (8 * (k % 4));

  /* Six mixing rounds. The state is updated in place, so within a round
   * lanes 6 and 7 read neighbours that were already updated this round. */
  for (int round = 0; round < 6; round++) {
    for (int lane = 0; lane < 8; lane++) {
      const uint32_t neighbour = state[(lane + 1) & 7];
      uint32_t acc = state[lane] + words[lane] + (neighbour ^ 0x9e3779b9u);
      /* Rotation count is 3..7, so 32 - rot never reaches the word width. */
      const uint32_t rot = (uint32_t)(lane % 5) + 3u;
      acc = (acc << rot) | (acc >> (32 - rot));
      state[lane] = acc ^ (state[(lane + 2) & 7] + 0x7f4a7c15u);
    }
  }

  /* Fold the initial state back into the result. */
  for (int lane = 0; lane < 8; lane++)
    out[lane] = state[lane] ^ IV8[lane];
}
/*
 * Combine two 8-word child digests into one 8-word parent digest.
 *
 * left, right: the child digests (read-only, 8 words each).
 * out:         receives the 8 result words. It is written only after both
 *              inputs have been fully consumed into a local copy.
 */
static void compress_parent(const uint32_t left[8], const uint32_t right[8],
                            uint32_t out[8]) {
  uint32_t mix[8];
  int lane;

  /* Seed: pair each left word with the right word three lanes ahead. */
  for (lane = 0; lane < 8; lane++)
    mix[lane] = left[lane] ^ right[(lane + 3) & 7] ^ 0xA5A5A5A5u;

  /* Four in-place mixing rounds; lanes 6 and 7 read neighbours already
   * updated in the same round. */
  int rounds_left = 4;
  while (rounds_left-- > 0) {
    for (lane = 0; lane < 8; lane++) {
      uint32_t sum = mix[lane] + (mix[(lane + 1) & 7] ^ 0x3c6ef372u);
      /* Right-rotate by 2..6 bits; the count never reaches 0 or 32. */
      const uint32_t rot = (uint32_t)(lane % 5) + 2u;
      sum = (sum >> rot) | (sum << (32 - rot));
      mix[lane] = sum ^ (0x9e3779b9u + mix[(lane + 2) & 7]);
    }
  }

  memcpy(out, mix, sizeof mix);
}
/*
 * Hash an n-byte message with a binary tree of compressions and return the
 * top 64 bits of the root digest converted to double.
 *
 * The message is split into 32-byte chunks (the last one may be shorter),
 * each chunk is reduced to an 8-word digest by compress_chunk, and adjacent
 * digests are then merged pairwise by compress_parent, level by level, until
 * one digest remains. A digest without a partner at some level is carried to
 * the next level unchanged. An empty message is hashed as a single
 * zero-length chunk.
 *
 * n:   message length in bytes. Values <= 0 are treated as an empty message,
 *      so a negative length can never cause a read from msg.
 * msg: message bytes; not dereferenced when the message is empty.
 *
 * Returns (double)((root[0] << 32) | root[1]). The conversion to double may
 * round, since a double cannot represent every 64-bit integer exactly.
 *
 * Aborts the process with a diagnostic on stderr if the digest buffer cannot
 * be allocated (there is no way to report failure through the return value).
 */
static double pipeline_run(int n, const uint8_t *msg) {
  enum { HASH_WORDS = 8 };
  const size_t chunk_bytes = 32;

  /* Do the size arithmetic in size_t: this avoids signed overflow of
   * n + 31 near INT_MAX and avoids a negative n wrapping to a huge size. */
  const size_t len = n > 0 ? (size_t)n : 0;
  size_t nchunks = (len + chunk_bytes - 1) / chunk_bytes;
  if (nchunks == 0)
    nchunks = 1;

  uint32_t *hashes =
      (uint32_t *)malloc(nchunks * HASH_WORDS * sizeof *hashes);
  if (hashes == NULL) {
    fprintf(stderr, "pipeline_run: out of memory (%zu chunks)\n", nchunks);
    abort();
  }

  /* Leaf level: one digest per chunk. */
  for (size_t ci = 0; ci < nchunks; ci++) {
    const size_t off = ci * chunk_bytes;
    size_t clen = 0;
    if (off < len) {
      clen = len - off;
      if (clen > chunk_bytes)
        clen = chunk_bytes;
    }
    /* Only form msg + off when there are bytes to read, so an empty
     * message with a null msg never does pointer arithmetic on NULL. */
    compress_chunk(clen != 0 ? msg + off : msg, clen,
                   hashes + ci * HASH_WORDS);
  }

  /* Reduce the tree in place. Parent pi is built from children 2*pi and
   * 2*pi + 1 and stored in slot pi; slots below 2*pi have already been
   * consumed, so nothing still needed is overwritten and no per-level
   * allocation is required. */
  for (size_t cur = nchunks; cur > 1; cur = (cur + 1) / 2) {
    const size_t parents = (cur + 1) / 2;
    for (size_t pi = 0; pi < parents; pi++) {
      const uint32_t *left = hashes + 2 * pi * HASH_WORDS;
      uint32_t *dst = hashes + pi * HASH_WORDS;
      if (2 * pi + 1 < cur) {
        /* Go through a temporary: for pi == 0 the destination is the
         * left child itself. */
        uint32_t parent[HASH_WORDS];
        compress_parent(left, left + HASH_WORDS, parent);
        memcpy(dst, parent, sizeof parent);
      } else {
        /* Unpaired last digest is promoted unchanged. This happens only
         * for odd cur >= 3, where pi >= 1 and slot 2*pi is distinct from
         * slot pi, so the regions do not overlap. */
        memcpy(dst, left, HASH_WORDS * sizeof *dst);
      }
    }
  }

  const uint64_t final64 = ((uint64_t)hashes[0] << 32) | (uint64_t)hashes[1];
  free(hashes);
  return (double)final64;
}
/*
 * Benchmark driver for pipeline_run, produced by the BENCH_MAIN_SCALAR3
 * harness macro. The macro definition is not visible in this file, so the
 * role of each argument below is inferred from the fragment itself --
 * TODO confirm against bench_harness.h.
 *
 *  - T004_Module_032, B3TREE: identifiers handed to the harness.
 *  - 4096, 16384, 65536: three numeric arguments, presumably the message
 *    sizes n that the harness runs.
 *  - declaration fragment: allocates an n-byte buffer msg and declares the
 *    result variable ans_scalar.
 *  - braced block: fills msg with the low byte of successive
 *    bench_rng_next() outputs from a generator initialised with seed.
 *  - run expression: stores pipeline_run(n, msg) in ans_scalar.
 *  - ans_scalar: presumably the value the harness reports or checks.
 *  - final fragment: frees msg.
 *
 * n and seed are not declared here; presumably the macro provides them.
 * NOTE(review): the malloc result for msg is not checked before the fill
 * loop writes through it.
 */
BENCH_MAIN_SCALAR3(
    T004_Module_032, B3TREE, 4096, 16384, 65536,
    uint8_t *msg = (uint8_t *)malloc((size_t)n);
    double ans_scalar = 0.0;
    ,
    {
      bench_rng64_t rng = bench_rng_init(seed);
      for (int i = 0; i < n; i++)
        msg[i] = (uint8_t)(bench_rng_next(&rng) & 0xFF);
    },
    ans_scalar = pipeline_run(n, msg), ans_scalar, free(msg);)
