#include "bench_harness.h"
#include "bench_utils.h"
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Rotate the 32-bit value x left by k bit positions.
 *
 * The rotation count is reduced modulo 32, so k == 0 (and any multiple of
 * 32) returns x unchanged. Both shift counts are masked into 0..31 because
 * shifting a 32-bit value by 32 or more is undefined behaviour in C; the
 * unmasked form `x >> (32 - k)` hits that case when k == 0.
 */
static inline uint32_t rotl32(uint32_t x, int k) {
  const unsigned int left = (unsigned int)k & 31u;
  const unsigned int right = (32u - left) & 31u;
  return (x << left) | (x >> right);
}
/*
 * Assemble a 32-bit value from the four bytes at p, treating p[0] as the
 * least significant byte (little-endian). Reads exactly p[0]..p[3].
 */
static inline uint32_t load32_le(const uint8_t *p) {
  uint32_t word = 0;
  /* Walk from the most significant byte down so each step shifts in 8 bits. */
  for (int idx = 3; idx >= 0; idx--) {
    word = (word << 8) | (uint32_t)p[idx];
  }
  return word;
}
/*
 * Write the 32-bit value v to the four bytes at p in little-endian order:
 * p[0] receives the least significant byte, p[3] the most significant.
 */
static inline void store32_le(uint8_t *p, uint32_t v) {
  for (int idx = 0; idx < 4; idx++) {
    /* The uint8_t conversion keeps only the low 8 bits of the shifted value. */
    p[idx] = (uint8_t)(v >> (8 * idx));
  }
}
/*
 * Assemble a 64-bit value from the eight bytes at p, treating p[0] as the
 * least significant byte (little-endian). Reads exactly p[0]..p[7].
 */
static inline uint64_t load64_le(const uint8_t *p) {
  uint64_t word = 0;
  /* Walk from the most significant byte down so each step shifts in 8 bits. */
  for (int idx = 7; idx >= 0; idx--) {
    word = (word << 8) | (uint64_t)p[idx];
  }
  return word;
}
/*
 * Apply one add-xor-rotate mixing step to the four 32-bit words pointed to
 * by a, b, c and d, updating them in place.
 *
 * The step is four add/xor/rotate stages with left-rotation amounts of
 * 16, 12, 8 and 7 bits, alternating between the (a, b, d) and (c, d, b)
 * word triples. Additions wrap modulo 2^32.
 */
static inline void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c,
                                 uint32_t *d) {
  *a += *b;
  *d = rotl32(*d ^ *a, 16);

  *c += *d;
  *b = rotl32(*b ^ *c, 12);

  *a += *b;
  *d = rotl32(*d ^ *a, 8);

  *c += *d;
  *b = rotl32(*b ^ *c, 7);
}
/*
 * Produce one 16-word (64-byte) block from a 32-byte key, a 12-byte nonce
 * and a 32-bit block counter.
 *
 * The initial state holds four fixed constants in words 0..3, the key as
 * eight little-endian words in 4..11, the counter in word 12, and the nonce
 * as three little-endian words in 13..15. This matches the ChaCha20 state
 * layout described in RFC 8439.
 *
 * A working copy is mixed with 10 iterations of eight quarter rounds each
 * (four on the columns, then four on the diagonals). The initial state is
 * then added back word by word, modulo 2^32.
 *
 * key      32 bytes of key material.
 * nonce    12 bytes of nonce.
 * counter  block counter placed in state word 12.
 * out      receives the 16 output words; the caller serialises them.
 */
static void chacha_block(const uint8_t key[32], const uint8_t nonce[12],
                         uint32_t counter, uint32_t out[16]) {
  /* Little-endian byte view of these words spells "expand 32-byte k". */
  static const uint32_t sigma[4] = {0x61707865u, 0x3320646eu, 0x79622d32u,
                                    0x6b206574u};
  uint32_t init[16];
  uint32_t work[16];

  for (int i = 0; i < 4; i++) {
    init[i] = sigma[i];
  }
  for (int i = 0; i < 8; i++) {
    init[4 + i] = load32_le(key + 4 * i);
  }
  init[12] = counter;
  for (int i = 0; i < 3; i++) {
    init[13 + i] = load32_le(nonce + 4 * i);
  }

  /* Mix a copy so the untouched initial state can be added back afterwards. */
  memcpy(work, init, sizeof work);

  for (int remaining = 10; remaining > 0; remaining--) {
    /* Column round. */
    quarter_round(&work[0], &work[4], &work[8], &work[12]);
    quarter_round(&work[1], &work[5], &work[9], &work[13]);
    quarter_round(&work[2], &work[6], &work[10], &work[14]);
    quarter_round(&work[3], &work[7], &work[11], &work[15]);
    /* Diagonal round. */
    quarter_round(&work[0], &work[5], &work[10], &work[15]);
    quarter_round(&work[1], &work[6], &work[11], &work[12]);
    quarter_round(&work[2], &work[7], &work[8], &work[13]);
    quarter_round(&work[3], &work[4], &work[9], &work[14]);
  }

  for (int i = 0; i < 16; i++) {
    out[i] = work[i] + init[i];
  }
}
/*
 * XOR len bytes of src with a keystream and write the result to dst.
 *
 * The keystream is produced 64 bytes at a time by chacha_block, with the
 * block counter starting at 1 and incrementing once per block. Each block's
 * 16 words are serialised little-endian before use. The final block may be
 * used only partially. When len is 0 nothing is generated or written.
 *
 * The counter is a uint32_t, so it wraps after 2^32 blocks.
 *
 * dst    output buffer, at least len bytes.
 * src    input buffer, at least len bytes.
 * len    number of bytes to process.
 * key    32-byte key forwarded to chacha_block.
 * nonce  12-byte nonce forwarded to chacha_block.
 */
static void chacha_xor(uint8_t *dst, const uint8_t *src, size_t len,
                       const uint8_t key[32], const uint8_t nonce[12]) {
  uint32_t words[16];
  uint8_t stream[64];
  uint32_t counter = 1;

  for (size_t pos = 0; pos < len; counter++) {
    chacha_block(key, nonce, counter, words);
    for (int w = 0; w < 16; w++) {
      store32_le(&stream[4 * w], words[w]);
    }

    /* Consume a full 64-byte block, or whatever remains at the tail. */
    const size_t left = len - pos;
    const size_t take = (left > 64) ? (size_t)64 : left;

    for (size_t k = 0; k < take; k++) {
      dst[pos + k] = src[pos + k] ^ stream[k];
    }
    pos += take;
  }
}
/* The modulus 2^61 - 1 used by the polynomial MAC arithmetic. */
#define MOD61 ((uint64_t)((1ULL << 61) - 1ULL))
/*
 * Multiply a and b modulo 2^61 - 1.
 *
 * The 128-bit product is split into its low 61 bits and the remaining high
 * bits. Because 2^61 is congruent to 1 modulo 2^61 - 1, the two parts can
 * simply be added, followed by a single conditional subtraction.
 *
 * The single subtraction is enough when both inputs are below 2^61; callers
 * are expected to pass operands in that range.
 */
static inline uint64_t mul_mod61(uint64_t a, uint64_t b) {
  const unsigned __int128 product = (unsigned __int128)a * (unsigned __int128)b;
  const uint64_t low_part = (uint64_t)(product & (unsigned __int128)MOD61);
  const uint64_t high_part = (uint64_t)(product >> 61);
  const uint64_t folded = low_part + high_part;
  return (folded >= MOD61) ? (folded - MOD61) : folded;
}
/*
 * Compute a polynomial MAC modulo 2^61 - 1 over the associated data followed
 * by the ciphertext.
 *
 * Key material comes from first_block: the multiplier r is the little-endian
 * 64-bit value at bytes 0..7 masked to its low 60 bits, and the final addend
 * s is the little-endian 64-bit value at bytes 8..15 reduced modulo 2^61 - 1.
 *
 * Each input byte is added to the accumulator, which is then multiplied by
 * r (all modulo 2^61 - 1). After both inputs are absorbed, the low 32 bits
 * of alen and of clen are added, then s.
 *
 * aad / alen       associated data and its length in bytes.
 * ct / clen        ciphertext and its length in bytes.
 * first_block      64-byte block; only bytes 0..15 are read here.
 *
 * Returns the tag, a value in [0, 2^61 - 1).
 */
static uint64_t poly_mac(const uint8_t *aad, size_t alen, const uint8_t *ct,
                         size_t clen, const uint8_t first_block[64]) {
  const uint64_t r = load64_le(first_block + 0) & ((1ULL << 60) - 1ULL);
  const uint64_t s = load64_le(first_block + 8) % MOD61;

  /* Absorb the associated data first, then the ciphertext, byte by byte. */
  const uint8_t *const parts[2] = {aad, ct};
  const size_t part_len[2] = {alen, clen};

  uint64_t acc = 0;
  for (int which = 0; which < 2; which++) {
    const uint8_t *bytes = parts[which];
    for (size_t k = 0; k < part_len[which]; k++) {
      acc += bytes[k];
      if (acc >= MOD61) {
        acc -= MOD61;
      }
      acc = mul_mod61(acc, r);
    }
  }

  /* Mix in both lengths, truncated to their low 32 bits. */
  acc += (uint64_t)(alen & 0xffffffffu);
  if (acc >= MOD61) {
    acc -= MOD61;
  }
  acc = (acc + (uint64_t)(clen & 0xffffffffu)) % MOD61;

  return (acc + s) % MOD61;
}
/*
 * Run one encrypt-then-MAC pass and return the tag as a double.
 *
 * Steps:
 *   1. Generate the block for counter 0 and serialise it little-endian; it
 *      serves as the MAC key material.
 *   2. XOR n bytes of plain with the keystream into cipher (chacha_xor uses
 *      counters starting at 1).
 *   3. Compute poly_mac over aad and the freshly written ciphertext.
 *
 * n is converted to size_t without a sign check, so callers must pass a
 * non-negative value. plain and cipher must each hold at least n bytes.
 *
 * The tag is below 2^61, so the conversion to double may round values that
 * need more than 53 bits.
 */
static double pipeline_run(int n, const uint8_t *key, const uint8_t *nonce,
                           const uint8_t *aad, size_t aad_len,
                           const uint8_t *plain, uint8_t *cipher) {
  const size_t msg_len = (size_t)n;
  uint32_t key_words[16];
  uint8_t first_block[64];

  chacha_block(key, nonce, 0, key_words);
  for (int w = 0; w < 16; w++) {
    store32_le(&first_block[4 * w], key_words[w]);
  }

  chacha_xor(cipher, plain, msg_len, key, nonce);

  return (double)poly_mac(aad, aad_len, cipher, msg_len, first_block);
}
/*
 * Benchmark entry point, generated by the BENCH_MAIN_SCALAR3 harness macro.
 *
 * NOTE(review): the macro is defined outside this file (presumably in
 * bench_harness.h), so the meaning of each argument is inferred from its
 * position and should be confirmed against the macro definition. The
 * arguments appear to be: an identifier, a tag, three problem sizes, a setup
 * section, an input-initialisation block, the timed expression, the result
 * expression, and a cleanup section.
 *
 * `n` and `seed` are not declared in this file; they are presumably supplied
 * by the macro expansion.
 *
 * The malloc results below are used without a NULL check in this invocation.
 */
BENCH_MAIN_SCALAR3(
    T004_Module_031, CHAPOLY, 4096, 16384, 65536,
    /* Setup: the associated data is n / 8 bytes; buffers are heap-allocated. */
    size_t aad_len = (size_t)(n / 8);
    uint8_t *key = (uint8_t *)malloc(32);
    uint8_t *nonce = (uint8_t *)malloc(12);
    /* Request at least 1 byte so a zero aad_len never becomes malloc(0). */
    uint8_t *aad = (uint8_t *)malloc(aad_len ? aad_len : 1);
    uint8_t *plain = (uint8_t *)malloc((size_t)n);
    uint8_t *cipher = (uint8_t *)malloc((size_t)n); double ans_scalar = 0.0;
    ,
    {
      /* Fill key, nonce, aad and plaintext, in that order, from one RNG
         stream; each byte is the low 8 bits of one bench_rng_next value. */
      bench_rng64_t rng = bench_rng_init(seed);
      for (int i = 0; i < 32; i++)
        key[i] = (uint8_t)(bench_rng_next(&rng) & 0xFF);
      for (int i = 0; i < 12; i++)
        nonce[i] = (uint8_t)(bench_rng_next(&rng) & 0xFF);
      for (size_t i = 0; i < aad_len; i++)
        aad[i] = (uint8_t)(bench_rng_next(&rng) & 0xFF);
      for (int i = 0; i < n; i++)
        plain[i] = (uint8_t)(bench_rng_next(&rng) & 0xFF);
    },
    /* Kernel under test: encrypt plain into cipher, return the tag as double. */
    ans_scalar = pipeline_run(n, key, nonce, aad, aad_len, plain, cipher),
    /* Result expression, followed by release of every buffer allocated above. */
    ans_scalar, free(key);
    free(nonce); free(aad); free(plain); free(cipher);)
