#include "bench_harness.h"
#include "bench_utils.h"
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Streaming SHA-256 state shared by sha256_init / sha256_update /
 * sha256_final.
 */
typedef struct {
  uint32_t h[8];      /* chaining value; sha256_compress folds each block in */
  uint8_t buf[64];    /* staging area for the block currently being filled */
  size_t buf_len;     /* number of valid bytes currently held in buf */
  uint64_t total_len; /* bytes already compressed (grows by 64 per block);
                         bytes still pending in buf are NOT included */
} sha256_ctx;
/*
 * Rotate the 32-bit value x right by k bit positions.
 *
 * The rotation count is reduced modulo 32, so k == 0 and k >= 32 are well
 * defined. The unmasked form `x << (32 - k)` shifts by the full width of the
 * type when k == 0, which is undefined behaviour in C.
 */
static inline uint32_t rotr32(uint32_t x, unsigned k) {
  k &= 31u;
  /* (32 - k) & 31 maps k == 0 to a left shift of 0 instead of 32. */
  return (x >> k) | (x << ((32u - k) & 31u));
}
/*
 * Read four bytes starting at p and assemble them into a 32-bit word,
 * most-significant byte first (p[0] ends up in bits 31..24).
 */
static inline uint32_t load32_be(const uint8_t *p) {
  uint32_t word = 0;
  for (int idx = 0; idx < 4; idx++) {
    /* Make room for the next byte, then append it at the low end. */
    word = (word << 8) | (uint32_t)p[idx];
  }
  return word;
}
/*
 * Write the 32-bit word v to p[0..3], most-significant byte first
 * (bits 31..24 of v land in p[0]).
 */
static inline void store32_be(uint8_t *p, uint32_t v) {
  /* Peel bytes off the low end and fill the output back to front. */
  for (int idx = 3; idx >= 0; idx--) {
    p[idx] = (uint8_t)(v & 0xffu);
    v >>= 8;
  }
}
/*
 * Per-round additive constants: sha256_compress adds K256[t] in round t of
 * its 64-round loop. The values are the SHA-256 constant words K0..K63 as
 * listed in FIPS 180-4; do not reorder or edit them.
 */
static const uint32_t K256[64] = {
    0x428a2f98u, 0x71374491u, 0xb5c0fbcfu, 0xe9b5dba5u, 0x3956c25bu,
    0x59f111f1u, 0x923f82a4u, 0xab1c5ed5u, 0xd807aa98u, 0x12835b01u,
    0x243185beu, 0x550c7dc3u, 0x72be5d74u, 0x80deb1feu, 0x9bdc06a7u,
    0xc19bf174u, 0xe49b69c1u, 0xefbe4786u, 0x0fc19dc6u, 0x240ca1ccu,
    0x2de92c6fu, 0x4a7484aau, 0x5cb0a9dcu, 0x76f988dau, 0x983e5152u,
    0xa831c66du, 0xb00327c8u, 0xbf597fc7u, 0xc6e00bf3u, 0xd5a79147u,
    0x06ca6351u, 0x14292967u, 0x27b70a85u, 0x2e1b2138u, 0x4d2c6dfcu,
    0x53380d13u, 0x650a7354u, 0x766a0abbu, 0x81c2c92eu, 0x92722c85u,
    0xa2bfe8a1u, 0xa81a664bu, 0xc24b8b70u, 0xc76c51a3u, 0xd192e819u,
    0xd6990624u, 0xf40e3585u, 0x106aa070u, 0x19a4c116u, 0x1e376c08u,
    0x2748774cu, 0x34b0bcb5u, 0x391c0cb3u, 0x4ed8aa4au, 0x5b9cca4fu,
    0x682e6ff3u, 0x748f82eeu, 0x78a5636fu, 0x84c87814u, 0x8cc70208u,
    0x90befffau, 0xa4506cebu, 0xbef9a3f7u, 0xc67178f2u};
/*
 * Fold one 64-byte block into the eight-word chaining value st.
 *
 * st    - chaining value, updated in place.
 * block - 64 input bytes, read as sixteen big-endian 32-bit words.
 *
 * The working variables are kept in vars[0..7] in the order a..h.
 */
static void sha256_compress(uint32_t st[8], const uint8_t block[64]) {
  uint32_t sched[64];
  uint32_t vars[8];
  int r;

  /* Message schedule: 16 words straight from the block, 48 derived. */
  for (r = 0; r < 16; r++) {
    sched[r] = load32_be(&block[r * 4]);
  }
  for (; r < 64; r++) {
    const uint32_t near = sched[r - 2];
    const uint32_t far = sched[r - 15];
    const uint32_t sigma0 = rotr32(far, 7) ^ rotr32(far, 18) ^ (far >> 3);
    const uint32_t sigma1 = rotr32(near, 17) ^ rotr32(near, 19) ^ (near >> 10);
    sched[r] = sched[r - 16] + sigma0 + sched[r - 7] + sigma1;
  }

  for (r = 0; r < 8; r++) {
    vars[r] = st[r];
  }

  for (r = 0; r < 64; r++) {
    const uint32_t va = vars[0];
    const uint32_t ve = vars[4];
    const uint32_t big_sigma1 = rotr32(ve, 6) ^ rotr32(ve, 11) ^ rotr32(ve, 25);
    const uint32_t choose = (ve & vars[5]) ^ ((~ve) & vars[6]);
    const uint32_t t1 = vars[7] + big_sigma1 + choose + K256[r] + sched[r];
    const uint32_t big_sigma0 = rotr32(va, 2) ^ rotr32(va, 13) ^ rotr32(va, 22);
    const uint32_t majority =
        (va & vars[1]) ^ (va & vars[2]) ^ (vars[1] & vars[2]);
    const uint32_t t2 = big_sigma0 + majority;

    /* Shift the register file down by one; d picks up t1 on its way to e. */
    vars[7] = vars[6];
    vars[6] = vars[5];
    vars[5] = ve;
    vars[4] = vars[3] + t1;
    vars[3] = vars[2];
    vars[2] = vars[1];
    vars[1] = va;
    vars[0] = t1 + t2;
  }

  /* Feed-forward: add the round output onto the incoming chaining value. */
  for (r = 0; r < 8; r++) {
    st[r] += vars[r];
  }
}
/*
 * Reset ctx so a new message can be hashed: load the eight initial
 * chaining words and clear both length counters. The contents of
 * ctx->buf are left untouched (buf_len == 0 marks it as empty).
 */
static void sha256_init(sha256_ctx *ctx) {
  static const uint32_t initial_state[8] = {
      0x6a09e667u, 0xbb67ae85u, 0x3c6ef372u, 0xa54ff53au,
      0x510e527fu, 0x9b05688cu, 0x1f83d9abu, 0x5be0cd19u};

  for (size_t word = 0; word < 8; word++) {
    ctx->h[word] = initial_state[word];
  }
  ctx->total_len = 0;
  ctx->buf_len = 0;
}
/*
 * Absorb len bytes from data into the running hash.
 *
 * Input is staged through ctx->buf; each time the buffer reaches 64 bytes
 * it is compressed into ctx->h, total_len advances by 64 and the buffer is
 * emptied. Any tail shorter than a block stays in ctx->buf for a later
 * update/final call. With len == 0 nothing is read or written.
 */
static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len) {
  const uint8_t *cursor = data;
  size_t remaining = len;

  while (remaining > 0) {
    const size_t room = sizeof ctx->buf - ctx->buf_len;
    const size_t chunk = (remaining < room) ? remaining : room;

    memcpy(&ctx->buf[ctx->buf_len], cursor, chunk);
    ctx->buf_len += chunk;
    cursor += chunk;
    remaining -= chunk;

    if (ctx->buf_len == sizeof ctx->buf) {
      /* Buffer is full: consume it and start a fresh block. */
      sha256_compress(ctx->h, ctx->buf);
      ctx->total_len += sizeof ctx->buf;
      ctx->buf_len = 0;
    }
  }
}
/*
 * Finish the hash and write the 32-byte digest to out.
 *
 * Appends the 0x80 marker, zero padding and the 64-bit big-endian message
 * length in bits, compressing one or two final blocks as needed, then
 * serialises the eight state words big-endian. ctx is not reset; call
 * sha256_init before hashing another message with it.
 */
static void sha256_final(sha256_ctx *ctx, uint8_t out[32]) {
  /* Capture the length before the padding bytes disturb buf_len. */
  const uint64_t msg_bits = (ctx->total_len + ctx->buf_len) * 8ULL;
  size_t fill = ctx->buf_len;

  ctx->buf[fill++] = 0x80;

  if (fill > 56) {
    /* No room left for the 8-byte length: flush an extra padding block. */
    for (; fill < 64; fill++) {
      ctx->buf[fill] = 0;
    }
    sha256_compress(ctx->h, ctx->buf);
    fill = 0;
  }

  for (; fill < 56; fill++) {
    ctx->buf[fill] = 0;
  }

  /* Message length in bits, most-significant byte first. */
  for (int shift = 56; shift >= 0; shift -= 8) {
    ctx->buf[fill++] = (uint8_t)((msg_bits >> shift) & 0xFF);
  }
  ctx->buf_len = fill;

  sha256_compress(ctx->h, ctx->buf);

  for (int word = 0; word < 8; word++) {
    store32_be(&out[word * 4], ctx->h[word]);
  }
}
/*
 * Compute HMAC with SHA-256 as the underlying hash.
 *
 * key, keylen - secret key; keys longer than 64 bytes are first replaced
 *               by their SHA-256 digest, shorter ones are zero-padded to
 *               64 bytes.
 * msg, msglen - message to authenticate.
 * out         - receives the 32-byte tag.
 *
 * The tag is H((K ^ 0x5c..) || H((K ^ 0x36..) || msg)), where K is the
 * 64-byte key block described above.
 */
static void hmac_sha256(const uint8_t *key, size_t keylen, const uint8_t *msg,
                        size_t msglen, uint8_t out[32]) {
  enum { BLOCK_BYTES = 64, DIGEST_BYTES = 32 };
  uint8_t key_block[BLOCK_BYTES] = {0};
  uint8_t pad[BLOCK_BYTES];
  uint8_t inner_digest[DIGEST_BYTES];
  sha256_ctx hash;

  if (keylen > BLOCK_BYTES) {
    /* Oversized key: its digest fills the first 32 bytes, rest stays 0. */
    sha256_init(&hash);
    sha256_update(&hash, key, keylen);
    sha256_final(&hash, key_block);
  } else {
    for (size_t i = 0; i < keylen; i++) {
      key_block[i] = key[i];
    }
  }

  /* Inner hash: H((K ^ ipad) || msg). */
  for (size_t i = 0; i < BLOCK_BYTES; i++) {
    pad[i] = (uint8_t)(key_block[i] ^ 0x36);
  }
  sha256_init(&hash);
  sha256_update(&hash, pad, BLOCK_BYTES);
  sha256_update(&hash, msg, msglen);
  sha256_final(&hash, inner_digest);

  /* Outer hash: H((K ^ opad) || inner). */
  for (size_t i = 0; i < BLOCK_BYTES; i++) {
    pad[i] = (uint8_t)(key_block[i] ^ 0x5c);
  }
  sha256_init(&hash);
  sha256_update(&hash, pad, BLOCK_BYTES);
  sha256_update(&hash, inner_digest, DIGEST_BYTES);
  sha256_final(&hash, out);
}
/*
 * Benchmark kernel: HMAC-SHA256 over the first n bytes of msg using a
 * 32-byte key, reduced to a scalar.
 *
 * Returns the first eight tag bytes, read as a big-endian 64-bit integer
 * and converted to double.
 */
static double pipeline_run(int n, const uint8_t *key, const uint8_t *msg) {
  uint8_t digest[32];
  uint64_t folded = 0;

  hmac_sha256(key, 32, msg, (size_t)n, digest);

  for (size_t idx = 0; idx < 8; idx++) {
    folded = (folded << 8) | (uint64_t)digest[idx];
  }
  return (double)folded;
}
/*
 * Benchmark entry point, generated by BENCH_MAIN_SCALAR3 (defined outside
 * this file, presumably in bench_harness.h -- confirm there for exact
 * argument semantics). The arguments supplied here are, in order:
 *   - two identifiers (T004_Module_033, HMAC256);
 *   - three integer constants (4096, 16384, 65536) -- presumably the
 *     values taken by `n`; TODO confirm against the macro definition;
 *   - declarations: a 32-byte key buffer, an n-byte message buffer and the
 *     ans_scalar result variable;
 *   - an initialisation block that fills key and msg with the low byte of
 *     successive bench_rng_next() outputs from a generator seeded with
 *     `seed`;
 *   - the measured expression (pipeline_run) and the value it produces;
 *   - cleanup that frees both buffers.
 * `n` and `seed` are not declared here; they are assumed to be provided by
 * the macro expansion.
 *
 * NOTE(review): neither malloc result is checked before the fill loops
 * write through the pointers. The right failure action depends on the
 * expansion context, which is not visible from this file.
 */
BENCH_MAIN_SCALAR3(
    T004_Module_033, HMAC256, 4096, 16384, 65536,
    uint8_t *key = (uint8_t *)malloc(32);
    uint8_t *msg = (uint8_t *)malloc((size_t)n); double ans_scalar = 0.0;
    ,
    {
      bench_rng64_t rng = bench_rng_init(seed);
      for (int i = 0; i < 32; i++)
        key[i] = (uint8_t)(bench_rng_next(&rng) & 0xFF);
      for (int i = 0; i < n; i++)
        msg[i] = (uint8_t)(bench_rng_next(&rng) & 0xFF);
    },
    ans_scalar = pipeline_run(n, key, msg), ans_scalar, free(key);
    free(msg);)
