#include "bench_harness.h"
#include "bench_utils.h"
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
/* Compile-time switch read by b64_encode_dispatch: a non-zero value selects
 * b64_encode_scalar, zero selects b64_encode_simdish. */
#define USE_SCALAR 1
/*
 * Fold `len` bytes of `s` into the running 64-bit hash `h` and return the
 * updated value. Each byte is XORed into the state (as an unsigned value, so
 * bytes >= 0x80 are not sign-extended) and the state is then multiplied by
 * 1099511628211, wrapping modulo 2^64.
 *
 * A `len` of zero or less leaves `h` unchanged.
 */
static inline uint64_t hash_bytes64(uint64_t h, const char *s, int len) {
  const uint64_t multiplier = 1099511628211ULL;
  const unsigned char *cursor = (const unsigned char *)s;

  for (int remaining = len; remaining > 0; remaining--) {
    h = (h ^ (uint64_t)*cursor) * multiplier;
    cursor++;
  }
  return h;
}
/*
 * Base64 alphabet indexed by 6-bit value: 'A'-'Z', 'a'-'z', '0'-'9', '+', '/'.
 * The array holds exactly 64 characters, so there is no room for (and the
 * table does not contain) a terminating NUL; never treat it as a C string.
 */
static const char b64tab[64] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "+/";
/*
 * Base64-encode `inlen` bytes from `in` into `out`, one 3-byte group at a
 * time, using the b64tab alphabet and '=' padding.
 *
 * in     - input bytes.
 * inlen  - number of input bytes; zero or less produces no output.
 * out    - destination; the function writes 4 characters per started 3-byte
 *          group and performs no bounds checking, so the caller must provide
 *          at least that much room. No NUL terminator is written.
 * outlen - receives the number of characters written to `out`.
 */
static void b64_encode_scalar(const uint8_t *in, int inlen, char *out,
                              int *outlen) {
  int rd = 0;
  int wr = 0;

  /* Complete 3-byte groups: 24 bits in, four 6-bit symbols out. */
  for (; rd + 2 < inlen; rd += 3) {
    const uint32_t triple = ((uint32_t)in[rd] << 16) |
                            ((uint32_t)in[rd + 1] << 8) | (uint32_t)in[rd + 2];
    out[wr] = b64tab[(triple >> 18) & 0x3f];
    out[wr + 1] = b64tab[(triple >> 12) & 0x3f];
    out[wr + 2] = b64tab[(triple >> 6) & 0x3f];
    out[wr + 3] = b64tab[triple & 0x3f];
    wr += 4;
  }

  /* Trailing 1 or 2 bytes: missing bits are zero, missing symbols are '='. */
  switch (inlen - rd) {
  case 1: {
    const uint32_t tail = (uint32_t)in[rd] << 16;
    out[wr++] = b64tab[(tail >> 18) & 0x3f];
    out[wr++] = b64tab[(tail >> 12) & 0x3f];
    out[wr++] = '=';
    out[wr++] = '=';
    break;
  }
  case 2: {
    const uint32_t tail =
        ((uint32_t)in[rd] << 16) | ((uint32_t)in[rd + 1] << 8);
    out[wr++] = b64tab[(tail >> 18) & 0x3f];
    out[wr++] = b64tab[(tail >> 12) & 0x3f];
    out[wr++] = b64tab[(tail >> 6) & 0x3f];
    out[wr++] = '=';
    break;
  }
  default:
    break;
  }

  *outlen = wr;
}
/*
 * Base64-encode `inlen` bytes from `in` into `out`, consuming the input in
 * 12-byte blocks (four 3-byte groups, 16 output characters per block) before
 * falling back to single 3-byte groups and a padded tail. The code is plain
 * scalar C; no SIMD intrinsics are used.
 *
 * in     - input bytes.
 * inlen  - number of input bytes; zero or less produces no output.
 * out    - destination; the function writes 4 characters per started 3-byte
 *          group and performs no bounds checking, so the caller must provide
 *          at least that much room. No NUL terminator is written.
 * outlen - receives the number of characters written to `out`.
 */
static void b64_encode_simdish(const uint8_t *in, int inlen, char *out,
                               int *outlen) {
  int src = 0;
  int dst = 0;

  /* Phase 1: whole 12-byte blocks, four groups per iteration. */
  for (; src + 11 < inlen; src += 12) {
    const uint8_t *grp = in + src;
    for (int g = 0; g < 4; g++, grp += 3) {
      const uint32_t word = ((uint32_t)grp[0] << 16) |
                            ((uint32_t)grp[1] << 8) | (uint32_t)grp[2];
      out[dst] = b64tab[(word >> 18) & 0x3f];
      out[dst + 1] = b64tab[(word >> 12) & 0x3f];
      out[dst + 2] = b64tab[(word >> 6) & 0x3f];
      out[dst + 3] = b64tab[word & 0x3f];
      dst += 4;
    }
  }

  /* Phase 2: remaining complete 3-byte groups (at most three of them). */
  for (; src + 2 < inlen; src += 3) {
    const uint32_t word = ((uint32_t)in[src] << 16) |
                          ((uint32_t)in[src + 1] << 8) | (uint32_t)in[src + 2];
    out[dst] = b64tab[(word >> 18) & 0x3f];
    out[dst + 1] = b64tab[(word >> 12) & 0x3f];
    out[dst + 2] = b64tab[(word >> 6) & 0x3f];
    out[dst + 3] = b64tab[word & 0x3f];
    dst += 4;
  }

  /* Phase 3: 1 or 2 leftover bytes, zero-filled and padded with '='. */
  switch (inlen - src) {
  case 1: {
    const uint32_t tail = (uint32_t)in[src] << 16;
    out[dst++] = b64tab[(tail >> 18) & 0x3f];
    out[dst++] = b64tab[(tail >> 12) & 0x3f];
    out[dst++] = '=';
    out[dst++] = '=';
    break;
  }
  case 2: {
    const uint32_t tail =
        ((uint32_t)in[src] << 16) | ((uint32_t)in[src + 1] << 8);
    out[dst++] = b64tab[(tail >> 18) & 0x3f];
    out[dst++] = b64tab[(tail >> 12) & 0x3f];
    out[dst++] = b64tab[(tail >> 6) & 0x3f];
    out[dst++] = '=';
    break;
  }
  default:
    break;
  }

  *outlen = dst;
}
static void b64_encode_dispatch(const uint8_t *in, int inlen, char *out,
                                int *outlen) {
#if USE_SCALAR
  b64_encode_scalar(in, inlen, out, outlen);
#else
  b64_encode_simdish(in, inlen, out, outlen);
#endif
}
/*
 * Base64-encode `inbuf` into `tmpout` via b64_encode_dispatch, then fold the
 * encoded characters into a 64-bit hash (hash_bytes64, seeded with 0) and
 * return that hash converted to double.
 *
 * inbuf  - input bytes to encode.
 * inlen  - number of input bytes; zero or less encodes nothing.
 * tmpout - scratch buffer receiving the encoded characters.
 * outcap - capacity of `tmpout` in bytes.
 *
 * Returns the hash of the encoded output as a double (the conversion may
 * round values that do not fit in a double's mantissa). If `outcap` is too
 * small to hold the encoded output, nothing is written to `tmpout` and 0.0
 * is returned, which is the same value an empty input yields.
 */
static double pipeline_run(const uint8_t *inbuf, int inlen, char *tmpout,
                           int outcap) {
  /* The encoders emit 4 characters per started 3-byte group and do no bounds
   * checking themselves, so the capacity must be verified here. The size is
   * computed in 64 bits so a large inlen cannot overflow int. */
  const int64_t needed = inlen > 0 ? 4 * (((int64_t)inlen + 2) / 3) : 0;
  if (needed > (int64_t)outcap) {
    return 0.0;
  }

  int outlen = 0;
  b64_encode_dispatch(inbuf, inlen, tmpout, &outlen);

  const uint64_t total = hash_bytes64(0, tmpout, outlen);
  return (double)total;
}
/*
 * Benchmark entry point, generated by the BENCH_MAIN_SCALAR3 macro. The macro
 * is defined outside this file (presumably in bench_harness.h -- confirm
 * there for the exact meaning and ordering of each argument). The arguments
 * as written here are:
 *   - the identifiers T004_Module_025 and B64ENC, followed by the constants
 *     4096, 16384 and 65536 (presumably the problem sizes bound to `n`);
 *   - declarations: an n-byte input buffer `bin`, a 2*n-byte output buffer
 *     `outbuf`, and the result variable `ans_scalar`;
 *   - a block that fills `bin` with the low byte of successive
 *     bench_rng_next() values from a generator initialised with `seed`;
 *   - the statement that runs pipeline_run over the input;
 *   - the result expression `ans_scalar`;
 *   - cleanup that frees both buffers.
 * `n` and `seed` are not declared here; presumably the macro expansion
 * supplies them.
 * NOTE(review): the malloc results are used without a NULL check.
 */
BENCH_MAIN_SCALAR3(
    T004_Module_025, B64ENC, 4096, 16384, 65536,
    uint8_t *bin = (uint8_t *)malloc((size_t)n);
    int outcap = n * 2; char *outbuf = (char *)malloc((size_t)outcap);
    double ans_scalar = 0.0;,
                            {
                              bench_rng64_t rng = bench_rng_init(seed);
                              for (int i = 0; i < n; i++) {
                                bin[i] =
                                    (uint8_t)(bench_rng_next(&rng) & 0xffU);
                              }
                            },
                            ans_scalar = pipeline_run(bin, n, outbuf, outcap);
    , ans_scalar, free(bin); free(outbuf);)
