#include "bench_harness.h"
#include "bench_utils.h"
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Modulus N = 2^64 - 2^32 + 97 shared by the Montgomery routines. */
#define MOD64 0xffffffff00000061ULL
/* -N^-1 mod 2^64: MOD64 * INV64 == 2^64 - 1 in wrapping 64-bit arithmetic. */
#define INV64 16664610971041225823ULL
/* R^2 mod N for R = 2^64 (numerically (2^32 - 97)^2); multiplying by it and
 * reducing moves a value into Montgomery form. */
#define RR64 0xffffff3e000024c1ULL

/*
 * Montgomery reduction (REDC) with R = 2^64.
 *
 * Returns a value congruent to t * 2^-64 modulo MOD64. When t < MOD64 * 2^64
 * (always true for a product of two residues below MOD64) the result is
 * fully reduced, i.e. lies in [0, MOD64).
 *
 * t: 128-bit value to reduce.
 */
static inline uint64_t montgomery_reduce(unsigned __int128 t) {
  /* m is chosen so that the low 64 bits of t + m * MOD64 are all zero. */
  uint64_t m = (uint64_t)t * (uint64_t)INV64;
  unsigned __int128 u = t + (unsigned __int128)m * (unsigned __int128)MOD64;
  /*
   * MOD64 is close to 2^64, so t + m * MOD64 can reach almost 2^129 and wrap
   * the 128-bit accumulator. A wrapped sum is smaller than t; remember that
   * lost bit, which is worth 2^64 after the shift below.
   */
  int carry = u < t;
  uint64_t res = (uint64_t)(u >> 64);
  /*
   * The true quotient is below 2 * MOD64, so one subtraction suffices. With
   * the carry set, the true value is res + 2^64 and the wrapping 64-bit
   * subtraction yields exactly res + 2^64 - MOD64.
   */
  if (carry || res >= MOD64) {
    res -= MOD64;
  }
  return res;
}
/*
 * Multiplies a and b as a full 128-bit product and hands that product to
 * montgomery_reduce. With both operands in Montgomery form the intended
 * result is their product, again in Montgomery form.
 */
static inline uint64_t montgomery_mul(uint64_t a, uint64_t b) {
  unsigned __int128 wide = (unsigned __int128)a;
  wide *= b; /* b is promoted to 128 bits, so the product keeps all bits */
  return montgomery_reduce(wide);
}
/*
 * Converts x towards Montgomery form by reducing the 128-bit product
 * x * RR64 with montgomery_reduce. The caller in this file passes x already
 * reduced below MOD64.
 */
static inline uint64_t montgomery_from_uint64(uint64_t x) {
  const unsigned __int128 scaled =
      (unsigned __int128)RR64 * (unsigned __int128)x;
  return montgomery_reduce(scaled);
}
/*
 * Leaves Montgomery form: x is zero-extended to 128 bits and passed through
 * montgomery_reduce once, which strips one factor of 2^64 modulo MOD64.
 */
static inline uint64_t montgomery_to_uint64(uint64_t x) {
  const unsigned __int128 widened = x;
  return montgomery_reduce(widened);
}
/*
 * Square-and-multiply exponentiation carried out on Montgomery-form values.
 *
 * base:    value to raise; it is reduced modulo MOD64 before conversion.
 * exp:     exponent, consumed from the least significant bit upwards.
 * oneMont: starting value of the running product; the caller in this file
 *          passes montgomery_from_uint64(1).
 *
 * Returns the running product after montgomery_to_uint64, i.e. converted
 * back out of Montgomery form.
 */
static uint64_t modexp_montgomery(uint64_t base, uint64_t exp,
                                  uint64_t oneMont) {
  uint64_t product = oneMont;
  uint64_t square = montgomery_from_uint64(base % MOD64);

  for (uint64_t bits = exp; bits != 0; bits >>= 1ULL) {
    /* Fold in the current power whenever its exponent bit is set. */
    if ((bits & 1ULL) != 0) {
      product = montgomery_mul(product, square);
    }
    /* Advance to the next power: base^(2^k) -> base^(2^(k+1)). */
    square = montgomery_mul(square, square);
  }

  return montgomery_to_uint64(product);
}
/*
 * Benchmark kernel: accumulates modexp_montgomery(arr[i], i + 1) over the
 * first n elements, keeping the running sum reduced modulo MOD64.
 *
 * n:   number of elements to process; the loop does not run for n <= 0 and
 *      0.0 is returned.
 * arr: input values, read only; must hold at least n elements.
 *
 * Returns the final sum converted to double. The sum may need more than the
 * 53 bits a double represents exactly, so the conversion can round.
 */
static double pipeline_run(int n, const uint64_t *arr) {
  /* Montgomery form of 1, the starting product for every exponentiation. */
  uint64_t oneMont = montgomery_from_uint64(1ULL);
  uint64_t acc = 0;

  for (int i = 0; i < n; i++) {
    uint64_t base = arr[i] % MOD64;
    uint64_t e = (uint64_t)(i + 1);
    uint64_t v = modexp_montgomery(base, e, oneMont);

    /*
     * Modular addition without 64-bit wrap-around. acc is below MOD64 and v
     * comes out of montgomery_reduce's final subtraction, so it is below
     * MOD64 too; their plain sum can therefore exceed 2^64 and "acc += v"
     * would silently drop that bit before the reduction. Compare against the
     * remaining headroom instead (room >= 1 because acc < MOD64).
     */
    uint64_t room = MOD64 - acc;
    acc = (v >= room) ? (v - room) : (acc + v);
  }

  return (double)acc;
}
/*
 * Benchmark entry point, produced by expanding BENCH_MAIN_SCALAR3. The macro
 * is not defined in this file (presumably it comes from bench_harness.h), so
 * the role of each argument below is inferred from its content -- confirm
 * against the macro definition. `n` and `seed` are not declared here and are
 * expected to be provided by the expansion.
 */
BENCH_MAIN_SCALAR3(
    /* Identifier, label and three numeric arguments (presumably the problem
     * sizes that `n` takes -- TODO confirm). */
    T004_Module_034, MONT64, 4096, 16384, 65536,
    /* Declarations: input buffer of n 64-bit values and the result slot.
     * NOTE(review): the malloc result is not checked here before the fill
     * loop writes through arr; verify whether the macro handles failure. */
    uint64_t *arr = (uint64_t *)malloc((size_t)n * sizeof(uint64_t));
    double ans_scalar = 0.0;
    ,
    /* Input generation: fill arr with n values drawn from a generator
     * initialised from `seed`. */
    {
      bench_rng64_t rng = bench_rng_init(seed);
      for (int i = 0; i < n; i++)
        arr[i] = bench_rng_next(&rng);
    },
    /* Work expression, the value it produced, and the cleanup statement that
     * releases arr. */
    ans_scalar = pipeline_run(n, arr), ans_scalar, free(arr);)
