#include "bench_harness.h"
#include "bench_utils.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Modulus applied to every subsequence count. */
enum { KR_MOD = 1000000007 };

/* Returns (tree[1] + ... + tree[pos]) mod KR_MOD for a 1-based Fenwick tree. */
static int kr_fenwick_prefix(const int *tree, int pos) {
  int total = 0;
  for (; pos > 0; pos -= pos & -pos) {
    total += tree[pos];
    if (total >= KR_MOD) {
      total -= KR_MOD;
    }
  }
  return total;
}

/* Adds delta (0 <= delta < KR_MOD) at 1-based index pos of a Fenwick tree
 * holding `limit` entries, keeping every node reduced mod KR_MOD. */
static void kr_fenwick_add(int *tree, int limit, int pos, int delta) {
  for (; pos <= limit; pos += pos & -pos) {
    int updated = tree[pos] + delta;
    if (updated >= KR_MOD) {
      updated -= KR_MOD;
    }
    tree[pos] = updated;
  }
}

/*
 * Counts the strictly increasing subsequences of length 5 in a[0..n-1],
 * modulo 1000000007, and stores the count in *ans_out as a double.
 *
 * Steps:
 *   1. Radix-sort the values (four 8-bit LSD passes) and deduplicate them.
 *   2. Replace every a[i] by its 0-based rank among the distinct values.
 *   3. Keep one Fenwick tree per subsequence length; for each element, the
 *      number of subsequences of length `len` ending at it is the prefix sum
 *      of the `len - 1` tree over all strictly smaller ranks.
 *
 * Parameters:
 *   n       - number of elements in a; n < 1 yields 0.0.
 *   a       - input values; negative values are supported.
 *   ans_out - receives the result; must not be NULL.
 *
 * If any allocation fails, *ans_out is set to 0.0.
 */
void kernel_run(int n, const int *a, double *ans_out) {
  enum {
    SUBSEQ_LEN = 5,
    RADIX_BITS = 8,
    RADIX_BUCKETS = 1 << RADIX_BITS,
    RADIX_PASSES = 32 / RADIX_BITS
  };
  /* XOR-ing the sign bit maps signed order onto unsigned order, so sorting
   * and comparing the biased keys is correct for negative inputs as well. */
  const uint32_t sign_bias = UINT32_C(0x80000000);

  *ans_out = 0.0;
  if (n < 1) {
    return;
  }

  const size_t count = (size_t)n;
  if (count > SIZE_MAX / sizeof(uint32_t) || count > SIZE_MAX / sizeof(int)) {
    return;
  }

  uint32_t *sorted = (uint32_t *)malloc(count * sizeof *sorted);
  uint32_t *scratch = (uint32_t *)malloc(count * sizeof *scratch);
  int *rank = (int *)malloc(count * sizeof *rank);
  if (!sorted || !scratch || !rank) {
    free(sorted);
    free(scratch);
    free(rank);
    return;
  }

  for (size_t i = 0; i < count; i++) {
    sorted[i] = (uint32_t)a[i] ^ sign_bias;
  }

  /* LSD radix sort: one stable counting-sort pass per key byte. */
  for (int pass = 0; pass < RADIX_PASSES; pass++) {
    size_t offsets[RADIX_BUCKETS] = {0};
    const unsigned shift = (unsigned)pass * RADIX_BITS;

    for (size_t i = 0; i < count; i++) {
      offsets[(sorted[i] >> shift) & (RADIX_BUCKETS - 1u)]++;
    }

    /* Turn the histogram into exclusive prefix sums (start offsets). */
    size_t running = 0;
    for (int b = 0; b < RADIX_BUCKETS; b++) {
      const size_t bucket_count = offsets[b];
      offsets[b] = running;
      running += bucket_count;
    }

    for (size_t i = 0; i < count; i++) {
      const uint32_t key = sorted[i];
      scratch[offsets[(key >> shift) & (RADIX_BUCKETS - 1u)]++] = key;
    }

    /* The freshly written buffer becomes the input of the next pass. */
    uint32_t *swap = sorted;
    sorted = scratch;
    scratch = swap;
  }

  /* Compact the sorted keys in place so sorted[0..m-1] are distinct. */
  int m = 0;
  for (size_t i = 0; i < count; i++) {
    if (m == 0 || sorted[i] != sorted[m - 1]) {
      sorted[m++] = sorted[i];
    }
  }

  /* Coordinate compression: lower-bound search of each key. */
  for (size_t i = 0; i < count; i++) {
    const uint32_t key = (uint32_t)a[i] ^ sign_bias;
    int lo = 0;
    int hi = m;
    while (lo < hi) {
      const int mid = lo + (hi - lo) / 2;
      if (sorted[mid] < key) {
        lo = mid + 1;
      } else {
        hi = mid;
      }
    }
    rank[i] = lo;
  }

  /* The sort buffers are no longer needed; release them before allocating
   * the Fenwick trees to keep peak memory down. */
  free(sorted);
  free(scratch);

  /* One row per subsequence length 0..SUBSEQ_LEN, each row a 1-based Fenwick
   * tree over ranks with `stride` slots (row 0 is never touched). */
  const size_t stride = (size_t)m + 2;
  int *fenw = (int *)calloc(stride, (size_t)(SUBSEQ_LEN + 1) * sizeof *fenw);
  if (!fenw) {
    free(rank);
    return;
  }

  for (size_t i = 0; i < count; i++) {
    const int r = rank[i];
    for (int len = SUBSEQ_LEN; len >= 2; len--) {
      /* Subsequences of length len-1 ending on a strictly smaller value. */
      const int ways = kr_fenwick_prefix(fenw + (size_t)(len - 1) * stride, r);
      if (ways != 0) {
        kr_fenwick_add(fenw + (size_t)len * stride, m, r + 1, ways);
      }
    }
    /* The element by itself is one subsequence of length 1. */
    kr_fenwick_add(fenw + stride, m, r + 1, 1);
  }

  *ans_out = (double)kr_fenwick_prefix(fenw + (size_t)SUBSEQ_LEN * stride, m);

  free(rank);
  free(fenw);
}
/*
 * Benchmark entry point, generated by BENCH_MAIN_SCALAR3 (defined in a
 * project header not shown here; presumably bench_harness.h).
 *
 * Arguments as written below:
 *   - T003_Code_035, KINC: identifiers passed through to the macro.
 *   - 4096, 16384, 65536: three integer constants; presumably the problem
 *     sizes n to run -- TODO confirm against the macro definition.
 *   - declarations: an int buffer `a` of n elements and the result variable
 *     `ans_scalar`.
 *   - a block that seeds an RNG from `seed` and fills `a` with values in
 *     [0, 1000000).
 *   - the kernel call, the result expression, and the statement freeing `a`.
 *
 * `n` and `seed` are not declared here, so they are presumably provided by
 * the macro expansion.
 * NOTE(review): the malloc result for `a` is not checked in these arguments;
 * whether the macro itself checks it cannot be seen from this file.
 */
BENCH_MAIN_SCALAR3(
    T003_Code_035, KINC, 4096, 16384, 65536,
    int *a = (int *)malloc((size_t)n * sizeof(int));
    double ans_scalar = 0.0;,
                            {
                              bench_rng64_t rng = bench_rng_init(seed);
                              for (int i = 0; i < n; i++) {
                                a[i] = (int)(bench_rng_next(&rng) % 1000000ULL);
                              }
                            },
                            kernel_run(n, a, &ans_scalar), ans_scalar, free(a);)
