#include "bench_harness.h"
#include "bench_utils.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Stable LSD radix sort of `count` (key, value) pairs by their 32-bit key,
 * one byte per pass (four passes).
 *
 * keys/vals hold the input; keys_tmp/vals_tmp are scratch buffers of the same
 * length. Returns the buffer that holds the sorted values once all passes are
 * done (the buffers are ping-ponged between passes).
 */
static const int32_t *lsumlek_radix_sort(int count, uint32_t *keys,
                                         uint32_t *keys_tmp, int32_t *vals,
                                         int32_t *vals_tmp) {
  uint32_t *kin = keys;
  uint32_t *kout = keys_tmp;
  int32_t *vin = vals;
  int32_t *vout = vals_tmp;

  for (unsigned int shift = 0; shift < 32U; shift += 8U) {
    unsigned int offset[256] = {0};

    /* Histogram of the current byte. */
    for (int i = 0; i < count; i++) {
      offset[(kin[i] >> shift) & 0xFFu]++;
    }

    /* Exclusive prefix sum turns bucket sizes into start offsets. */
    unsigned int running = 0;
    for (int b = 0; b < 256; b++) {
      unsigned int bucket_size = offset[b];
      offset[b] = running;
      running += bucket_size;
    }

    /* Scatter in input order so equal bytes keep their relative order. */
    for (int i = 0; i < count; i++) {
      unsigned int pos = offset[(kin[i] >> shift) & 0xFFu]++;
      kout[pos] = kin[i];
      vout[pos] = vin[i];
    }

    uint32_t *swap_k = kin;
    kin = kout;
    kout = swap_k;
    int32_t *swap_v = vin;
    vin = vout;
    vout = swap_v;
  }
  return vin;
}

/* Index of the first element of sorted vals[0..m) that is >= target; m if
 * every element is smaller. */
static int lsumlek_lower_bound(const int32_t *vals, int m, int32_t target) {
  int lo = 0;
  int hi = m;
  while (lo < hi) {
    int mid = lo + (hi - lo) / 2; /* cannot overflow, unlike (lo + hi) / 2 */
    if (vals[mid] < target) {
      lo = mid + 1;
    } else {
      hi = mid;
    }
  }
  return lo;
}

/* Min-Fenwick point update: lower every node covering `pos` to `idx` if `idx`
 * is smaller than what the node currently stores. */
static void lsumlek_fenw_relax(int *fenw, int m, int pos, int idx) {
  while (pos < m) {
    if (idx < fenw[pos]) {
      fenw[pos] = idx;
    }
    pos |= pos + 1;
  }
}

/* Min-Fenwick prefix query: smallest value stored at positions 0..pos, or
 * `none` when nothing smaller than `none` has been recorded there. */
static int lsumlek_fenw_prefix_min(const int *fenw, int pos, int none) {
  int res = none;
  while (pos >= 0) {
    if (fenw[pos] < res) {
      res = fenw[pos];
    }
    pos = (pos & (pos + 1)) - 1;
  }
  return res;
}

/*
 * Length of the longest contiguous subarray of a[0..n) whose sum is <= 1000.
 *
 * n        number of elements in `a`; n <= 0 yields 0.
 * a        input values (not read when n <= 0).
 * ans_out  receives the length as a double; 0.0 when no subarray qualifies
 *          or when a scratch allocation fails.
 *
 * Approach: with prefix sums P, a window (i, j] qualifies iff
 * P[i] >= P[j] - K. The distinct prefix values are ranked (radix sort +
 * dedup), ranks are reversed so "value >= threshold" becomes a Fenwick prefix
 * range, and a min-Fenwick tree over those reversed ranks stores the smallest
 * index seen for each value. For each j the earliest qualifying i is one
 * prefix-min query, giving O(n log n) overall.
 *
 * Prefix sums are kept in `int`; the caller must supply data whose prefix
 * sums (and prefix sum minus 1000) fit in a 32-bit int, and n < INT_MAX.
 */
void kernel_run(int n, const int *a, double *ans_out) {
  const int K = 1000;
  int best = 0;

  /* Guard first: n == -1 would otherwise make every buffer zero-sized and the
   * unconditional pref[0] store would write out of bounds. */
  if (n <= 0) {
    *ans_out = 0.0;
    return;
  }

  const int n1 = n + 1;
  const size_t count = (size_t)n1;

  int *pref = (int *)malloc(count * sizeof *pref);
  int *rid = (int *)malloc(count * sizeof *rid);
  int32_t *vA = (int32_t *)malloc(count * sizeof *vA);
  int32_t *vB = (int32_t *)malloc(count * sizeof *vB);
  uint32_t *kA = (uint32_t *)malloc(count * sizeof *kA);
  uint32_t *kB = (uint32_t *)malloc(count * sizeof *kB);
  int32_t *unique_vals = (int32_t *)malloc(count * sizeof *unique_vals);
  /* The tree needs one slot per distinct prefix value, which is at most n1;
   * sizing it up front keeps a single allocation check and cleanup path. */
  int *fenw = (int *)malloc(count * sizeof *fenw);

  if (pref && rid && vA && vB && kA && kB && unique_vals && fenw) {
    /* pref[i] = a[0] + ... + a[i-1]. */
    pref[0] = 0;
    for (int i = 0; i < n; i++) {
      pref[i + 1] = pref[i] + a[i];
    }

    /* Flipping the sign bit makes unsigned key order match signed order. */
    for (int i = 0; i < n1; i++) {
      vA[i] = (int32_t)pref[i];
      kA[i] = ((uint32_t)pref[i]) ^ 0x80000000u;
    }
    const int32_t *sorted = lsumlek_radix_sort(n1, kA, kB, vA, vB);

    /* Collapse the sorted prefix sums into their distinct values. */
    int m = 0;
    for (int i = 0; i < n1; i++) {
      if (i == 0 || sorted[i] != sorted[i - 1]) {
        unique_vals[m++] = sorted[i];
      }
    }

    /* Reversed rank: the largest prefix value maps to 0, so all values
     * >= some threshold occupy a prefix range 0..r of the tree. */
    for (int i = 0; i < n1; i++) {
      rid[i] = m - 1 - lsumlek_lower_bound(unique_vals, m, (int32_t)pref[i]);
    }

    const int none = n1; /* larger than any valid index 0..n */
    for (int i = 0; i < m; i++) {
      fenw[i] = none;
    }
    lsumlek_fenw_relax(fenw, m, rid[0], 0);

    for (int j = 1; j < n1; j++) {
      int32_t threshold = (int32_t)(pref[j] - K);
      int lo = lsumlek_lower_bound(unique_vals, m, threshold);
      if (lo < m) {
        /* Earliest i < j with pref[i] >= pref[j] - K, if any. */
        int first = lsumlek_fenw_prefix_min(fenw, m - 1 - lo, none);
        if (first != none && j - first > best) {
          best = j - first;
        }
      }
      /* Index j becomes a candidate start only for later end positions. */
      lsumlek_fenw_relax(fenw, m, rid[j], j);
    }
  }

  /* free(NULL) is a no-op, so one cleanup path serves success and failure. */
  free(pref);
  free(rid);
  free(vA);
  free(vB);
  free(kA);
  free(kB);
  free(unique_vals);
  free(fenw);

  *ans_out = (double)best;
}
/*
 * Benchmark driver, generated by the BENCH_MAIN_SCALAR3 macro. The macro is
 * not defined in this file (presumably it comes from bench_harness.h), so the
 * role of each argument below is inferred from its content -- confirm against
 * the macro definition.
 *
 * Arguments as passed here:
 *   - T003_Code_003, LSUMLEK: identifiers for this benchmark.
 *   - 4096, 16384, 65536: three integer constants, presumably the problem
 *     sizes `n` the harness runs.
 *   - a declaration section: heap-allocates the input array `a` (n ints) and
 *     declares the scalar result `ans_scalar`.
 *     NOTE(review): the malloc result is not checked before use.
 *   - an initialisation block: fills `a` from a bench_rng64_t generator
 *     initialised with `seed`; each element is (next % 101) - 50, i.e.
 *     intended to lie in [-50, 50].
 *   - the kernel call, the result expression (`ans_scalar`), and the
 *     teardown that frees `a`.
 * `n` and `seed` are not declared in this file; they are presumably provided
 * by the macro expansion.
 */
BENCH_MAIN_SCALAR3(
    T003_Code_003, LSUMLEK, 4096, 16384, 65536,
    int *a = (int *)malloc((size_t)n * sizeof(int));
    double ans_scalar = 0.0;
    ,
    {
      bench_rng64_t rng = bench_rng_init(seed);
      for (int i = 0; i < n; i++) {
        a[i] = (int)((bench_rng_next(&rng) % 101ULL) - 50LL);
      }
    },
    kernel_run(n, a, &ans_scalar), ans_scalar, free(a);)
