#include "bench_harness.h"
#include "bench_utils.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/**
 * Layered DP over prefix sums, solved with divide-and-conquer optimisation.
 *
 * Considers the first N = min(n, 1024) elements of arr and runs 4 DP layers
 * of the recurrence
 *
 *     dp_cur[i] = min over k < i of dp_prev[k] + (prefix[i] - prefix[k])^2
 *
 * with dp[0] pinned to 0 in every layer. The value for index N after the
 * last layer is written to *ans_out. Because dp[0] stays 0, this is the
 * cheapest way to cut the N elements into at most 4 contiguous segments
 * when a segment costs the square of its sum.
 *
 * @param n        Number of readable elements in arr; values above 1024 are
 *                 clamped. n < 1 is treated as empty input.
 * @param arr      Input values (not modified). The divide-and-conquer
 *                 narrowing of the candidate range relies on the optimal
 *                 split point being monotone, which holds for non-negative
 *                 values; sums are assumed small enough that their squares
 *                 fit in long long.
 * @param ans_out  Receives the result as a double. Set to 0.0 on empty
 *                 input, NULL arr, or allocation failure. If ans_out itself
 *                 is NULL the function does nothing.
 */
void kernel_run(int n, const int *arr, double *ans_out) {
  enum { MAX_ELEMS = 1024, NUM_LAYERS = 4, MAX_PENDING = 64 };

  /* A pending sub-problem: fill dp_cur[l..r], knowing the best split point
   * for every index in that range lies within [opt_lo, opt_hi]. */
  struct Range {
    int l, r;
    int opt_lo, opt_hi;
  };

  const long long INFLL = 0x3fffffffffffffffLL;

  /* Depth-first worklist. Every child covers at most half of its parent, so
   * at most one sibling per level is pending: for any int-sized range that
   * is far fewer than MAX_PENDING entries (about 12 for N = 1024). */
  struct Range stack[MAX_PENDING];

  if (!ans_out)
    return;
  *ans_out = 0.0;

  /* Empty input: nothing to partition, and arr[0] must not be read. */
  if (n < 1 || !arr)
    return;

  const int N = (n > MAX_ELEMS) ? MAX_ELEMS : n;
  const size_t cells = (size_t)N + 1;

  long long *prefix = malloc(cells * sizeof *prefix);
  long long *dp_prev = malloc(cells * sizeof *dp_prev);
  long long *dp_cur = malloc(cells * sizeof *dp_cur);
  if (!prefix || !dp_prev || !dp_cur)
    goto cleanup; /* *ans_out is already 0.0 */

  /* prefix[i] = arr[0] + ... + arr[i-1] */
  prefix[0] = 0;
  for (int i = 1; i <= N; i++)
    prefix[i] = prefix[i - 1] + (long long)arr[i - 1];

  /* Layer 0: only the empty prefix is reachable. */
  dp_prev[0] = 0;
  for (int i = 1; i <= N; i++)
    dp_prev[i] = INFLL;

  for (int layer = 0; layer < NUM_LAYERS; layer++) {
    dp_cur[0] = 0;

    int sp = 0;
    stack[sp++] = (struct Range){1, N, 0, N - 1};

    while (sp > 0) {
      const struct Range cur = stack[--sp];
      const int mid = cur.l + (cur.r - cur.l) / 2;

      /* A split point must lie strictly before mid. */
      const int k_end = (cur.opt_hi < mid - 1) ? cur.opt_hi : mid - 1;

      long long best = INFLL;
      int best_k = cur.opt_lo;
      for (int k = cur.opt_lo; k <= k_end; k++) {
        const long long seg = prefix[mid] - prefix[k];
        const long long cand = dp_prev[k] + seg * seg;
        if (cand < best) { /* strict '<' keeps the smallest optimal k */
          best = cand;
          best_k = k;
        }
      }
      dp_cur[mid] = best;

      /* Indices right of mid cannot split before best_k; indices left of
       * mid cannot split after it. Each child depends only on its own
       * bounds, so the order in which they are processed is irrelevant. */
      if (mid + 1 <= cur.r)
        stack[sp++] = (struct Range){mid + 1, cur.r, best_k, cur.opt_hi};
      if (cur.l <= mid - 1)
        stack[sp++] = (struct Range){cur.l, mid - 1, cur.opt_lo, best_k};
    }

    /* Every cell of dp_cur (0..N) was written above, so the two buffers
     * can simply trade roles instead of being copied. */
    long long *swap = dp_prev;
    dp_prev = dp_cur;
    dp_cur = swap;
  }

  *ans_out = (double)dp_prev[N];

cleanup:
  /* free(NULL) is a no-op, so no per-pointer guards are needed. */
  free(prefix);
  free(dp_prev);
  free(dp_cur);
}
/*
 * Benchmark entry point, generated by the BENCH_MAIN_SCALAR3 macro
 * (not defined in this file; presumably it comes from bench_harness.h, so the
 * roles described below are inferred from the argument contents — confirm
 * against the macro definition).
 *
 * Arguments as written here:
 *   - T003_Code_031, DCDP: identifiers passed to the harness.
 *   - 4096, 16384, 65536: three integer constants, presumably problem sizes.
 *     NOTE(review): kernel_run clamps n to 1024, so if these are sizes they
 *     would all exercise the same amount of work — verify this is intended.
 *   - Declarations: an int buffer `arr` of n elements (the malloc result is
 *     not checked) and the scalar result `ans_scalar`.
 *   - Initialisation block: seeds a bench_rng64_t from `seed` and fills arr
 *     with values in the range 1..100.
 *   - Kernel call: kernel_run(n, arr, &ans_scalar).
 *   - Result expression: ans_scalar.
 *   - Cleanup: free(arr).
 */
BENCH_MAIN_SCALAR3(
    T003_Code_031, DCDP, 4096, 16384, 65536,
    int *arr = (int *)malloc((size_t)n * sizeof(int));
    double ans_scalar = 0.0;
    ,
    {
      bench_rng64_t rng = bench_rng_init(seed);
      for (int i = 0; i < n; i++) {
        arr[i] = (int)(bench_rng_next(&rng) % 100ULL) + 1;
      }
    },
    kernel_run(n, arr, &ans_scalar), ans_scalar, free(arr);)
