{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "machine_shape": "hm",
      "gpuType": "A100"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "If you are using an online code editor, such as Google Colab or another hosted Python notebook, please install the required packages by running the cell below. Thanks."
      ],
      "metadata": {
        "id": "EpH88pREgnIq"
      }
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "yYw13zhwrFq_",
        "outputId": "82486fce-a29b-473e-9f44-be0b77de37e7"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: lightgbm in /usr/local/lib/python3.12/dist-packages (4.6.0)\n",
            "Requirement already satisfied: xgboost in /usr/local/lib/python3.12/dist-packages (3.2.0)\n",
            "Requirement already satisfied: catboost in /usr/local/lib/python3.12/dist-packages (1.2.10)\n",
            "Requirement already satisfied: ngboost in /usr/local/lib/python3.12/dist-packages (0.5.9)\n",
            "Requirement already satisfied: numpy>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from lightgbm) (2.0.2)\n",
            "Requirement already satisfied: scipy in /usr/local/lib/python3.12/dist-packages (from lightgbm) (1.16.3)\n",
            "Requirement already satisfied: nvidia-nccl-cu12 in /usr/local/lib/python3.12/dist-packages (from xgboost) (2.29.3)\n",
            "Requirement already satisfied: graphviz in /usr/local/lib/python3.12/dist-packages (from catboost) (0.21)\n",
            "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (from catboost) (3.10.0)\n",
            "Requirement already satisfied: pandas<4.0,>=0.24 in /usr/local/lib/python3.12/dist-packages (from catboost) (2.2.2)\n",
            "Requirement already satisfied: plotly in /usr/local/lib/python3.12/dist-packages (from catboost) (5.24.1)\n",
            "Requirement already satisfied: six in /usr/local/lib/python3.12/dist-packages (from catboost) (1.17.0)\n",
            "Requirement already satisfied: lifelines>=0.25 in /usr/local/lib/python3.12/dist-packages (from ngboost) (0.30.1)\n",
            "Requirement already satisfied: scikit-learn<2.0,>=1.6 in /usr/local/lib/python3.12/dist-packages (from ngboost) (1.6.1)\n",
            "Requirement already satisfied: sympy>=1.12 in /usr/local/lib/python3.12/dist-packages (from ngboost) (1.14.0)\n",
            "Requirement already satisfied: tqdm>=4.3 in /usr/local/lib/python3.12/dist-packages (from ngboost) (4.67.3)\n",
            "Requirement already satisfied: autograd>=1.5 in /usr/local/lib/python3.12/dist-packages (from lifelines>=0.25->ngboost) (1.8.0)\n",
            "Requirement already satisfied: autograd-gamma>=0.3 in /usr/local/lib/python3.12/dist-packages (from lifelines>=0.25->ngboost) (0.5.0)\n",
            "Requirement already satisfied: formulaic>=0.2.2 in /usr/local/lib/python3.12/dist-packages (from lifelines>=0.25->ngboost) (1.2.1)\n",
            "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib->catboost) (1.3.3)\n",
            "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib->catboost) (0.12.1)\n",
            "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib->catboost) (4.61.1)\n",
            "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib->catboost) (1.4.9)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib->catboost) (26.0)\n",
            "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib->catboost) (11.3.0)\n",
            "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib->catboost) (3.3.2)\n",
            "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.12/dist-packages (from matplotlib->catboost) (2.9.0.post0)\n",
            "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas<4.0,>=0.24->catboost) (2025.2)\n",
            "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas<4.0,>=0.24->catboost) (2025.3)\n",
            "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn<2.0,>=1.6->ngboost) (1.5.3)\n",
            "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn<2.0,>=1.6->ngboost) (3.6.0)\n",
            "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.12/dist-packages (from sympy>=1.12->ngboost) (1.3.0)\n",
            "Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.12/dist-packages (from plotly->catboost) (9.1.4)\n",
            "Requirement already satisfied: interface-meta>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from formulaic>=0.2.2->lifelines>=0.25->ngboost) (1.3.0)\n",
            "Requirement already satisfied: narwhals>=1.17 in /usr/local/lib/python3.12/dist-packages (from formulaic>=0.2.2->lifelines>=0.25->ngboost) (2.17.0)\n",
            "Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.12/dist-packages (from formulaic>=0.2.2->lifelines>=0.25->ngboost) (4.15.0)\n",
            "Requirement already satisfied: wrapt>=1.0 in /usr/local/lib/python3.12/dist-packages (from formulaic>=0.2.2->lifelines>=0.25->ngboost) (2.1.1)\n"
          ]
        }
      ],
      "source": [
        "# Hosted kernels (e.g. Colab) only: install the boosting libraries into the running kernel's environment.\n",
        "%pip install -q lightgbm xgboost catboost ngboost"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "The following code implements and evaluates several leakage-free prediction methods: a single-stage LightGBM model, CRR++, Scale-Bucket Experts, and a Recency Ensemble. It loads and preprocesses the data, engineers the relevant features, splits the data temporally, trains each model, and reports key performance metrics on the test set."
      ],
      "metadata": {
        "id": "_MCAamKfgfjw"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# ============================================================\n",
        "# PAI TRACE JOB DURATION PREDICTION — SCRIPT 1: FOUR METHODS\n",
        "# ============================================================\n",
        "\n",
        "import tarfile\n",
        "import pathlib\n",
        "import warnings\n",
        "import time\n",
        "import gc\n",
        "from dataclasses import dataclass\n",
        "from typing import Dict, List, Tuple\n",
        "\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.cluster import KMeans\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
        "from scipy.stats import spearmanr\n",
        "\n",
        "from lightgbm import LGBMRegressor, early_stopping, log_evaluation\n",
        "\n",
        "# Silence every warning for the whole session (this also hides deprecation\n",
        "# warnings raised by the libraries used below).\n",
        "warnings.filterwarnings(\"ignore\")\n",
        "# Shared seed: seeds NumPy's global RNG here and is passed as `random_state`\n",
        "# to the models defined further down.\n",
        "RNG = 42\n",
        "np.random.seed(RNG)\n",
        "\n",
        "# ============================================================\n",
        "# I/O\n",
        "# ============================================================\n",
        "# Gzipped tarballs unpacked by extract_archives(); a missing file is skipped with a warning.\n",
        "ARCHIVES = [\n",
        "    \"/content/pai_group_tag_table.tar.gz\",\n",
        "    \"/content/pai_job_table.tar.gz\",\n",
        "    \"/content/pai_task_table.tar.gz\",\n",
        "]\n",
        "# Directory the archives are unpacked into and load_tables() reads its CSVs from.\n",
        "EXTRACT_DIR = pathlib.Path(\"/content/extracted\")\n",
        "\n",
        "\n",
        "def extract_archives():\n",
        "    \"\"\"Unpack every archive listed in ``ARCHIVES`` into ``EXTRACT_DIR``.\n",
        "\n",
        "    Missing archives are reported and skipped (best effort). When the running\n",
        "    Python supports tarfile extraction filters, members are extracted with the\n",
        "    ``data`` filter, so entries with absolute paths, ``..`` components or links\n",
        "    pointing outside ``EXTRACT_DIR`` are rejected instead of being written.\n",
        "    \"\"\"\n",
        "    EXTRACT_DIR.mkdir(parents=True, exist_ok=True)\n",
        "    # The ``filter`` keyword only exists on interpreters that ship extraction\n",
        "    # filters; older ones fall back to the legacy call.\n",
        "    extract_kwargs = {\"filter\": \"data\"} if hasattr(tarfile, \"data_filter\") else {}\n",
        "    for gz in ARCHIVES:\n",
        "        if not pathlib.Path(gz).exists():\n",
        "            print(f\"  [WARN] Archive not found: {gz}\")\n",
        "            continue\n",
        "        with tarfile.open(gz, mode=\"r:gz\") as tar:\n",
        "            tar.extractall(path=EXTRACT_DIR, **extract_kwargs)\n",
        "    print(f\"[IO] Extracted to {EXTRACT_DIR}\")\n",
        "\n",
        "\n",
        "@dataclass\n",
        "class Tables:\n",
        "    \"\"\"Pair of frames produced by ``load_tables``.\"\"\"\n",
        "    # Job-level modelling frame (one row per job after the task aggregation).\n",
        "    job: pd.DataFrame\n",
        "    # Group-tag lookup restricted to inst_id, group, workload and gpu_type_spec.\n",
        "    gtag: pd.DataFrame\n",
        "\n",
        "\n",
        "def load_tables() -> Tables:\n",
        "    \"\"\"Read the extracted PAI CSVs and build one row per terminated job.\n",
        "\n",
        "    The three CSVs are read from ``EXTRACT_DIR`` without a header row (column\n",
        "    names are supplied here). Only rows whose ``status`` is ``Terminated`` are\n",
        "    kept. Tasks are aggregated per ``job_name`` into a time span and summed\n",
        "    resource plans, then joined with the job table and the group-tag table.\n",
        "\n",
        "    Returns:\n",
        "        Tables: ``job`` holds one row per job with ``submit_time`` (the job's\n",
        "        ``start_time``), ``p_star`` (latest task end minus earliest task\n",
        "        start, strictly positive), the aggregated resource columns and the\n",
        "        categorical tags; its index is a fresh ``0..n-1`` range. ``gtag`` is\n",
        "        the trimmed group-tag lookup used for the join.\n",
        "    \"\"\"\n",
        "    job_cols = [\"job_name\", \"inst_id\", \"user\", \"status\", \"start_time\", \"end_time\"]\n",
        "    task_cols = [\n",
        "        \"job_name\", \"task_name\", \"inst_num\", \"status\", \"start_time\", \"end_time\",\n",
        "        \"plan_cpu\", \"plan_mem\", \"plan_gpu\", \"gpu_type\",\n",
        "    ]\n",
        "    grp_cols = [\"inst_id\", \"user_from_group_table\", \"gpu_type_spec\", \"group\", \"workload\"]\n",
        "\n",
        "    job = pd.read_csv(EXTRACT_DIR / \"pai_job_table.csv\", header=None, names=job_cols, low_memory=False)\n",
        "    task = pd.read_csv(EXTRACT_DIR / \"pai_task_table.csv\", header=None, names=task_cols, low_memory=False)\n",
        "    gtag = pd.read_csv(EXTRACT_DIR / \"pai_group_tag_table.csv\", header=None, names=grp_cols, low_memory=False)\n",
        "\n",
        "    job = job[job.status == \"Terminated\"].copy()\n",
        "    task = task[task.status == \"Terminated\"].copy()\n",
        "\n",
        "    # Unparseable values become NaN instead of raising.\n",
        "    for df in (job, task):\n",
        "        df[\"start_time\"] = pd.to_numeric(df[\"start_time\"], errors=\"coerce\")\n",
        "        df[\"end_time\"] = pd.to_numeric(df[\"end_time\"], errors=\"coerce\")\n",
        "\n",
        "    for c in [\"plan_cpu\", \"plan_gpu\", \"plan_mem\", \"inst_num\"]:\n",
        "        task[c] = pd.to_numeric(task[c], errors=\"coerce\")\n",
        "\n",
        "    # plan_cpu / plan_gpu are scaled by 1/100 before multiplying by the instance\n",
        "    # count (presumably they are stored in percent-of-device units; confirm\n",
        "    # against the trace schema).\n",
        "    task[\"cpu_total\"] = (task[\"plan_cpu\"] / 100.0) * task[\"inst_num\"]\n",
        "    task[\"gpu_total\"] = (task[\"plan_gpu\"] / 100.0) * task[\"inst_num\"]\n",
        "    task[\"mem_total\"] = task[\"plan_mem\"] * task[\"inst_num\"]\n",
        "\n",
        "    job_span = (\n",
        "        task.groupby(\"job_name\")\n",
        "        .agg(\n",
        "            min_start=(\"start_time\", \"min\"),\n",
        "            max_end=(\"end_time\", \"max\"),\n",
        "            total_inst_num=(\"inst_num\", \"sum\"),\n",
        "            total_plan_cpu=(\"cpu_total\", \"sum\"),\n",
        "            total_plan_mem=(\"mem_total\", \"sum\"),\n",
        "            total_plan_gpu=(\"gpu_total\", \"sum\"),\n",
        "            num_tasks=(\"task_name\", \"nunique\"),\n",
        "        )\n",
        "        .reset_index()\n",
        "    )\n",
        "    # Target: span from the earliest task start to the latest task end. Negative\n",
        "    # spans are clipped to 0 and removed, together with NaN spans, by the\n",
        "    # p_star > 0 filter below.\n",
        "    job_span[\"p_star\"] = (job_span[\"max_end\"] - job_span[\"min_start\"]).clip(lower=0)\n",
        "\n",
        "    base = job[[\"job_name\", \"inst_id\", \"user\", \"start_time\"]].merge(\n",
        "        job_span, on=\"job_name\", how=\"inner\"\n",
        "    )\n",
        "    base = base[base[\"p_star\"] > 0].copy()\n",
        "    base = base.rename(columns={\"start_time\": \"submit_time\"})\n",
        "\n",
        "    gkeep = gtag[[\"inst_id\", \"group\", \"workload\", \"gpu_type_spec\"]].copy()\n",
        "    base = base.merge(gkeep, on=\"inst_id\", how=\"left\")\n",
        "    for c in [\"group\", \"workload\", \"gpu_type_spec\", \"user\"]:\n",
        "        base[c] = base[c].fillna(\"Unknown\").astype(str)\n",
        "\n",
        "    # Drop jobs without a usable submit time, then renumber the rows so index\n",
        "    # labels stay equal to row positions for downstream consumers.\n",
        "    base = base.dropna(subset=[\"submit_time\"]).reset_index(drop=True)\n",
        "\n",
        "    del job, task, gtag, job_span\n",
        "    gc.collect()\n",
        "\n",
        "    return Tables(job=base, gtag=gkeep)\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# SPLIT & METRICS\n",
        "# ============================================================\n",
        "def time_split(df, train_frac=0.70, val_frac=0.15):\n",
        "    \"\"\"Split ``df`` chronologically on ``submit_time``.\n",
        "\n",
        "    The two cut points are the ``train_frac`` and ``train_frac + val_frac``\n",
        "    quantiles of ``submit_time``, truncated to integers. Rows before the first\n",
        "    cut are train, rows from the second cut on are test, the rest is val.\n",
        "\n",
        "    Returns:\n",
        "        ((idx_tr, idx_va, idx_te), (t_train, t_val))\n",
        "    \"\"\"\n",
        "    submit = df[\"submit_time\"]\n",
        "    cut_points = submit.quantile([train_frac, train_frac + val_frac]).values\n",
        "    t_train = int(cut_points[0])\n",
        "    t_val = int(cut_points[1])\n",
        "\n",
        "    in_train = submit < t_train\n",
        "    in_val = submit.between(t_train, t_val, inclusive=\"left\")\n",
        "    in_test = submit >= t_val\n",
        "\n",
        "    splits = (df.index[in_train], df.index[in_val], df.index[in_test])\n",
        "    return splits, (t_train, t_val)\n",
        "\n",
        "\n",
        "def compute_metrics(y_true, y_pred):\n",
        "    \"\"\"Score predictions against true durations.\n",
        "\n",
        "    Both inputs are converted to float arrays and floored at 1e-9 before any\n",
        "    metric is computed.\n",
        "\n",
        "    Returns:\n",
        "        dict with ``MAE``, ``RMSLE`` (RMSE of the log1p values), ``Cov25`` and\n",
        "        ``Cov50`` (percentage of rows whose relative error is at most 25% /\n",
        "        50%) and ``Spearman`` (NaN when there are fewer than two rows).\n",
        "    \"\"\"\n",
        "    floor = 1e-9\n",
        "    actual = np.asarray(y_true, dtype=float).clip(min=floor)\n",
        "    predicted = np.asarray(y_pred, dtype=float).clip(min=floor)\n",
        "\n",
        "    abs_err = float(mean_absolute_error(actual, predicted))\n",
        "    log_mse = mean_squared_error(np.log1p(actual), np.log1p(predicted))\n",
        "    rel_err = np.abs(predicted - actual) / actual\n",
        "\n",
        "    if len(actual) > 1:\n",
        "        rank_corr = float(spearmanr(actual, predicted)[0])\n",
        "    else:\n",
        "        rank_corr = np.nan\n",
        "\n",
        "    return {\n",
        "        \"MAE\": abs_err,\n",
        "        \"RMSLE\": float(np.sqrt(log_mse)),\n",
        "        \"Cov25\": float(100.0 * np.mean(rel_err <= 0.25)),\n",
        "        \"Cov50\": float(100.0 * np.mean(rel_err <= 0.50)),\n",
        "        \"Spearman\": rank_corr,\n",
        "    }\n",
        "\n",
        "\n",
        "def print_metrics(tag, m):\n",
        "    \"\"\"Print one aligned summary line for the metrics dict ``m`` under label ``tag``.\"\"\"\n",
        "    fields = [\n",
        "        f\"[{tag:22s}]  Cov@25% {m['Cov25']:5.1f}%\",\n",
        "        f\"Cov@50% {m['Cov50']:5.1f}%\",\n",
        "        f\"RMSLE {m['RMSLE']:.3f}\",\n",
        "        f\"MAE {m['MAE']:7.1f}\",\n",
        "        f\"ρ {m['Spearman']:.3f}\",\n",
        "    ]\n",
        "    print(\" | \".join(fields))\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# LEAKAGE-FREE FEATURE LAYER\n",
        "# ============================================================\n",
        "def add_core_features(df):\n",
        "    \"\"\"Return a copy of ``df`` with resource and calendar features added.\n",
        "\n",
        "    Resource columns are coerced to numeric (NaN -> 0) and get ``log1p_*``\n",
        "    variants plus three ratios whose denominators are floored at 1. Hour of\n",
        "    day, day index modulo 7 and hour of week are derived from ``submit_time``\n",
        "    in seconds, together with sine / cosine encodings of the 24 h and 168 h\n",
        "    cycles. The input frame is not modified.\n",
        "    \"\"\"\n",
        "    out = df.copy()\n",
        "\n",
        "    numeric_cols = [\"total_inst_num\", \"total_plan_cpu\", \"total_plan_mem\", \"total_plan_gpu\", \"num_tasks\"]\n",
        "    for col in numeric_cols:\n",
        "        out[col] = pd.to_numeric(out[col], errors=\"coerce\").fillna(0)\n",
        "\n",
        "    log_cols = [\"total_plan_cpu\", \"total_plan_gpu\", \"total_plan_mem\", \"total_inst_num\", \"num_tasks\"]\n",
        "    for col in log_cols:\n",
        "        out[\"log1p_\" + col] = np.log1p(out[col])\n",
        "\n",
        "    # Denominators floored at 1 so zero-sized requests do not divide by zero.\n",
        "    inst_floor = np.maximum(1.0, out[\"total_inst_num\"])\n",
        "    gpu_floor = np.maximum(1.0, out[\"total_plan_gpu\"])\n",
        "    out[\"cpu_per_inst\"] = out[\"total_plan_cpu\"] / inst_floor\n",
        "    out[\"mem_per_gpu\"] = out[\"total_plan_mem\"] / gpu_floor\n",
        "    out[\"tasks_per_inst\"] = out[\"num_tasks\"] / inst_floor\n",
        "\n",
        "    hours_elapsed = out[\"submit_time\"] // 3600\n",
        "    hour = (hours_elapsed % 24).astype(int)\n",
        "    wday = ((hours_elapsed // 24) % 7).astype(int)\n",
        "    how = (wday * 24 + hour).astype(int)\n",
        "    out[\"hour\"] = hour\n",
        "    out[\"wday\"] = wday\n",
        "    out[\"how\"] = how\n",
        "\n",
        "    # Cyclical encodings: daily cycle from the hour, weekly cycle from the hour of week.\n",
        "    for period, phase in ((24, hour), (168, how)):\n",
        "        angle = 2 * np.pi * phase / float(period)\n",
        "        out[f\"sin{period}\"] = np.sin(angle)\n",
        "        out[f\"cos{period}\"] = np.cos(angle)\n",
        "    return out\n",
        "\n",
        "\n",
        "def add_causal_histories_strict(df, idx_tr, idx_va, idx_te, mu0):\n",
        "    \"\"\"\n",
        "    Add per-group / per-user history features without using future targets.\n",
        "\n",
        "    TRAIN rows: each row only sees earlier rows (by ``submit_time``) of the same\n",
        "    entity -- expanding mean/count, time since the previous job, and rolling\n",
        "    means over the previous 3/5/10 targets.\n",
        "    VAL/TEST rows: frozen end-of-train aggregates looked up via map().\n",
        "\n",
        "    Args:\n",
        "        df: frame with ``p_star``, ``submit_time``, ``group`` and ``user``.\n",
        "        idx_tr, idx_va, idx_te: index labels of the train / val / test rows.\n",
        "        mu0: prior (log1p scale) used when an entity has no history.\n",
        "\n",
        "    Returns:\n",
        "        A copy of ``df`` with ``y_log``, the ``grp_*`` / ``usr_*`` history\n",
        "        columns and ``grp_mean_eb`` (group mean shrunk towards ``mu0``) added.\n",
        "    \"\"\"\n",
        "    df = df.copy()\n",
        "    df[\"y_log\"] = np.log1p(df[\"p_star\"].clip(lower=0))\n",
        "\n",
        "    windows = (3, 5, 10)\n",
        "    entity_specs = [(\"group\", \"grp\"), (\"user\", \"usr\")]\n",
        "\n",
        "    # Defaults for rows / entities that have no usable history.\n",
        "    for _, pref in entity_specs:\n",
        "        df[f\"{pref}_hist_mean\"] = mu0\n",
        "        df[f\"{pref}_hist_count\"] = 0.0\n",
        "        df[f\"{pref}_dt_prev_log1p\"] = 0.0\n",
        "        for w in windows:\n",
        "            df[f\"{pref}_roll_mean_w{w}\"] = mu0\n",
        "\n",
        "    # Stable sort so rows with identical submit_time keep a deterministic order.\n",
        "    df_tr = df.loc[idx_tr].sort_values(\"submit_time\", kind=\"mergesort\")\n",
        "    tr_index = df_tr.index\n",
        "\n",
        "    for key, pref in entity_specs:\n",
        "        grp = df_tr.groupby(key)\n",
        "\n",
        "        # ------ STEP 1: TRAIN expanding stats (vectorized) ------\n",
        "        # Sum / count over strictly earlier rows of the same entity.\n",
        "        prior_sum = grp[\"y_log\"].transform(\"cumsum\") - df_tr[\"y_log\"]\n",
        "        prior_cnt = grp.cumcount()\n",
        "        hist_mean = (prior_sum / prior_cnt.replace(0, np.nan)).fillna(mu0)\n",
        "        df.loc[tr_index, f\"{pref}_hist_mean\"] = hist_mean.values\n",
        "        df.loc[tr_index, f\"{pref}_hist_count\"] = prior_cnt.astype(float).values\n",
        "\n",
        "        # Time since the entity's previous job (0 for its first job).\n",
        "        dt_tr = (df_tr[\"submit_time\"] - grp[\"submit_time\"].shift(1)).fillna(0)\n",
        "        df.loc[tr_index, f\"{pref}_dt_prev_log1p\"] = np.log1p(dt_tr).values\n",
        "\n",
        "        # Rolling means over the previous targets. groupby().rolling() returns\n",
        "        # its rows ordered by group rather than in df_tr order, so the result is\n",
        "        # re-aligned to tr_index before its values are written back.\n",
        "        prev_y = grp[\"y_log\"].shift(1)\n",
        "        for w in windows:\n",
        "            roll = (\n",
        "                prev_y.groupby(df_tr[key])\n",
        "                .rolling(w, min_periods=1)\n",
        "                .mean()\n",
        "                .reset_index(level=0, drop=True)\n",
        "                .reindex(tr_index)\n",
        "                .fillna(mu0)\n",
        "            )\n",
        "            df.loc[tr_index, f\"{pref}_roll_mean_w{w}\"] = roll.values\n",
        "\n",
        "        # ------ STEP 2: FROZEN stats for val/test (vectorized) ------\n",
        "        train_stats = grp[\"y_log\"].agg([\"mean\", \"count\"])\n",
        "        mean_map = train_stats[\"mean\"]\n",
        "        count_map = train_stats[\"count\"]\n",
        "        # Latest training submit time per entity; max() does not depend on row order.\n",
        "        last_ts_map = grp[\"submit_time\"].max()\n",
        "        # Mean of each entity's last w training targets (df_tr is time-sorted).\n",
        "        roll_maps = {w: grp.tail(w).groupby(key)[\"y_log\"].mean() for w in windows}\n",
        "\n",
        "        for split_idx in (idx_va, idx_te):\n",
        "            entities = df.loc[split_idx, key]\n",
        "            df.loc[split_idx, f\"{pref}_hist_mean\"] = entities.map(mean_map).fillna(mu0).values\n",
        "            df.loc[split_idx, f\"{pref}_hist_count\"] = entities.map(count_map).fillna(0.0).values\n",
        "\n",
        "            # Entities unseen in training map to NaN and get dt = 0.\n",
        "            last_ts = entities.map(last_ts_map)\n",
        "            dt = df.loc[split_idx, \"submit_time\"].values - last_ts.values\n",
        "            dt = np.where(np.isnan(dt), 0.0, np.maximum(dt, 0.0))\n",
        "            df.loc[split_idx, f\"{pref}_dt_prev_log1p\"] = np.log1p(dt)\n",
        "\n",
        "            for w in windows:\n",
        "                df.loc[split_idx, f\"{pref}_roll_mean_w{w}\"] = entities.map(roll_maps[w]).fillna(mu0).values\n",
        "\n",
        "    # EB shrinkage: pull the group mean towards mu0 with pseudo-count lam.\n",
        "    n = df[\"grp_hist_count\"].values\n",
        "    mu = df[\"grp_hist_mean\"].values\n",
        "    lam = 10.0\n",
        "    df[\"grp_mean_eb\"] = (n * mu + lam * mu0) / (n + lam)\n",
        "\n",
        "    return df\n",
        "\n",
        "\n",
        "def fit_scale_buckets(train_series, quantiles):\n",
        "    \"\"\"Learn bucket edges from training values and return them with a bucketizer.\n",
        "\n",
        "    Edges are the requested quantiles of the non-null training values. The\n",
        "    returned function maps each value to the number of edges that are less\n",
        "    than or equal to it, as an integer array.\n",
        "    \"\"\"\n",
        "    observed = train_series.dropna()\n",
        "    edges = np.quantile(observed, quantiles).tolist()\n",
        "\n",
        "    def bucketize(series):\n",
        "        positions = np.searchsorted(edges, series, side=\"right\")\n",
        "        return positions.astype(int)\n",
        "\n",
        "    return edges, bucketize\n",
        "\n",
        "\n",
        "def map_from_train(series, train_idx):\n",
        "    \"\"\"Integer-encode ``series`` using only categories seen in the training rows.\n",
        "\n",
        "    Values are compared as strings. Codes follow first appearance among the\n",
        "    ``train_idx`` rows; every value not seen there shares one extra code equal\n",
        "    to the number of training categories.\n",
        "    \"\"\"\n",
        "    as_text = series.astype(str)\n",
        "    seen = as_text.loc[train_idx].unique()\n",
        "    code_of = dict(zip(seen, range(len(seen))))\n",
        "    unseen_code = len(code_of)\n",
        "    return as_text.map(code_of).fillna(unseen_code).astype(int)\n",
        "\n",
        "\n",
        "def build_feature_table(df_raw, idx_tr, idx_va, idx_te, t_train):\n",
        "    \"\"\"Assemble the model-ready feature table.\n",
        "\n",
        "    Adds the core and history features, scale buckets fitted on the training\n",
        "    rows, and train-derived integer codes for the categorical columns. Infinite\n",
        "    and missing feature values are replaced by 0. ``t_train`` is accepted but\n",
        "    not used by this function.\n",
        "\n",
        "    Returns:\n",
        "        (df, FEATURES, mu0): the enriched frame, the list of feature column\n",
        "        names, and the mean log1p training target used as the history prior.\n",
        "    \"\"\"\n",
        "    df = add_core_features(df_raw)\n",
        "\n",
        "    def mean_log_duration(durations):\n",
        "        return float(np.log1p(durations.clip(lower=0)).mean())\n",
        "\n",
        "    # Prior from the training rows; fall back to all rows if it is not finite.\n",
        "    mu0 = mean_log_duration(df.loc[idx_tr, \"p_star\"])\n",
        "    if not np.isfinite(mu0):\n",
        "        mu0 = mean_log_duration(df[\"p_star\"])\n",
        "\n",
        "    df = add_causal_histories_strict(df, idx_tr, idx_va, idx_te, mu0)\n",
        "\n",
        "    bucket_quantiles = [0.5, 0.9, 0.99]\n",
        "    bucket_sources = ((\"total_inst_num\", \"inst_bucket\"), (\"total_plan_gpu\", \"gpu_bucket\"))\n",
        "    for source_col, bucket_col in bucket_sources:\n",
        "        _, to_bucket = fit_scale_buckets(df.loc[idx_tr, source_col], bucket_quantiles)\n",
        "        df[bucket_col] = to_bucket(df[source_col])\n",
        "    df[\"scale_bucket\"] = np.maximum(df[\"inst_bucket\"], df[\"gpu_bucket\"])\n",
        "\n",
        "    for cat_col in (\"user\", \"group\", \"workload\", \"gpu_type_spec\"):\n",
        "        df[cat_col + \"_code\"] = map_from_train(df[cat_col], idx_tr)\n",
        "\n",
        "    FEATURES = [\n",
        "        \"log1p_total_plan_cpu\", \"log1p_total_plan_gpu\", \"log1p_total_plan_mem\",\n",
        "        \"log1p_total_inst_num\", \"log1p_num_tasks\",\n",
        "        \"cpu_per_inst\", \"mem_per_gpu\", \"tasks_per_inst\",\n",
        "        \"sin24\", \"cos24\", \"sin168\", \"cos168\", \"wday\", \"hour\",\n",
        "        \"inst_bucket\", \"gpu_bucket\", \"scale_bucket\",\n",
        "        \"grp_hist_mean\", \"grp_hist_count\", \"grp_mean_eb\",\n",
        "        \"grp_roll_mean_w3\", \"grp_roll_mean_w5\", \"grp_roll_mean_w10\",\n",
        "        \"grp_dt_prev_log1p\",\n",
        "        \"usr_hist_mean\", \"usr_hist_count\", \"usr_dt_prev_log1p\",\n",
        "        \"user_code\", \"group_code\", \"workload_code\", \"gpu_type_spec_code\",\n",
        "    ]\n",
        "\n",
        "    cleaned = df[FEATURES].replace([np.inf, -np.inf], np.nan).fillna(0.0)\n",
        "    df[FEATURES] = cleaned\n",
        "    return df, FEATURES, mu0\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# M1 — Single-Stage LGBM\n",
        "# ============================================================\n",
        "def train_m1_single_stage(df, FEATS, idx_tr, idx_va, idx_te):\n",
        "    \"\"\"Train one LightGBM regressor on log1p(p_star) and score it on the test split.\n",
        "\n",
        "    Args:\n",
        "        df: feature table containing ``FEATS`` and ``p_star``.\n",
        "        FEATS: list of feature column names.\n",
        "        idx_tr, idx_va, idx_te: index labels of the train / val / test rows.\n",
        "\n",
        "    Returns:\n",
        "        (metrics, yhat): the ``compute_metrics`` dict and the test predictions\n",
        "        back-transformed with expm1 and clipped at 0.\n",
        "    \"\"\"\n",
        "    X_tr = df.loc[idx_tr, FEATS]\n",
        "    X_va = df.loc[idx_va, FEATS]\n",
        "    X_te = df.loc[idx_te, FEATS]\n",
        "    # Targets are selected by label, exactly like the features, so X and y stay\n",
        "    # aligned even when df.index is not a gap-free 0..n-1 range.\n",
        "    y_tr = np.log1p(df.loc[idx_tr, \"p_star\"].values)\n",
        "    y_va = np.log1p(df.loc[idx_va, \"p_star\"].values)\n",
        "    y_te = df.loc[idx_te, \"p_star\"].values\n",
        "\n",
        "    # +1 constraint (non-decreasing prediction) on the job-size features, 0 elsewhere.\n",
        "    mono_features = {\"log1p_total_inst_num\", \"log1p_total_plan_gpu\", \"log1p_num_tasks\"}\n",
        "    mono_vec = [1 if f in mono_features else 0 for f in FEATS]\n",
        "\n",
        "    base_params = dict(\n",
        "        n_estimators=800, learning_rate=0.05, num_leaves=63,\n",
        "        min_child_samples=20, subsample=0.8, colsample_bytree=0.8,\n",
        "        random_state=RNG, n_jobs=-1,\n",
        "    )\n",
        "    try:\n",
        "        model = LGBMRegressor(monotone_constraints=mono_vec, **base_params)\n",
        "    except TypeError:\n",
        "        # Fallback for LightGBM builds whose constructor rejects the keyword.\n",
        "        model = LGBMRegressor(**base_params)\n",
        "\n",
        "    model.fit(\n",
        "        X_tr, y_tr,\n",
        "        eval_set=[(X_va, y_va)],\n",
        "        callbacks=[early_stopping(50, verbose=False), log_evaluation(0)],\n",
        "    )\n",
        "    yhat = np.expm1(model.predict(X_te)).clip(min=0)\n",
        "    return compute_metrics(y_te, yhat), yhat\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# M2 — CRR++\n",
        "# ============================================================\n",
        "def train_m2_crr_plus(df, FEATS, idx_tr, idx_va, idx_te, K=5, min_family=1500):\n",
        "    \"\"\"Cluster jobs into families, route test jobs to a family, predict with per-family experts.\n",
        "\n",
        "    Training rows are standardized and clustered with KMeans into ``K``\n",
        "    families. A random-forest router is fitted to reproduce the family labels.\n",
        "    Each family with at least ``min_family`` training rows gets its own LightGBM\n",
        "    expert; test rows routed to any other family use a fallback model trained\n",
        "    on all training rows.\n",
        "\n",
        "    Args:\n",
        "        df: feature table containing ``FEATS`` and ``p_star``.\n",
        "        FEATS: list of feature column names.\n",
        "        idx_tr, idx_va, idx_te: index labels of the train / val / test rows.\n",
        "        K: number of KMeans families.\n",
        "        min_family: minimum training rows required to fit a family expert.\n",
        "\n",
        "    Returns:\n",
        "        (metrics, yhat): the ``compute_metrics`` dict and the test predictions\n",
        "        back-transformed with expm1 and clipped at 0.\n",
        "    \"\"\"\n",
        "    X_tr = df.loc[idx_tr, FEATS].copy()\n",
        "    X_va = df.loc[idx_va, FEATS].copy()\n",
        "    X_te = df.loc[idx_te, FEATS].copy()\n",
        "    # Targets are selected by label, exactly like the features, so X and y stay\n",
        "    # aligned even when df.index is not a gap-free 0..n-1 range.\n",
        "    y_tr = np.log1p(df.loc[idx_tr, \"p_star\"].values)\n",
        "    y_va = np.log1p(df.loc[idx_va, \"p_star\"].values)\n",
        "    y_te = df.loc[idx_te, \"p_star\"].values\n",
        "\n",
        "    # Scaling statistics come from the training rows only.\n",
        "    scaler = StandardScaler().fit(X_tr.values)\n",
        "    Z_tr = scaler.transform(X_tr.values)\n",
        "    Z_va = scaler.transform(X_va.values)\n",
        "    Z_te = scaler.transform(X_te.values)\n",
        "\n",
        "    kmeans = KMeans(n_clusters=K, n_init=10, random_state=RNG).fit(Z_tr)\n",
        "    fam_tr = kmeans.labels_\n",
        "    fam_va = kmeans.predict(Z_va)\n",
        "\n",
        "    router = RandomForestClassifier(\n",
        "        n_estimators=150, max_depth=12, class_weight=\"balanced\",\n",
        "        n_jobs=-1, random_state=RNG,\n",
        "    )\n",
        "    router.fit(Z_tr, fam_tr)\n",
        "\n",
        "    lgbm_params = dict(\n",
        "        n_estimators=600, learning_rate=0.05, num_leaves=63,\n",
        "        min_child_samples=20, subsample=0.8, colsample_bytree=0.8,\n",
        "        n_jobs=-1,\n",
        "    )\n",
        "\n",
        "    fallback = LGBMRegressor(random_state=RNG, **lgbm_params)\n",
        "    fallback.fit(\n",
        "        X_tr, y_tr,\n",
        "        eval_set=[(X_va, y_va)],\n",
        "        callbacks=[early_stopping(50, verbose=False), log_evaluation(0)],\n",
        "    )\n",
        "\n",
        "    experts = {}\n",
        "    for k in range(K):\n",
        "        mask_tr = fam_tr == k\n",
        "        if mask_tr.sum() < min_family:\n",
        "            continue\n",
        "        model = LGBMRegressor(random_state=RNG + 100 + k, **lgbm_params)\n",
        "        mask_va = fam_va == k\n",
        "        if mask_va.sum() >= 200:\n",
        "            # Early stopping only when the family has enough validation rows.\n",
        "            model.fit(\n",
        "                X_tr[mask_tr], y_tr[mask_tr],\n",
        "                eval_set=[(X_va[mask_va], y_va[mask_va])],\n",
        "                callbacks=[early_stopping(50, verbose=False), log_evaluation(0)],\n",
        "            )\n",
        "        else:\n",
        "            model.fit(X_tr[mask_tr], y_tr[mask_tr])\n",
        "        experts[k] = model\n",
        "\n",
        "    # Route the test rows, then predict once per routed family instead of once\n",
        "    # per row; the per-row results are the same, with far fewer predict() calls.\n",
        "    fam_hat = router.predict(Z_te)\n",
        "    y_pred_log = np.zeros(len(X_te), dtype=float)\n",
        "    for k in np.unique(fam_hat):\n",
        "        rows = fam_hat == k\n",
        "        model = experts.get(int(k), fallback)\n",
        "        y_pred_log[rows] = model.predict(X_te[rows])\n",
        "\n",
        "    yhat = np.expm1(y_pred_log).clip(min=0)\n",
        "    return compute_metrics(y_te, yhat), yhat\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# M3 — Scale-Bucket Experts\n",
        "# ============================================================\n",
        "def train_m3_scale_experts(df, FEATS, idx_tr, idx_va, idx_te, min_bucket=2000):\n",
        "    \"\"\"Global LightGBM model plus one expert per sufficiently large ``scale_bucket``.\n",
        "\n",
        "    A bucket gets its own expert when it has at least ``min_bucket`` training\n",
        "    rows; the expert uses early stopping only if the bucket also has at least\n",
        "    200 validation rows. Test rows in buckets without an expert keep the global\n",
        "    model's prediction.\n",
        "\n",
        "    Returns:\n",
        "        (metrics, yhat): the ``compute_metrics`` dict and the test predictions\n",
        "        back-transformed with expm1 and clipped at 0.\n",
        "    \"\"\"\n",
        "    X_tr, X_va, X_te = (df.loc[idx, FEATS] for idx in (idx_tr, idx_va, idx_te))\n",
        "    y_tr_log, y_va_log = (np.log1p(df.loc[idx, \"p_star\"].values) for idx in (idx_tr, idx_va))\n",
        "    y_te = df.loc[idx_te, \"p_star\"].values\n",
        "    bucket_tr, bucket_va, bucket_te = (\n",
        "        df.loc[idx, \"scale_bucket\"].values for idx in (idx_tr, idx_va, idx_te)\n",
        "    )\n",
        "\n",
        "    global_model = LGBMRegressor(\n",
        "        n_estimators=500, learning_rate=0.05, num_leaves=63,\n",
        "        min_child_samples=20, subsample=0.8, colsample_bytree=0.8,\n",
        "        random_state=RNG, n_jobs=-1,\n",
        "    )\n",
        "    global_model.fit(\n",
        "        X_tr, y_tr_log,\n",
        "        eval_set=[(X_va, y_va_log)],\n",
        "        callbacks=[early_stopping(40, verbose=False), log_evaluation(0)],\n",
        "    )\n",
        "\n",
        "    experts = {}\n",
        "    # np.unique yields the bucket ids in ascending order.\n",
        "    for b in np.unique(bucket_tr):\n",
        "        tr_mask = bucket_tr == b\n",
        "        if tr_mask.sum() < min_bucket:\n",
        "            continue\n",
        "        expert = LGBMRegressor(\n",
        "            n_estimators=400, learning_rate=0.05, num_leaves=63,\n",
        "            min_child_samples=20, random_state=RNG + 200 + b, n_jobs=-1,\n",
        "        )\n",
        "        va_mask = bucket_va == b\n",
        "        fit_kwargs = {}\n",
        "        if va_mask.sum() >= 200:\n",
        "            fit_kwargs = dict(\n",
        "                eval_set=[(X_va[va_mask], y_va_log[va_mask])],\n",
        "                callbacks=[early_stopping(30, verbose=False), log_evaluation(0)],\n",
        "            )\n",
        "        expert.fit(X_tr[tr_mask], y_tr_log[tr_mask], **fit_kwargs)\n",
        "        experts[b] = expert\n",
        "\n",
        "    # Start from the global prediction and overwrite rows that have an expert.\n",
        "    y_pred_log = global_model.predict(X_te)\n",
        "    for b, expert in experts.items():\n",
        "        te_mask = bucket_te == b\n",
        "        if te_mask.any():\n",
        "            y_pred_log[te_mask] = expert.predict(X_te[te_mask])\n",
        "\n",
        "    yhat = np.expm1(y_pred_log).clip(min=0)\n",
        "    return compute_metrics(y_te, yhat), yhat\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# M4 — Recency Ensemble\n",
        "# ============================================================\n",
        "def train_m4_recency_ensemble(df, FEATS, idx_tr, idx_va, idx_te, recent_q=0.70, mix=0.30):\n",
        "    \"\"\"Blend a model trained on all training rows with one trained on recent rows.\n",
        "\n",
        "    The recent model is fitted on training rows whose ``submit_time`` is at or\n",
        "    above the ``recent_q`` quantile, provided there are at least 1000 of them.\n",
        "    The blend is taken on the original duration scale with weight ``mix`` on\n",
        "    the recent model; otherwise only the full model is used.\n",
        "\n",
        "    Returns:\n",
        "        (metrics, yhat): the ``compute_metrics`` dict and the test predictions\n",
        "        clipped at 0.\n",
        "    \"\"\"\n",
        "    def design(idx):\n",
        "        return df.loc[idx, FEATS].values\n",
        "\n",
        "    def durations(idx):\n",
        "        return df.loc[idx, \"p_star\"].values\n",
        "\n",
        "    y_tr_log = np.log1p(durations(idx_tr))\n",
        "    y_va_log = np.log1p(durations(idx_va))\n",
        "    y_te = durations(idx_te)\n",
        "    X_tr, X_va, X_te = design(idx_tr), design(idx_va), design(idx_te)\n",
        "\n",
        "    full = LGBMRegressor(\n",
        "        n_estimators=500, learning_rate=0.04, num_leaves=63,\n",
        "        min_child_samples=60, subsample=0.8, colsample_bytree=0.8,\n",
        "        reg_alpha=0.2, reg_lambda=0.8, random_state=RNG, n_jobs=-1,\n",
        "    )\n",
        "    full.fit(\n",
        "        X_tr, y_tr_log,\n",
        "        eval_set=[(X_va, y_va_log)],\n",
        "        callbacks=[early_stopping(40, verbose=False), log_evaluation(0)],\n",
        "    )\n",
        "\n",
        "    submit_tr = df.loc[idx_tr, \"submit_time\"]\n",
        "    tr_recent_mask = submit_tr.values >= submit_tr.quantile(recent_q)\n",
        "\n",
        "    blended = np.expm1(full.predict(X_te))\n",
        "    if tr_recent_mask.sum() >= 1000:\n",
        "        recent = LGBMRegressor(\n",
        "            n_estimators=300, learning_rate=0.05, num_leaves=31,\n",
        "            min_child_samples=40, random_state=RNG + 1, n_jobs=-1,\n",
        "        )\n",
        "        recent.fit(\n",
        "            X_tr[tr_recent_mask], y_tr_log[tr_recent_mask],\n",
        "            eval_set=[(X_va, y_va_log)],\n",
        "            callbacks=[early_stopping(30, verbose=False), log_evaluation(0)],\n",
        "        )\n",
        "        y_recent = np.expm1(recent.predict(X_te))\n",
        "        blended = (1.0 - mix) * blended + mix * y_recent\n",
        "\n",
        "    yhat = blended.clip(min=0)\n",
        "    return compute_metrics(y_te, yhat), yhat\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# RECURRING SIGNATURE MASK\n",
        "# ============================================================\n",
        "def compute_recurring_mask(df, idx_tr, idx_te):\n",
        "    \"\"\"\n",
        "    Compute recurring mask using same signature definition as Script 2.\n",
        "    A test job is 'recurring' if its (user, group, workload, cpu_b, gpu_b,\n",
        "    mem_b, inst_b) signature appeared at least once in training.\n",
        "    \"\"\"\n",
        "    train_df = df.loc[idx_tr]\n",
        "\n",
        "    def qbin(col, q=10):\n",
        "        edges = np.quantile(train_df[col].clip(lower=1e-6), np.linspace(0, 1, q + 1))\n",
        "        edges[0], edges[-1] = -np.inf, np.inf\n",
        "        return np.searchsorted(edges, df[col].values, side=\"right\") - 1\n",
        "\n",
        "    cpu_b = qbin(\"total_plan_cpu\", 10)\n",
        "    gpu_b = qbin(\"total_plan_gpu\", 10)\n",
        "    mem_b = qbin(\"total_plan_mem\", 10)\n",
        "    inst_b = qbin(\"total_inst_num\", 10)\n",
        "\n",
        "    sig = (\n",
        "        df[\"user\"].astype(str) + \"|\" + df[\"group\"].astype(str) + \"|\"\n",
        "        + df[\"workload\"].astype(str) + \"|\"\n",
        "        + pd.Series(cpu_b, index=df.index).astype(str) + \"-\"\n",
        "        + pd.Series(gpu_b, index=df.index).astype(str) + \"-\"\n",
        "        + pd.Series(mem_b, index=df.index).astype(str) + \"-\"\n",
        "        + pd.Series(inst_b, index=df.index).astype(str)\n",
        "    )\n",
        "\n",
        "    train_sigs = set(sig.loc[idx_tr].unique())\n",
        "    rec_mask = sig.loc[idx_te].isin(train_sigs).values\n",
        "    return rec_mask\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# DRIVER\n",
        "# ============================================================\n",
        "def main():\n",
        "    \"\"\"\n",
        "    Driver: load the trace, split it by time, build features, train M1-M4,\n",
        "    then report each model on recurring vs non-recurring test jobs.\n",
        "\n",
        "    Returns (df, idx_tr, idx_te, results, rec_mask); `results` maps\n",
        "    \"M1\"..\"M4\" to a (metrics, test predictions) pair.\n",
        "    \"\"\"\n",
        "    rule = \"=\" * 78\n",
        "    print(rule)\n",
        "    print(\"PAI TRACE — FOUR LEAKAGE-FREE METHODS (BUG-FIXED) + RECURRING ANALYSIS\")\n",
        "    print(rule)\n",
        "    t0 = time.time()\n",
        "\n",
        "    extract_archives()\n",
        "    tbl = load_tables()\n",
        "    df_raw = tbl.job.copy()\n",
        "    print(f\"Loaded jobs: {len(df_raw):,}\")\n",
        "\n",
        "    (idx_tr, idx_va, idx_te), (t_train, t_val) = time_split(df_raw, train_frac=0.70, val_frac=0.15)\n",
        "    print(f\"Split sizes  | train {len(idx_tr):,} | val {len(idx_va):,} | test {len(idx_te):,}\")\n",
        "    print(f\"Cut times    | train< {t_train} | val< {t_val}\")\n",
        "\n",
        "    # The third return value is not used by the driver.\n",
        "    df, FEATS, _ = build_feature_table(df_raw, idx_tr, idx_va, idx_te, t_train)\n",
        "    print(f\"Features: {len(FEATS)}\")\n",
        "    print()\n",
        "\n",
        "    # (result key, banner line, label for print_metrics, trainer)\n",
        "    stages = (\n",
        "        (\"M1\", \"Training M1: Single-Stage LGBM ...\", \"M1 Single-Stage\", train_m1_single_stage),\n",
        "        (\"M2\", \"\\nTraining M2: CRR++ ...\", \"M2 CRR++\", train_m2_crr_plus),\n",
        "        (\"M3\", \"\\nTraining M3: Scale-Experts ...\", \"M3 Scale-Experts\", train_m3_scale_experts),\n",
        "        (\"M4\", \"\\nTraining M4: Recency-Ensemble ...\", \"M4 Recency-Ensemble\", train_m4_recency_ensemble),\n",
        "    )\n",
        "    results = {}\n",
        "    for key, banner, label, trainer in stages:\n",
        "        print(banner)\n",
        "        metrics, preds = trainer(df, FEATS, idx_tr, idx_va, idx_te)\n",
        "        print_metrics(label, metrics)\n",
        "        results[key] = (metrics, preds)\n",
        "\n",
        "    # ============================================================\n",
        "    # RECURRING vs NON-RECURRING ANALYSIS\n",
        "    # ============================================================\n",
        "    print(\"\\n\" + rule)\n",
        "    print(\"RECURRING vs NON-RECURRING ANALYSIS\")\n",
        "    print(rule)\n",
        "\n",
        "    rec_mask = compute_recurring_mask(df, idx_tr, idx_te)\n",
        "    n_total = len(rec_mask)\n",
        "    n_rec = rec_mask.sum()\n",
        "    n_new = (~rec_mask).sum()\n",
        "    print(f\"Recurring    : {n_rec:,} ({100 * n_rec / n_total:.1f}%)\")\n",
        "    print(f\"Non-recurring: {n_new:,} ({100 * n_new / n_total:.1f}%)\")\n",
        "    print()\n",
        "\n",
        "    y_te = df.loc[idx_te, \"p_star\"].values\n",
        "\n",
        "    def report_row(prefix, m):\n",
        "        # One fixed-width summary line; the prefix carries the model/subset tag.\n",
        "        return (\n",
        "            f\"{prefix}Cov@25={m['Cov25']:5.1f}%  Cov@50={m['Cov50']:5.1f}%  \"\n",
        "            f\"RMSLE={m['RMSLE']:.3f}  ρ={m['Spearman']:.3f}\"\n",
        "        )\n",
        "\n",
        "    for name, (m_all, yhat) in results.items():\n",
        "        m_rec = compute_metrics(y_te[rec_mask], yhat[rec_mask])\n",
        "        m_new = compute_metrics(y_te[~rec_mask], yhat[~rec_mask])\n",
        "        print(report_row(f\"[{name}]  All: \", m_all))\n",
        "        print(report_row(\"       Rec: \", m_rec))\n",
        "        print(report_row(\"       New: \", m_new))\n",
        "        print()\n",
        "\n",
        "    print(f\"Total wall time: {time.time() - t0:.1f}s\")\n",
        "    print(rule)\n",
        "    return df, idx_tr, idx_te, results, rec_mask\n",
        "\n",
        "\n",
        "# Entry point: run the whole pipeline and bind what main() returns to\n",
        "# module-level names (feature frame, train/test indices, results, mask).\n",
        "if __name__ == \"__main__\":\n",
        "    df, idx_tr, idx_te, results, rec_mask = main()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "g--cg0XRIeaJ",
        "outputId": "6e685c6c-ba9b-4f66-bd59-ace4e390e4a1"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "==============================================================================\n",
            "PAI TRACE — FOUR LEAKAGE-FREE METHODS (BUG-FIXED) + RECURRING ANALYSIS\n",
            "==============================================================================\n",
            "[IO] Extracted to /content/extracted\n",
            "Loaded jobs: 732,355\n",
            "Split sizes  | train 512,647 | val 109,854 | test 109,854\n",
            "Cut times    | train< 5020579 | val< 5764872\n",
            "Features: 31\n",
            "\n",
            "Training M1: Single-Stage LGBM ...\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018250 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 4490\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 31\n",
            "[LightGBM] [Info] Start training from score 6.645564\n",
            "[M1 Single-Stage       ]  Cov@25%  19.5% | Cov@50%  36.2% | RMSLE 1.568 | MAE  2860.2 | ρ 0.690\n",
            "\n",
            "Training M2: CRR++ ...\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022963 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 4490\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 31\n",
            "[LightGBM] [Info] Start training from score 6.645564\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008180 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 3455\n",
            "[LightGBM] [Info] Number of data points in the train set: 143581, number of used features: 30\n",
            "[LightGBM] [Info] Start training from score 7.593020\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006740 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 3747\n",
            "[LightGBM] [Info] Number of data points in the train set: 163902, number of used features: 31\n",
            "[LightGBM] [Info] Start training from score 5.375273\n",
            "[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004609 seconds.\n",
            "You can set `force_col_wise=true` to remove the overhead.\n",
            "[LightGBM] [Info] Total Bins 4550\n",
            "[LightGBM] [Info] Number of data points in the train set: 64894, number of used features: 31\n",
            "[LightGBM] [Info] Start training from score 7.460048\n",
            "[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010651 seconds.\n",
            "You can set `force_col_wise=true` to remove the overhead.\n",
            "[LightGBM] [Info] Total Bins 3912\n",
            "[LightGBM] [Info] Number of data points in the train set: 79061, number of used features: 31\n",
            "[LightGBM] [Info] Start training from score 6.068514\n",
            "[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004968 seconds.\n",
            "You can set `force_col_wise=true` to remove the overhead.\n",
            "[LightGBM] [Info] Total Bins 4068\n",
            "[LightGBM] [Info] Number of data points in the train set: 61209, number of used features: 30\n",
            "[LightGBM] [Info] Start training from score 7.706417\n",
            "[M2 CRR++              ]  Cov@25%  20.2% | Cov@50%  37.7% | RMSLE 1.621 | MAE  2993.3 | ρ 0.665\n",
            "\n",
            "Training M3: Scale-Experts ...\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064121 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 4490\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 31\n",
            "[LightGBM] [Info] Start training from score 6.645564\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023983 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 3982\n",
            "[LightGBM] [Info] Number of data points in the train set: 441426, number of used features: 29\n",
            "[LightGBM] [Info] Start training from score 6.517727\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002618 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 4392\n",
            "[LightGBM] [Info] Number of data points in the train set: 61390, number of used features: 30\n",
            "[LightGBM] [Info] Start training from score 7.365021\n",
            "[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000824 seconds.\n",
            "You can set `force_col_wise=true` to remove the overhead.\n",
            "[LightGBM] [Info] Total Bins 4067\n",
            "[LightGBM] [Info] Number of data points in the train set: 9831, number of used features: 30\n",
            "[LightGBM] [Info] Start training from score 7.892976\n",
            "[M3 Scale-Experts      ]  Cov@25%  19.6% | Cov@50%  38.0% | RMSLE 1.611 | MAE  3058.8 | ρ 0.673\n",
            "\n",
            "Training M4: Recency-Ensemble ...\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019954 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 4490\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 31\n",
            "[LightGBM] [Info] Start training from score 6.645564\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006963 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 4435\n",
            "[LightGBM] [Info] Number of data points in the train set: 153794, number of used features: 31\n",
            "[LightGBM] [Info] Start training from score 6.664944\n",
            "[M4 Recency-Ensemble   ]  Cov@25%  20.2% | Cov@50%  36.9% | RMSLE 1.578 | MAE  2799.4 | ρ 0.685\n",
            "\n",
            "==============================================================================\n",
            "RECURRING vs NON-RECURRING ANALYSIS\n",
            "==============================================================================\n",
            "Recurring    : 57,102 (52.0%)\n",
            "Non-recurring: 52,752 (48.0%)\n",
            "\n",
            "[M1]  All: Cov@25= 19.5%  Cov@50= 36.2%  RMSLE=1.568  ρ=0.690\n",
            "       Rec: Cov@25= 28.5%  Cov@50= 51.6%  RMSLE=1.000  ρ=0.862\n",
            "       New: Cov@25=  9.8%  Cov@50= 19.6%  RMSLE=2.009  ρ=0.506\n",
            "\n",
            "[M2]  All: Cov@25= 20.2%  Cov@50= 37.7%  RMSLE=1.621  ρ=0.665\n",
            "       Rec: Cov@25= 29.2%  Cov@50= 53.1%  RMSLE=0.974  ρ=0.877\n",
            "       New: Cov@25= 10.5%  Cov@50= 21.1%  RMSLE=2.108  ρ=0.452\n",
            "\n",
            "[M3]  All: Cov@25= 19.6%  Cov@50= 38.0%  RMSLE=1.611  ρ=0.673\n",
            "       Rec: Cov@25= 28.0%  Cov@50= 53.3%  RMSLE=0.992  ρ=0.871\n",
            "       New: Cov@25= 10.6%  Cov@50= 21.4%  RMSLE=2.083  ρ=0.476\n",
            "\n",
            "[M4]  All: Cov@25= 20.2%  Cov@50= 36.9%  RMSLE=1.578  ρ=0.685\n",
            "       Rec: Cov@25= 28.2%  Cov@50= 50.9%  RMSLE=1.002  ρ=0.871\n",
            "       New: Cov@25= 11.6%  Cov@50= 21.8%  RMSLE=2.024  ρ=0.474\n",
            "\n",
            "Total wall time: 213.6s\n",
            "==============================================================================\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# ============================================================\n",
        "# PAI TRACE JOB DURATION PREDICTION — Advanced Methods\n",
        "# ============================================================\n",
        "\n",
        "\n",
        "import tarfile, pathlib, warnings, gc, time\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "\n",
        "from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
        "from sklearn.isotonic import IsotonicRegression\n",
        "from sklearn.linear_model import Ridge\n",
        "from scipy.stats import spearmanr\n",
        "\n",
        "from lightgbm import LGBMRegressor, LGBMClassifier, early_stopping, log_evaluation\n",
        "\n",
        "# Suppress every warning category for the rest of the session.\n",
        "warnings.filterwarnings(\"ignore\")\n",
        "# Seed NumPy's global RNG; RANDOM_STATE holds the same seed value as a constant.\n",
        "np.random.seed(42)\n",
        "RANDOM_STATE = 42\n",
        "\n",
        "# ============================================================\n",
        "# CONFIGURATION\n",
        "# ============================================================\n",
        "# Trace archives (absolute paths) that extract_archives() unpacks.\n",
        "ARCHIVES = [\n",
        "    \"/content/pai_group_tag_table.tar.gz\",\n",
        "    \"/content/pai_job_table.tar.gz\",\n",
        "    \"/content/pai_task_table.tar.gz\",\n",
        "]\n",
        "# Directory the archives are unpacked into and load_tables() reads CSVs from.\n",
        "EXTRACT_DIR = pathlib.Path(\"/content/extracted\")\n",
        "\n",
        "# ============================================================\n",
        "# 1. DATA LOADING\n",
        "# ============================================================\n",
        "def extract_archives():\n",
        "    \"\"\"\n",
        "    Unpack every archive listed in ARCHIVES into EXTRACT_DIR.\n",
        "\n",
        "    EXTRACT_DIR is created if it does not exist. When the running Python\n",
        "    supports tarfile extraction filters, members go through the \"data\"\n",
        "    filter, which rejects absolute paths, \"..\" traversal and special files\n",
        "    instead of trusting whatever paths the archive declares.\n",
        "    \"\"\"\n",
        "    EXTRACT_DIR.mkdir(parents=True, exist_ok=True)\n",
        "    # Older interpreters without extraction filters would raise TypeError on\n",
        "    # the `filter` keyword, so only pass it when tarfile advertises support.\n",
        "    extract_kwargs = {\"filter\": \"data\"} if hasattr(tarfile, \"data_filter\") else {}\n",
        "    for gz in ARCHIVES:\n",
        "        with tarfile.open(gz, mode=\"r:gz\") as tar:\n",
        "            tar.extractall(path=EXTRACT_DIR, **extract_kwargs)\n",
        "    print(f\"[IO] Extracted files to {EXTRACT_DIR}\")\n",
        "\n",
        "\n",
        "def load_tables():\n",
        "    \"\"\"\n",
        "    Read the three PAI CSVs from EXTRACT_DIR and build one row per job.\n",
        "\n",
        "    - Only job and task rows whose status is \"Terminated\" are kept.\n",
        "    - plan_cpu and plan_gpu are divided by 100 and multiplied by inst_num,\n",
        "      plan_mem is multiplied by inst_num; these per-task totals are summed\n",
        "      per job.\n",
        "    - p_star is the task envelope: latest task end_time minus earliest task\n",
        "      start_time, floored at 0. Jobs whose p_star is not > 0 are dropped.\n",
        "    - group / workload / gpu_type_spec come from a left join on inst_id;\n",
        "      missing categorical values become \"Unknown\".\n",
        "    - The job table's start_time is returned under the name submit_time.\n",
        "    \"\"\"\n",
        "    def read_headerless(file_name, names):\n",
        "        return pd.read_csv(EXTRACT_DIR / file_name, header=None, names=names, low_memory=False)\n",
        "\n",
        "    job = read_headerless(\n",
        "        \"pai_job_table.csv\",\n",
        "        [\"job_name\", \"inst_id\", \"user\", \"status\", \"start_time\", \"end_time\"],\n",
        "    )\n",
        "    task = read_headerless(\n",
        "        \"pai_task_table.csv\",\n",
        "        [\"job_name\", \"task_name\", \"inst_num\", \"status\", \"start_time\", \"end_time\",\n",
        "         \"plan_cpu\", \"plan_mem\", \"plan_gpu\", \"gpu_type\"],\n",
        "    )\n",
        "    gtag = read_headerless(\n",
        "        \"pai_group_tag_table.csv\",\n",
        "        [\"inst_id\", \"user_from_group_table\", \"gpu_type_spec\", \"group\", \"workload\"],\n",
        "    )\n",
        "\n",
        "    # Terminated rows only.\n",
        "    job = job.loc[job[\"status\"] == \"Terminated\"].copy()\n",
        "    task = task.loc[task[\"status\"] == \"Terminated\"].copy()\n",
        "\n",
        "    # Coerce timestamps and resource fields to numbers (unparseable -> NaN).\n",
        "    numeric_columns = (\n",
        "        (job, [\"start_time\", \"end_time\"]),\n",
        "        (task, [\"start_time\", \"end_time\", \"plan_cpu\", \"plan_gpu\", \"plan_mem\", \"inst_num\"]),\n",
        "    )\n",
        "    for frame, columns in numeric_columns:\n",
        "        for column in columns:\n",
        "            frame[column] = pd.to_numeric(frame[column], errors=\"coerce\")\n",
        "\n",
        "    # Per-task totals: per-instance amount times instance count.\n",
        "    task[\"cpu_total\"] = (task[\"plan_cpu\"] / 100.0) * task[\"inst_num\"]  # 600.0 -> 6 vCPU\n",
        "    task[\"gpu_total\"] = (task[\"plan_gpu\"] / 100.0) * task[\"inst_num\"]  # 50.0 -> 0.5 GPU\n",
        "    task[\"mem_total\"] = task[\"plan_mem\"] * task[\"inst_num\"]            # GB\n",
        "\n",
        "    # Collapse tasks to one row per job.\n",
        "    per_job = {\n",
        "        \"min_start\": (\"start_time\", \"min\"),\n",
        "        \"max_end\": (\"end_time\", \"max\"),\n",
        "        \"total_inst_num\": (\"inst_num\", \"sum\"),\n",
        "        \"total_plan_cpu\": (\"cpu_total\", \"sum\"),\n",
        "        \"total_plan_mem\": (\"mem_total\", \"sum\"),\n",
        "        \"total_plan_gpu\": (\"gpu_total\", \"sum\"),\n",
        "        \"num_tasks\": (\"task_name\", \"nunique\"),\n",
        "    }\n",
        "    job_span = task.groupby(\"job_name\").agg(**per_job).reset_index()\n",
        "    envelope = job_span[\"max_end\"] - job_span[\"min_start\"]\n",
        "    job_span[\"p_star\"] = envelope.clip(lower=0)\n",
        "\n",
        "    # Job metadata + envelope, positive durations only, then group tags.\n",
        "    job_meta = job[[\"job_name\", \"inst_id\", \"user\", \"start_time\"]]\n",
        "    base = job_meta.merge(job_span, on=\"job_name\", how=\"inner\")\n",
        "    base = base.loc[base[\"p_star\"] > 0].copy()\n",
        "    tags = gtag[[\"inst_id\", \"group\", \"workload\", \"gpu_type_spec\"]]\n",
        "    base = base.merge(tags, on=\"inst_id\", how=\"left\")\n",
        "\n",
        "    for column in [\"group\", \"workload\", \"gpu_type_spec\", \"user\"]:\n",
        "        base[column] = base[column].fillna(\"Unknown\").astype(str)\n",
        "\n",
        "    base = base.rename(columns={\"start_time\": \"submit_time\"})\n",
        "\n",
        "    # Release the large intermediates before returning.\n",
        "    del job, task, gtag, job_span\n",
        "    gc.collect()\n",
        "    return base\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 2. TIME-BASED SPLITTING\n",
        "# ============================================================\n",
        "def time_split(df, train_frac=0.70, val_frac=0.15):\n",
        "    \"\"\"Split indices by submit_time quantiles (train/val/test).\"\"\"\n",
        "    qt = df[\"submit_time\"].quantile([train_frac, train_frac + val_frac]).values\n",
        "    t_train, t_val = int(qt[0]), int(qt[1])\n",
        "    idx_tr = df.index[df[\"submit_time\"] < t_train]\n",
        "    idx_va = df.index[(df[\"submit_time\"] >= t_train) & (df[\"submit_time\"] < t_val)]\n",
        "    idx_te = df.index[df[\"submit_time\"] >= t_val]\n",
        "    return (idx_tr, idx_va, idx_te), (t_train, t_val)\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 3. FEATURE ENGINEERING (LEAKAGE-FREE)\n",
        "# ============================================================\n",
        "def add_basic_features(df):\n",
        "    \"\"\"Basic features: logs, ratios, temporal (no leakage).\"\"\"\n",
        "    df = df.copy()\n",
        "\n",
        "    # Log transforms\n",
        "    for c in [\"total_plan_cpu\", \"total_plan_gpu\", \"total_plan_mem\", \"total_inst_num\", \"num_tasks\"]:\n",
        "        df[f\"log_{c}\"] = np.log1p(df[c].clip(lower=0))\n",
        "\n",
        "    # Per-instance ratios\n",
        "    df[\"cpu_per_inst\"]   = df[\"total_plan_cpu\"] / np.maximum(1.0, df[\"total_inst_num\"])\n",
        "    df[\"gpu_per_inst\"]   = df[\"total_plan_gpu\"] / np.maximum(1.0, df[\"total_inst_num\"])\n",
        "    df[\"mem_per_inst\"]   = df[\"total_plan_mem\"] / np.maximum(1.0, df[\"total_inst_num\"])\n",
        "    df[\"tasks_per_inst\"] = df[\"num_tasks\"]      / np.maximum(1.0, df[\"total_inst_num\"])\n",
        "\n",
        "    # Temporal (cyclic)\n",
        "    hour = ((df[\"submit_time\"] // 3600) % 24).astype(int)\n",
        "    dow  = ((df[\"submit_time\"] // 86400) % 7).astype(int)\n",
        "    df[\"hour\"] = hour\n",
        "    df[\"dow\"]  = dow\n",
        "    df[\"sin_hour\"]   = np.sin(2 * np.pi * hour / 24.0)\n",
        "    df[\"cos_hour\"]   = np.cos(2 * np.pi * hour / 24.0)\n",
        "    df[\"is_weekend\"] = (df[\"dow\"] >= 5).astype(int)\n",
        "    return df\n",
        "\n",
        "\n",
        "def fit_label_encoder_from_train(train_series):\n",
        "    \"\"\"Train-only label encoder (unseen values -> -1).\"\"\"\n",
        "    classes = pd.Index(train_series.dropna().unique().tolist())\n",
        "    mapping = {v: i for i, v in enumerate(classes)}\n",
        "    def _transform(s):\n",
        "        return s.map(mapping).fillna(-1).astype(int)\n",
        "    return _transform\n",
        "\n",
        "\n",
        "def make_signatures(df, idx_tr, global_mean_log):\n",
        "    \"\"\"\n",
        "    Create recurrence signatures and compute train-only statistics.\n",
        "\n",
        "    Leakage prevention:\n",
        "    - Quantile edges computed on train only\n",
        "    - Signature stats computed on train only\n",
        "    - EB shrinkage uses train-only prior\n",
        "    \"\"\"\n",
        "    df = df.copy()\n",
        "    train_df = df.loc[idx_tr]\n",
        "\n",
        "    # Train-only quantile buckets\n",
        "    def qbin(col, q=10):\n",
        "        edges = np.quantile(train_df[col].clip(lower=1e-6), np.linspace(0, 1, q + 1))\n",
        "        edges[0], edges[-1] = -np.inf, np.inf\n",
        "        return np.searchsorted(edges, df[col].values, side=\"right\") - 1\n",
        "\n",
        "    df[\"cpu_b\"]  = qbin(\"total_plan_cpu\", 10)\n",
        "    df[\"gpu_b\"]  = qbin(\"total_plan_gpu\", 10)\n",
        "    df[\"mem_b\"]  = qbin(\"total_plan_mem\", 10)\n",
        "    df[\"inst_b\"] = qbin(\"total_inst_num\", 10)\n",
        "\n",
        "    # Signature: user|group|workload|resource_buckets\n",
        "    df[\"task_signature\"] = (\n",
        "        df[\"user\"].astype(str) + \"|\" + df[\"group\"].astype(str) + \"|\" +\n",
        "        df[\"workload\"].astype(str) + \"|\" + df[\"cpu_b\"].astype(str) + \"-\" +\n",
        "        df[\"gpu_b\"].astype(str) + \"-\" + df[\"mem_b\"].astype(str) + \"-\" +\n",
        "        df[\"inst_b\"].astype(str)\n",
        "    )\n",
        "\n",
        "    # Train-only signature statistics (on p_star in log domain)\n",
        "    df[\"p_star_log\"] = np.log1p(df[\"p_star\"])\n",
        "    sig_stats = (df.loc[idx_tr].groupby(\"task_signature\")[\"p_star_log\"]\n",
        "                   .agg(sig_mean=\"mean\",\n",
        "                        sig_median=\"median\",\n",
        "                        sig_std=\"std\",\n",
        "                        sig_count=\"count\",\n",
        "                        sig_q25=lambda x: x.quantile(0.25),\n",
        "                        sig_q75=lambda x: x.quantile(0.75))\n",
        "                   .reset_index())\n",
        "\n",
        "    # EB shrinkage on signature means (log scale)\n",
        "    n_s = sig_stats[\"sig_count\"].astype(float)\n",
        "    mu_s = sig_stats[\"sig_mean\"].astype(float)\n",
        "    lambda_shrink = 5.0\n",
        "    sig_stats[\"sig_mean_shrink\"] = (n_s * mu_s + lambda_shrink * global_mean_log) / (n_s + lambda_shrink)\n",
        "\n",
        "    # Merge stats back to full dataset\n",
        "    df = df.merge(sig_stats, on=\"task_signature\", how=\"left\")\n",
        "    return df\n",
        "\n",
        "\n",
        "def add_causal_histories_no_leak(df, idx_tr, idx_va, idx_te, global_mean_log):\n",
        "    \"\"\"\n",
        "    Compute strictly causal histories WITHOUT leakage.\n",
        "\n",
        "    Critical fix:\n",
        "    - Train: Expanding stats computed ONLY within training data\n",
        "    - Val/Test: Use FROZEN aggregate statistics from training data\n",
        "    - No cross-contamination between samples in same split\n",
        "\n",
        "    Returns: df with history features added\n",
        "    \"\"\"\n",
        "    df = df.sort_values(\"submit_time\").reset_index(drop=True).copy()\n",
        "\n",
        "    # Initialize all history columns with global prior\n",
        "    for key, prefix in [(\"group\", \"gro\"), (\"user\", \"use\")]:\n",
        "        df[f\"{prefix}_hist_mean\"]  = global_mean_log\n",
        "        df[f\"{prefix}_hist_count\"] = 0.0\n",
        "        df[f\"{prefix}_ewm\"]        = global_mean_log\n",
        "        df[f\"{prefix}_dt_prev\"]    = 0.0\n",
        "\n",
        "    # === TRAIN: Compute expanding stats ONLY on training data ===\n",
        "    df_tr = df.loc[idx_tr].copy()\n",
        "    df_tr[\"p_star_log\"] = np.log1p(df_tr[\"p_star\"])\n",
        "\n",
        "    for key, prefix in [(\"group\", \"gro\"), (\"user\", \"use\")]:\n",
        "        grp = df_tr.groupby(key)[\"p_star_log\"]\n",
        "\n",
        "        # Expanding mean/count with shift(1) for causality\n",
        "        hist_mean = grp.apply(\n",
        "            lambda s: s.expanding().mean().shift(1)\n",
        "        ).reset_index(level=0, drop=True)\n",
        "\n",
        "        hist_count = grp.apply(\n",
        "            lambda s: s.expanding().count().shift(1)\n",
        "        ).reset_index(level=0, drop=True)\n",
        "\n",
        "        ewm = grp.apply(\n",
        "            lambda s: s.ewm(span=10, adjust=False).mean().shift(1)\n",
        "        ).reset_index(level=0, drop=True)\n",
        "\n",
        "        # Assign to training indices\n",
        "        df.loc[idx_tr, f\"{prefix}_hist_mean\"]  = hist_mean.fillna(global_mean_log)\n",
        "        df.loc[idx_tr, f\"{prefix}_hist_count\"] = hist_count.fillna(0)\n",
        "        df.loc[idx_tr, f\"{prefix}_ewm\"]        = ewm.fillna(global_mean_log)\n",
        "\n",
        "        # Time since previous submission (within group/user)\n",
        "        dt = df_tr.groupby(key)[\"submit_time\"].diff()\n",
        "        df.loc[idx_tr, f\"{prefix}_dt_prev\"] = np.log1p(dt).fillna(0)\n",
        "\n",
        "    # === VAL/TEST: Use FROZEN final statistics from training ===\n",
        "    # Compute final (non-shifted) aggregates from full training data\n",
        "    train_stats = {}\n",
        "    for key in [\"group\", \"user\"]:\n",
        "        stats = (df_tr.groupby(key)[\"p_star_log\"]\n",
        "                 .agg(mean=\"mean\", count=\"count\")\n",
        "                 .to_dict(\"index\"))\n",
        "        train_stats[key] = stats\n",
        "\n",
        "    # Apply frozen stats to validation and test\n",
        "    for idx_split in [idx_va, idx_te]:\n",
        "        for key, prefix in [(\"group\", \"gro\"), (\"user\", \"use\")]:\n",
        "            for i in df.loc[idx_split].index:\n",
        "                entity = df.loc[i, key]\n",
        "                if entity in train_stats[key]:\n",
        "                    df.loc[i, f\"{prefix}_hist_mean\"]  = train_stats[key][entity][\"mean\"]\n",
        "                    df.loc[i, f\"{prefix}_hist_count\"] = train_stats[key][entity][\"count\"]\n",
        "                    df.loc[i, f\"{prefix}_ewm\"]        = train_stats[key][entity][\"mean\"]\n",
        "                # dt_prev remains 0 for val/test (no previous submission info)\n",
        "\n",
        "    return df\n",
        "\n",
        "\n",
        "def engineer_all_features(df, idx_tr, idx_va, idx_te, eb_lambda=5.0):\n",
        "    \"\"\"\n",
        "    Complete feature engineering pipeline (leakage-free).\n",
        "\n",
        "    Order of operations:\n",
        "    1. Basic features (logs, ratios, temporal)\n",
        "    2. Signatures and signature stats (train-only)\n",
        "    3. Causal histories (sequential, no leakage)\n",
        "    4. Group-level EB shrinkage\n",
        "    5. NaN fills and flags\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    df : DataFrame holding at least a \"p_star\" column.\n",
        "    idx_tr, idx_va, idx_te : index labels of the train / validation / test rows.\n",
        "    eb_lambda : float, default 5.0\n",
        "        Weight (pseudo-count) of the global prior in the group-level shrinkage. Larger\n",
        "        values pull \"grp_mean_eb\" harder towards the global mean. Must be positive,\n",
        "        because a row with zero history count would otherwise evaluate 0 / 0.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    (df, global_mean_log) : the augmented frame and the mean of log1p(p_star) over the\n",
        "    training rows.\n",
        "\n",
        "    Raises\n",
        "    ------\n",
        "    ValueError : if eb_lambda is not positive.\n",
        "    \"\"\"\n",
        "    if eb_lambda <= 0:\n",
        "        raise ValueError(\"eb_lambda must be positive\")\n",
        "\n",
        "    # Global prior (log scale, train only)\n",
        "    global_mean_log = np.log1p(df.loc[idx_tr, \"p_star\"]).mean()\n",
        "\n",
        "    df = add_basic_features(df)\n",
        "    df = make_signatures(df, idx_tr, global_mean_log)\n",
        "    df = add_causal_histories_no_leak(df, idx_tr, idx_va, idx_te, global_mean_log)\n",
        "\n",
        "    # Group-level EB shrinkage: count-weighted blend of the group's historical mean and\n",
        "    # the global prior, with the prior weighted as eb_lambda pseudo-observations\n",
        "    n  = df[\"gro_hist_count\"].fillna(0.0)\n",
        "    mu = df[\"gro_hist_mean\"].fillna(global_mean_log)\n",
        "    df[\"grp_mean_eb\"] = (n * mu + eb_lambda * global_mean_log) / (n + eb_lambda)\n",
        "\n",
        "    # Fill remaining NaNs: location-type signature stats fall back to the global prior\n",
        "    for c in [\"sig_mean\", \"sig_median\", \"sig_q25\", \"sig_q75\"]:\n",
        "        if c in df.columns:\n",
        "            df[c] = df[c].fillna(global_mean_log)\n",
        "\n",
        "    df[\"sig_count\"] = df[\"sig_count\"].fillna(0)\n",
        "    df[\"sig_std\"]   = df[\"sig_std\"].fillna(1.0)\n",
        "\n",
        "    # Flags: a row is recurring when its signature count is positive\n",
        "    df[\"is_recurring\"] = (df[\"sig_count\"] > 0).astype(int)\n",
        "    df[\"global_mean\"]  = global_mean_log\n",
        "\n",
        "    return df, global_mean_log\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 4. MATRIX PREPARATION\n",
        "# ============================================================\n",
        "def prepare_matrices(df, idx_tr, idx_va, idx_te):\n",
        "    \"\"\"\n",
        "    Build the feature matrices and target vectors for the three splits.\n",
        "\n",
        "    Side effect: adds one \"<col>_enc\" column to df per categorical column, using an\n",
        "    encoder fitted on the training rows only.\n",
        "\n",
        "    Returns: feature_names, X_train, X_val, X_test, y_train_log, y_val_log, y_test\n",
        "    (train/val targets are log1p(p_star); the test target is left on the raw scale).\n",
        "    \"\"\"\n",
        "    candidate_feats = [\n",
        "        # Resources/logs/ratios\n",
        "        \"log_total_plan_cpu\", \"log_total_plan_gpu\", \"log_total_plan_mem\",\n",
        "        \"log_total_inst_num\", \"log_num_tasks\",\n",
        "        \"cpu_per_inst\", \"gpu_per_inst\", \"mem_per_inst\", \"tasks_per_inst\",\n",
        "        # Temporal\n",
        "        \"hour\", \"dow\", \"sin_hour\", \"cos_hour\", \"is_weekend\",\n",
        "        # Signature stats\n",
        "        \"sig_mean\", \"sig_median\", \"sig_q25\", \"sig_q75\", \"sig_count\", \"sig_mean_shrink\",\n",
        "        # Histories\n",
        "        \"gro_hist_mean\", \"gro_hist_count\", \"gro_ewm\", \"gro_dt_prev\",\n",
        "        \"use_hist_mean\", \"use_hist_count\", \"use_ewm\", \"use_dt_prev\",\n",
        "        # Priors\n",
        "        \"grp_mean_eb\",\n",
        "    ]\n",
        "\n",
        "    # Keep only the candidates that are present as columns\n",
        "    NUM_FEATS = [name for name in candidate_feats if name in df.columns]\n",
        "\n",
        "    # Categoricals: encoder fitted on the training rows, applied to every row\n",
        "    for cat_col in (\"user\", \"group\", \"workload\", \"gpu_type_spec\"):\n",
        "        encode = fit_label_encoder_from_train(df.loc[idx_tr, cat_col])\n",
        "        enc_col = f\"{cat_col}_enc\"\n",
        "        df[enc_col] = encode(df[cat_col])\n",
        "        NUM_FEATS.append(enc_col)\n",
        "\n",
        "    def _as_matrix(rows):\n",
        "        # Feature block of the given rows with +/-inf and NaN replaced by 0\n",
        "        block = df.loc[rows, NUM_FEATS]\n",
        "        return block.replace([np.inf, -np.inf], np.nan).fillna(0).values\n",
        "\n",
        "    Xtr, Xva, Xte = (_as_matrix(rows) for rows in (idx_tr, idx_va, idx_te))\n",
        "\n",
        "    # Targets: log scale for fitting/validation, raw scale for the final evaluation\n",
        "    ytr_log, yva_log = (np.log1p(df.loc[rows, \"p_star\"].values) for rows in (idx_tr, idx_va))\n",
        "    yte = df.loc[idx_te, \"p_star\"].values\n",
        "\n",
        "    return NUM_FEATS, Xtr, Xva, Xte, ytr_log, yva_log, yte\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 5. METRICS\n",
        "# ============================================================\n",
        "def rmsle(y, yhat):\n",
        "    \"\"\"Root mean squared error between log1p(y) and log1p(yhat), both floored at 0 first.\"\"\"\n",
        "    log_true = np.log1p(np.maximum(y, 0))\n",
        "    log_pred = np.log1p(np.maximum(yhat, 0))\n",
        "    mse = mean_squared_error(log_true, log_pred)\n",
        "    return float(np.sqrt(mse))\n",
        "\n",
        "\n",
        "def coverage_at(y, yhat, pct=0.25):\n",
        "    \"\"\"\n",
        "    Percentage (0-100) of predictions whose relative error |yhat - y| / y is at most pct.\n",
        "\n",
        "    y, yhat : array-likes of equal shape (true and predicted values).\n",
        "    pct     : relative-error tolerance, e.g. 0.25 for 25%.\n",
        "    \"\"\"\n",
        "    y = np.asarray(y, dtype=float)\n",
        "    yhat = np.asarray(yhat, dtype=float)\n",
        "    # Floor only the denominator. The error itself must be measured against the true y:\n",
        "    # flooring y first made an exact prediction of y == 0 score a relative error of 1.0\n",
        "    # and count as a miss.\n",
        "    denom = np.maximum(y, 1e-12)\n",
        "    rel = np.abs(yhat - y) / denom\n",
        "    return 100.0 * float(np.mean(rel <= pct))\n",
        "\n",
        "\n",
        "def metrics(y, yhat):\n",
        "    \"\"\"\n",
        "    Score predictions yhat against y.\n",
        "\n",
        "    Returns a dict with MAE, RMSLE, Cov25 / Cov50 (coverage_at with 0.25 / 0.50) and the\n",
        "    Spearman correlation, which is NaN when either input has zero standard deviation.\n",
        "    \"\"\"\n",
        "    scores = {\n",
        "        \"MAE\": float(mean_absolute_error(y, yhat)),\n",
        "        \"RMSLE\": rmsle(y, yhat),\n",
        "        \"Cov25\": coverage_at(y, yhat, 0.25),\n",
        "        \"Cov50\": coverage_at(y, yhat, 0.50),\n",
        "    }\n",
        "    if np.std(y) > 0 and np.std(yhat) > 0:\n",
        "        scores[\"Spearman\"] = float(spearmanr(y, yhat).correlation)\n",
        "    else:\n",
        "        scores[\"Spearman\"] = np.nan\n",
        "    return scores\n",
        "\n",
        "\n",
        "def print_metrics(tag, y, yhat):\n",
        "    \"\"\"Compute metrics(y, yhat), print them on one line labelled with tag, return the dict.\"\"\"\n",
        "    m = metrics(y, yhat)\n",
        "    line = (\n",
        "        f\"[{tag:16s}]  Cov@25% {m['Cov25']:5.1f}% | Cov@50% {m['Cov50']:5.1f}% | \"\n",
        "        f\"RMSLE {m['RMSLE']:.3f} | MAE {m['MAE']:7.1f} | ρ {m['Spearman']:.3f}\"\n",
        "    )\n",
        "    print(line)\n",
        "    return m\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 6. LIGHTGBM HELPER\n",
        "# ============================================================\n",
        "def fit_lgbm(params, Xtr, ytr_log, Xva=None, yva_log=None, n_estimators=600, es=50):\n",
        "    \"\"\"\n",
        "    Fit an LGBMRegressor (random_state=RANDOM_STATE, extra settings from params).\n",
        "\n",
        "    When both Xva and yva_log are given they form the eval set and the early_stopping\n",
        "    callback is installed with `es` rounds; otherwise the model is fitted on the\n",
        "    training data alone.\n",
        "    \"\"\"\n",
        "    model = LGBMRegressor(n_estimators=n_estimators, random_state=RANDOM_STATE, **params)\n",
        "\n",
        "    # No complete validation pair: plain fit without callbacks\n",
        "    if Xva is None or yva_log is None:\n",
        "        model.fit(Xtr, ytr_log)\n",
        "        return model\n",
        "\n",
        "    stop_callbacks = [early_stopping(es, verbose=False), log_evaluation(0)]\n",
        "    model.fit(Xtr, ytr_log, eval_set=[(Xva, yva_log)], callbacks=stop_callbacks)\n",
        "    return model\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 7. PREDICTION METHODS\n",
        "# ============================================================\n",
        "def method_cqr(Xtr, Xva, Xte, ytr_log, yva_log, df, idx_va, idx_te):\n",
        "    \"\"\"\n",
        "    M1: Conformal Quantile Regression\n",
        "\n",
        "    Fits an L2 model plus 0.1 / 0.5 / 0.9 quantile models, widens the [q0.1, q0.9] band by\n",
        "    a margin calibrated on the validation split, and blends band edges, median, clipped\n",
        "    median, L2 prediction and the group / signature priors with a Ridge regression fitted\n",
        "    on validation. Returns test predictions on the original scale, floored at 0.\n",
        "    \"\"\"\n",
        "    # Base L2 model\n",
        "    l2_model = fit_lgbm(\n",
        "        dict(learning_rate=0.05, num_leaves=63, min_child_samples=20, subsample=0.8),\n",
        "        Xtr, ytr_log, Xva, yva_log\n",
        "    )\n",
        "    base_va, base_te = l2_model.predict(Xva), l2_model.predict(Xte)\n",
        "\n",
        "    # Quantile models, fitted in the order lower / median / upper\n",
        "    quantile_models = [\n",
        "        fit_lgbm(\n",
        "            dict(objective=\"quantile\", alpha=a, learning_rate=0.05, num_leaves=63),\n",
        "            Xtr, ytr_log, Xva, yva_log, n_estimators=400, es=40\n",
        "        )\n",
        "        for a in (0.1, 0.5, 0.9)\n",
        "    ]\n",
        "    qL_va, qM_va, qU_va = (qm.predict(Xva) for qm in quantile_models)\n",
        "    qL_te, qM_te, qU_te = (qm.predict(Xte) for qm in quantile_models)\n",
        "\n",
        "    # Conformal calibration: the margin is the 0.60 quantile of how far the validation\n",
        "    # targets fall outside the raw band (0 for targets inside it)\n",
        "    violation = np.maximum(qL_va - yva_log, yva_log - qU_va)\n",
        "    margin = np.quantile(np.maximum(violation, 0.0), 0.60)\n",
        "\n",
        "    def _blend_inputs(q_lo, q_med, q_hi, base_pred, rows):\n",
        "        # Blender features for one split: widened band, median (raw and clipped into the\n",
        "        # band), L2 prediction, group prior, signature median (group prior where NaN)\n",
        "        band_lo, band_hi = q_lo - margin, q_hi + margin\n",
        "        med_clipped = np.clip(q_med, band_lo, band_hi)\n",
        "        grp_prior = df.loc[rows, \"grp_mean_eb\"]\n",
        "        return np.c_[\n",
        "            band_lo, q_med, band_hi, med_clipped, base_pred,\n",
        "            grp_prior.values,\n",
        "            df.loc[rows, \"sig_median\"].fillna(grp_prior).values\n",
        "        ]\n",
        "\n",
        "    Z_va = _blend_inputs(qL_va, qM_va, qU_va, base_va, idx_va)\n",
        "    ridge = Ridge(alpha=1.0, fit_intercept=True, random_state=RANDOM_STATE)\n",
        "    ridge.fit(Z_va, yva_log)\n",
        "\n",
        "    Z_te = _blend_inputs(qL_te, qM_te, qU_te, base_te, idx_te)\n",
        "    yhat_log = ridge.predict(Z_te)\n",
        "    return np.expm1(yhat_log).clip(min=0)\n",
        "\n",
        "\n",
        "def method_hras(df, idx_te):\n",
        "    \"\"\"\n",
        "    M3: Hierarchical Recurrence-Aware Shrinkage\n",
        "\n",
        "    For each test row takes \"sig_mean_shrink\", falling back to \"grp_mean_eb\" and then to\n",
        "    \"global_mean\" wherever the finer level is NaN. The result is mapped back with expm1\n",
        "    and floored at 0.\n",
        "    \"\"\"\n",
        "    sig_level = df.loc[idx_te, \"sig_mean_shrink\"]\n",
        "    grp_level = df.loc[idx_te, \"grp_mean_eb\"]\n",
        "    glob_level = df.loc[idx_te, \"global_mean\"]\n",
        "\n",
        "    # Finest available level wins\n",
        "    pred_log = sig_level.fillna(grp_level).fillna(glob_level).values\n",
        "    return np.clip(np.expm1(pred_log), 0, None)\n",
        "\n",
        "\n",
        "def method_isotonic(Xtr, Xva, Xte, ytr_log, yva_log):\n",
        "    \"\"\"\n",
        "    M4: Isotonic Calibration\n",
        "\n",
        "    Fits a quantile (alpha=0.5) LightGBM model, then an isotonic regression from its\n",
        "    validation predictions to the validation targets, and applies that mapping to the\n",
        "    test predictions. Returns test predictions on the original scale, floored at 0.\n",
        "    \"\"\"\n",
        "    median_model = fit_lgbm(\n",
        "        dict(objective=\"quantile\", alpha=0.5, learning_rate=0.05, num_leaves=63),\n",
        "        Xtr, ytr_log, Xva, yva_log, n_estimators=800, es=60\n",
        "    )\n",
        "    va_scores, te_scores = median_model.predict(Xva), median_model.predict(Xte)\n",
        "\n",
        "    # out_of_bounds=\"clip\" is the setting for test scores outside the validation range\n",
        "    calibrator = IsotonicRegression(out_of_bounds=\"clip\")\n",
        "    calibrator.fit(va_scores, yva_log)\n",
        "\n",
        "    return np.expm1(calibrator.predict(te_scores)).clip(min=0)\n",
        "\n",
        "\n",
        "def method_meta_stack(Xtr, Xva, Xte, ytr_log, yva_log, df, idx_va, idx_te):\n",
        "    \"\"\"\n",
        "    M5: Meta-Learning Stack\n",
        "\n",
        "    Four LightGBM base models (L2, regularized L2, quantile median, Huber) feed a LightGBM\n",
        "    meta-learner fitted on the validation split. The meta-features are the base\n",
        "    predictions, their per-row standard deviation, and the group / signature priors.\n",
        "    Returns test predictions on the original scale, floored at 0.\n",
        "    \"\"\"\n",
        "    # (LightGBM params, extra fit_lgbm keyword arguments), fitted in this order\n",
        "    base_specs = [\n",
        "        # Standard L2\n",
        "        (dict(learning_rate=0.05, num_leaves=63, min_child_samples=20, subsample=0.8),\n",
        "         {}),\n",
        "        # Regularized\n",
        "        (dict(learning_rate=0.03, num_leaves=127, min_child_samples=50,\n",
        "              reg_alpha=0.5, reg_lambda=1.0, subsample=0.7),\n",
        "         dict(n_estimators=400)),\n",
        "        # Quantile median\n",
        "        (dict(objective=\"quantile\", alpha=0.5, learning_rate=0.05, num_leaves=63),\n",
        "         dict(n_estimators=400)),\n",
        "        # Huber\n",
        "        (dict(objective=\"huber\", alpha=0.9, learning_rate=0.05, num_leaves=63),\n",
        "         dict(n_estimators=400)),\n",
        "    ]\n",
        "    base_models = [\n",
        "        fit_lgbm(lgb_params, Xtr, ytr_log, Xva, yva_log, **extra)\n",
        "        for lgb_params, extra in base_specs\n",
        "    ]\n",
        "\n",
        "    def _meta_features(X, rows):\n",
        "        # Base predictions, their dispersion across models, and the context priors\n",
        "        preds = np.column_stack([bm.predict(X) for bm in base_models])\n",
        "        spread = np.std(preds, axis=1).reshape(-1, 1)\n",
        "        grp_prior = df.loc[rows, \"grp_mean_eb\"]\n",
        "        context = np.column_stack([\n",
        "            grp_prior.values,\n",
        "            df.loc[rows, \"sig_median\"].fillna(grp_prior).values\n",
        "        ])\n",
        "        return np.hstack([preds, spread, context])\n",
        "\n",
        "    Z_va = _meta_features(Xva, idx_va)\n",
        "    Z_te = _meta_features(Xte, idx_te)\n",
        "\n",
        "    # Meta-learner\n",
        "    meta = LGBMRegressor(\n",
        "        n_estimators=200, learning_rate=0.05, num_leaves=31,\n",
        "        min_child_samples=30, random_state=RANDOM_STATE\n",
        "    )\n",
        "    meta.fit(Z_va, yva_log)\n",
        "    return np.expm1(meta.predict(Z_te)).clip(min=0)\n",
        "\n",
        "\n",
        "def method_two_stage(Xtr, Xva, Xte, ytr_log, yva_log):\n",
        "    \"\"\"\n",
        "    M6: Two-Stage Classification + Regression\n",
        "\n",
        "    Buckets the training targets at their 30th and 70th percentiles (short / medium /\n",
        "    long), trains a classifier for the bucket and one regressor (\"expert\") per bucket,\n",
        "    and predicts the class-probability-weighted mix of the expert outputs.\n",
        "\n",
        "    Xva and yva_log are not used by this method.\n",
        "    Returns test predictions on the original scale, floored at 0.\n",
        "    \"\"\"\n",
        "    # Bucket labels: 0 for y < p30, 1 for p30 <= y < p70, 2 for y >= p70\n",
        "    p30, p70 = np.percentile(ytr_log, [30, 70])\n",
        "    y_class = np.where(ytr_log >= p70, 2, np.where(ytr_log >= p30, 1, 0))\n",
        "\n",
        "    def _fit_all_rows():\n",
        "        # Bucket-agnostic regressor, used where a bucket has no expert of its own\n",
        "        return fit_lgbm(\n",
        "            dict(learning_rate=0.05, num_leaves=63),\n",
        "            Xtr, ytr_log, n_estimators=400\n",
        "        )\n",
        "\n",
        "    if np.unique(y_class).size < 2:\n",
        "        # Every training target fell into one bucket, so there is nothing to classify\n",
        "        return np.expm1(_fit_all_rows().predict(Xte)).clip(min=0)\n",
        "\n",
        "    # Train classifier\n",
        "    clf = LGBMClassifier(\n",
        "        n_estimators=300, learning_rate=0.05, num_leaves=31,\n",
        "        min_child_samples=20, random_state=RANDOM_STATE\n",
        "    )\n",
        "    clf.fit(Xtr, y_class)\n",
        "    probs_te = clf.predict_proba(Xte)\n",
        "\n",
        "    # Train per-class experts (only for buckets with more than 100 training rows)\n",
        "    experts = {}\n",
        "    for c in range(3):\n",
        "        mask = (y_class == c)\n",
        "        if mask.sum() > 100:\n",
        "            expert = LGBMRegressor(\n",
        "                n_estimators=400, learning_rate=0.05, num_leaves=63,\n",
        "                min_child_samples=20, random_state=RANDOM_STATE\n",
        "            )\n",
        "            expert.fit(Xtr[mask], ytr_log[mask])\n",
        "            experts[c] = expert\n",
        "\n",
        "    # Blend predictions weighted by class probabilities. predict_proba has one column per\n",
        "    # label in clf.classes_, which is fewer than three when ties in the target leave a\n",
        "    # bucket empty, so columns are paired with labels instead of assuming column c == class c.\n",
        "    fallback = None  # fitted lazily: not needed when every observed bucket has an expert\n",
        "    yhat_log = np.zeros(len(Xte))\n",
        "    for col, label in enumerate(clf.classes_):\n",
        "        model = experts.get(int(label))\n",
        "        if model is None:\n",
        "            if fallback is None:\n",
        "                fallback = _fit_all_rows()\n",
        "            model = fallback\n",
        "        yhat_log += probs_te[:, col] * model.predict(Xte)\n",
        "\n",
        "    return np.expm1(yhat_log).clip(min=0)\n",
        "\n",
        "\n",
        "def method_recency(Xtr, Xva, Xte, ytr_log, yva_log, df, idx_tr):\n",
        "    \"\"\"\n",
        "    M7: Recency Ensemble\n",
        "\n",
        "    Averages a time-decay-weighted full-history model with models fitted on the most\n",
        "    recent 50% and 20% of the training rows (each only if its window is large enough).\n",
        "    Ensemble weights are proportional to the inverse validation MAE on the log scale.\n",
        "    Returns test predictions on the original scale, floored at 0.\n",
        "    \"\"\"\n",
        "    times_tr = df.loc[idx_tr, \"submit_time\"].values\n",
        "    tmax, tmin = times_tr.max(), times_tr.min()\n",
        "\n",
        "    # Full history: sample weights decay exponentially with the row's age relative to tmax\n",
        "    time_w = np.exp(-0.5 * (tmax - times_tr) / (tmax - tmin + 1.0))\n",
        "    full_model = LGBMRegressor(\n",
        "        n_estimators=400, learning_rate=0.05, num_leaves=63,\n",
        "        min_child_samples=20, random_state=RANDOM_STATE\n",
        "    )\n",
        "    full_model.fit(Xtr, ytr_log, sample_weight=time_w)\n",
        "    models = [(\"full\", full_model)]\n",
        "\n",
        "    # (name, percentile of submit_time where the window starts, row count the window must\n",
        "    #  exceed, LightGBM params, n_estimators)\n",
        "    window_specs = [\n",
        "        (\"recent_50\", 50, 1000, dict(learning_rate=0.06, num_leaves=63), 300),\n",
        "        (\"recent_20\", 80, 500, dict(learning_rate=0.08, num_leaves=31), 200),\n",
        "    ]\n",
        "    for name, pct, min_rows, lgb_params, n_trees in window_specs:\n",
        "        in_window = times_tr >= np.percentile(times_tr, pct)\n",
        "        if in_window.sum() > min_rows:\n",
        "            window_model = fit_lgbm(\n",
        "                lgb_params,\n",
        "                Xtr[in_window], ytr_log[in_window], n_estimators=n_trees\n",
        "            )\n",
        "            models.append((name, window_model))\n",
        "\n",
        "    # Validation-adaptive weights (inverse MAE, normalised to sum to 1)\n",
        "    errs = np.array([\n",
        "        np.mean(np.abs(model.predict(Xva) - yva_log))\n",
        "        for _, model in models\n",
        "    ])\n",
        "    w = 1.0 / (errs + 1e-2)\n",
        "    w = w / w.sum()\n",
        "\n",
        "    # Weighted ensemble\n",
        "    yhat_log = np.zeros(len(Xte))\n",
        "    for weight, (_, model) in zip(w, models):\n",
        "        yhat_log += weight * model.predict(Xte)\n",
        "\n",
        "    return np.expm1(yhat_log).clip(min=0)\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# 8. MAIN PIPELINE\n",
        "# ============================================================\n",
        "def main():\n",
        "    \"\"\"\n",
        "    Run the whole pipeline: extract and load, time-based split, feature engineering,\n",
        "    matrix preparation, evaluation of methods M1 and M3-M7 on the test split, and a\n",
        "    metrics breakdown on the recurring-signature test rows.\n",
        "\n",
        "    Returns (df, idx_te, predictions, results). predictions is keyed by the short method\n",
        "    id (\"M1\", \"M3\", ...) and results by the printed tag (\"M1 CQR\", \"M3 HRAS\", ...).\n",
        "    \"\"\"\n",
        "    banner = \"=\" * 78\n",
        "    print(banner)\n",
        "    print(\"PAI TRACE — JOB DURATION PREDICTION (LEAKAGE-FREE)\")\n",
        "    print(banner)\n",
        "\n",
        "    t0 = time.time()\n",
        "\n",
        "    # Load data\n",
        "    print(\"\\n[1/6] Extracting & loading ...\")\n",
        "    extract_archives()\n",
        "    df = load_tables()\n",
        "    print(f\"  Loaded jobs: {len(df):,}\")\n",
        "\n",
        "    # Time-based splits\n",
        "    print(\"\\n[2/6] Time-based splits ...\")\n",
        "    (idx_tr, idx_va, idx_te), (t_train, t_val) = time_split(df, 0.70, 0.15)\n",
        "    print(f\"  Train: {len(idx_tr):,}  (< {t_train})\")\n",
        "    print(f\"  Val  : {len(idx_va):,}  ([{t_train}, {t_val}))\")\n",
        "    print(f\"  Test : {len(idx_te):,}  (>= {t_val})\")\n",
        "\n",
        "    # Feature engineering (leakage-free!)\n",
        "    print(\"\\n[3/6] Feature engineering ...\")\n",
        "    df, global_mean_log = engineer_all_features(df, idx_tr, idx_va, idx_te)\n",
        "    print(f\"  Feature columns: {df.shape[1]}\")\n",
        "\n",
        "    # Prepare matrices\n",
        "    print(\"\\n[4/6] Preparing matrices ...\")\n",
        "    NUM_FEATS, Xtr, Xva, Xte, ytr_log, yva_log, yte = prepare_matrices(df, idx_tr, idx_va, idx_te)\n",
        "    print(f\"  #Features: {Xtr.shape[1]}\")\n",
        "\n",
        "    # Run methods: (progress label, results key and printed tag, predictions key, runner)\n",
        "    print(\"\\n[5/6] Running methods ...\")\n",
        "    method_runs = [\n",
        "        (\"  M1: CQR ...\", \"M1 CQR\", \"M1\",\n",
        "         lambda: method_cqr(Xtr, Xva, Xte, ytr_log, yva_log, df, idx_va, idx_te)),\n",
        "        (\"  M3: HRAS ...\", \"M3 HRAS\", \"M3\",\n",
        "         lambda: method_hras(df, idx_te)),\n",
        "        (\"  M4: Isotonic ...\", \"M4 Iso\", \"M4\",\n",
        "         lambda: method_isotonic(Xtr, Xva, Xte, ytr_log, yva_log)),\n",
        "        (\"  M5: Meta-Stack ...\", \"M5 Meta\", \"M5\",\n",
        "         lambda: method_meta_stack(Xtr, Xva, Xte, ytr_log, yva_log, df, idx_va, idx_te)),\n",
        "        (\"  M6: Two-Stage ...\", \"M6 Two\", \"M6\",\n",
        "         lambda: method_two_stage(Xtr, Xva, Xte, ytr_log, yva_log)),\n",
        "        (\"  M7: Recency ...\", \"M7 Rec\", \"M7\",\n",
        "         lambda: method_recency(Xtr, Xva, Xte, ytr_log, yva_log, df, idx_tr)),\n",
        "    ]\n",
        "\n",
        "    results, predictions = {}, {}\n",
        "    for progress_label, tag, short_id, run_method in method_runs:\n",
        "        print(progress_label, end=\" \")\n",
        "        yhat = run_method()\n",
        "        results[tag] = print_metrics(tag, yte, yhat)\n",
        "        predictions[short_id] = yhat\n",
        "\n",
        "    # Recurring vs new analysis\n",
        "    print(\"\\n[6/6] Recurring vs new signatures ...\")\n",
        "    rec_mask = df.loc[idx_te, \"is_recurring\"].values.astype(bool)\n",
        "    new_mask = ~rec_mask\n",
        "    print(f\"  Recurring: {rec_mask.sum():,} ({100*rec_mask.mean():.1f}%)\")\n",
        "    print(f\"  New      : {new_mask.sum():,} ({100*new_mask.mean():.1f}%)\")\n",
        "\n",
        "    # Per-method metrics restricted to the recurring test rows (skipped when there are none)\n",
        "    if rec_mask.any():\n",
        "        y_rec = yte[rec_mask]\n",
        "        for k, yhat in predictions.items():\n",
        "            m = metrics(y_rec, yhat[rec_mask])\n",
        "            print(f\"  [{k:8s}] on Recurring  Cov@25% {m['Cov25']:5.1f}% | \"\n",
        "                  f\"Cov@50% {m['Cov50']:5.1f}% | RMSLE {m['RMSLE']:.3f} | MAE {m['MAE']:7.1f}\")\n",
        "\n",
        "    print(\"\\n\" + banner)\n",
        "    print(f\"Total wall time: {time.time()-t0:.1f}s\")\n",
        "    print(banner)\n",
        "\n",
        "    return df, idx_te, predictions, results\n",
        "\n",
        "\n",
        "# Entry point: run the pipeline and keep its outputs as cell-level names so that\n",
        "# later cells can inspect the frame, the test index, the predictions and the metrics.\n",
        "if __name__ == \"__main__\":\n",
        "    df, idx_te, predictions, results = main()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "XImUmqu93uMC",
        "outputId": "827fe4e0-0470-429f-f381-130ed7f6bccf"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "==============================================================================\n",
            "PAI TRACE — JOB DURATION PREDICTION (LEAKAGE-FREE)\n",
            "==============================================================================\n",
            "\n",
            "[1/6] Extracting & loading ...\n",
            "[IO] Extracted files to /content/extracted\n",
            "  Loaded jobs: 732,355\n",
            "\n",
            "[2/6] Time-based splits ...\n",
            "  Train: 512,647  (< 5020579)\n",
            "  Val  : 109,854  ([5020579, 5764872))\n",
            "  Test : 109,854  (>= 5764872)\n",
            "\n",
            "[3/6] Feature engineering ...\n",
            "  Feature columns: 53\n",
            "\n",
            "[4/6] Preparing matrices ...\n",
            "  #Features: 33\n",
            "\n",
            "[5/6] Running methods ...\n",
            "  M1: CQR ... [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048078 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.602821\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048386 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 3.761200\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048959 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.490724\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050364 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 9.469421\n",
            "[M1 CQR          ]  Cov@25%  49.0% | Cov@50%  67.5% | RMSLE 0.944 | MAE  2445.0 | ρ 0.897\n",
            "  M3: HRAS ... [M3 HRAS         ]  Cov@25%  25.6% | Cov@50%  46.9% | RMSLE 1.373 | MAE  3922.3 | ρ 0.801\n",
            "  M4: Isotonic ... [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033176 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.490724\n",
            "[M4 Iso          ]  Cov@25%  45.9% | Cov@50%  67.0% | RMSLE 0.975 | MAE  2449.5 | ρ 0.888\n",
            "  M5: Meta-Stack ... [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047500 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.602821\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044410 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.602821\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032004 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.490724\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030108 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.602821\n",
            "[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001976 seconds.\n",
            "You can set `force_col_wise=true` to remove the overhead.\n",
            "[LightGBM] [Info] Total Bins 1785\n",
            "[LightGBM] [Info] Number of data points in the train set: 109854, number of used features: 7\n",
            "[LightGBM] [Info] Start training from score 6.606521\n",
            "[M5 Meta         ]  Cov@25%  51.2% | Cov@50%  70.6% | RMSLE 0.871 | MAE  2278.7 | ρ 0.912\n",
            "  M6: Two-Stage ... [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035328 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score -1.205249\n",
            "[LightGBM] [Info] Start training from score -0.915344\n",
            "[LightGBM] [Info] Start training from score -1.203960\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011346 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5178\n",
            "[LightGBM] [Info] Number of data points in the train set: 153598, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 4.120841\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011842 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5424\n",
            "[LightGBM] [Info] Number of data points in the train set: 205253, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.524948\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008464 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5580\n",
            "[LightGBM] [Info] Number of data points in the train set: 153796, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 9.185533\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033290 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.602821\n",
            "[M6 Two          ]  Cov@25%  49.8% | Cov@50%  68.8% | RMSLE 0.945 | MAE  2285.9 | ρ 0.897\n",
            "  M7: Recency ... [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032865 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.590434\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016482 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5528\n",
            "[LightGBM] [Info] Number of data points in the train set: 256324, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.549674\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005439 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5515\n",
            "[LightGBM] [Info] Number of data points in the train set: 102531, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.358823\n",
            "[M7 Rec          ]  Cov@25%  39.9% | Cov@50%  63.2% | RMSLE 0.948 | MAE  2788.5 | ρ 0.896\n",
            "\n",
            "[6/6] Recurring vs new signatures ...\n",
            "  Recurring: 89,287 (81.3%)\n",
            "  New      : 20,567 (18.7%)\n",
            "  [M1      ] on Recurring  Cov@25%  57.1% | Cov@50%  76.8% | RMSLE 0.718 | MAE  1797.8\n",
            "  [M3      ] on Recurring  Cov@25%  29.1% | Cov@50%  52.5% | RMSLE 1.153 | MAE  3459.3\n",
            "  [M4      ] on Recurring  Cov@25%  53.4% | Cov@50%  76.2% | RMSLE 0.733 | MAE  1765.9\n",
            "  [M5      ] on Recurring  Cov@25%  58.3% | Cov@50%  77.5% | RMSLE 0.698 | MAE  1657.3\n",
            "  [M6      ] on Recurring  Cov@25%  58.0% | Cov@50%  78.0% | RMSLE 0.716 | MAE  1621.3\n",
            "  [M7      ] on Recurring  Cov@25%  45.5% | Cov@50%  70.9% | RMSLE 0.738 | MAE  2255.6\n",
            "\n",
            "==============================================================================\n",
            "Total wall time: 479.9s\n",
            "==============================================================================\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# ============================================================\n",
        "# ABLATION STUDY\n",
        "# ============================================================\n",
        "\n",
        "def engineer_features_by_config(df, idx_tr, idx_va, idx_te, config=\"all\"):\n",
        "    \"\"\"\n",
        "    Engineer features for different ablation configurations.\n",
        "\n",
        "    Configs:\n",
        "    - \"resources\": Resources + Temporal only (14 features)\n",
        "    - \"group\": Resources + Temporal + Signatures + Group features (27 features)\n",
        "    - \"all\": All features including User (33 features)\n",
        "\n",
        "    The global prior and every label encoder are fitted on the train rows\n",
        "    (idx_tr) only. idx_va / idx_te are only forwarded to\n",
        "    add_causal_histories_no_leak, which is responsible for keeping the\n",
        "    history features leakage-free.\n",
        "\n",
        "    Args:\n",
        "        df: Job table containing \"p_star\" and the raw columns the helpers read.\n",
        "        idx_tr: Index labels of the training rows.\n",
        "        idx_va: Index labels of the validation rows (may be empty).\n",
        "        idx_te: Index labels of the test rows (may be empty).\n",
        "        config: One of \"resources\", \"group\" or \"all\".\n",
        "\n",
        "    Returns:\n",
        "        Tuple (df, feature_names, global_mean_log): the frame with engineered\n",
        "        columns added, the feature list for the chosen config, and the mean of\n",
        "        log1p(p_star) over the training rows.\n",
        "\n",
        "    Raises:\n",
        "        ValueError: If config is not a supported name. Checked up front so a\n",
        "            typo fails before any feature engineering is run.\n",
        "    \"\"\"\n",
        "    # Fail fast: previously an unknown config was only rejected after the\n",
        "    # basic feature engineering had already been executed.\n",
        "    if config not in (\"resources\", \"group\", \"all\"):\n",
        "        raise ValueError(f\"Unknown config: {config}\")\n",
        "\n",
        "    # Global prior (train only)\n",
        "    global_mean_log = np.log1p(df.loc[idx_tr, \"p_star\"]).mean()\n",
        "\n",
        "    # Basic features (resources + temporal) are needed by every config\n",
        "    df = add_basic_features(df)\n",
        "\n",
        "    # Resources + Temporal features (14)\n",
        "    resource_temporal = [\n",
        "        # Resources (9)\n",
        "        \"log_total_plan_cpu\", \"log_total_plan_gpu\", \"log_total_plan_mem\",\n",
        "        \"log_total_inst_num\", \"log_num_tasks\",\n",
        "        \"cpu_per_inst\", \"gpu_per_inst\", \"mem_per_inst\", \"tasks_per_inst\",\n",
        "        # Temporal (5)\n",
        "        \"hour\", \"dow\", \"sin_hour\", \"cos_hour\", \"is_weekend\",\n",
        "    ]\n",
        "\n",
        "    if config == \"resources\":\n",
        "        return df, resource_temporal, global_mean_log\n",
        "\n",
        "    # ============================================\n",
        "    # Group features (for \"group\" and \"all\")\n",
        "    # ============================================\n",
        "    # Signatures are built from train rows only\n",
        "    df = make_signatures(df, idx_tr, global_mean_log)\n",
        "\n",
        "    # Causal histories (group and user columns are both produced here)\n",
        "    df = add_causal_histories_no_leak(df, idx_tr, idx_va, idx_te, global_mean_log)\n",
        "\n",
        "    # Group-level EB shrinkage: pull the group mean towards the global prior,\n",
        "    # with lam acting as the number of pseudo-observations of the prior.\n",
        "    n = df[\"gro_hist_count\"].fillna(0.0)\n",
        "    mu = df[\"gro_hist_mean\"].fillna(global_mean_log)\n",
        "    lam = 5.0\n",
        "    df[\"grp_mean_eb\"] = (n * mu + lam * global_mean_log) / (n + lam)\n",
        "\n",
        "    # Fill NaNs for signature features with the global prior / zero count\n",
        "    for c in [\"sig_mean\", \"sig_median\", \"sig_q25\", \"sig_q75\"]:\n",
        "        if c in df.columns:\n",
        "            df[c] = df[c].fillna(global_mean_log)\n",
        "    df[\"sig_count\"] = df[\"sig_count\"].fillna(0)\n",
        "\n",
        "    # Group / workload encodings are fitted on train rows only\n",
        "    enc_group = fit_label_encoder_from_train(df.loc[idx_tr, \"group\"])\n",
        "    df[\"group_enc\"] = enc_group(df[\"group\"])\n",
        "    enc_workload = fit_label_encoder_from_train(df.loc[idx_tr, \"workload\"])\n",
        "    df[\"workload_enc\"] = enc_workload(df[\"workload\"])\n",
        "\n",
        "    group_feats = resource_temporal + [\n",
        "        # Signatures (6)\n",
        "        \"sig_mean\", \"sig_median\", \"sig_q25\", \"sig_q75\", \"sig_count\", \"sig_mean_shrink\",\n",
        "        # Group histories (4)\n",
        "        \"gro_hist_mean\", \"gro_hist_count\", \"gro_ewm\", \"gro_dt_prev\",\n",
        "        # Group EB (1)\n",
        "        \"grp_mean_eb\",\n",
        "        # Encodings (2)\n",
        "        \"group_enc\", \"workload_enc\",\n",
        "    ]  # 14 + 6 + 4 + 1 + 2 = 27\n",
        "\n",
        "    if config == \"group\":\n",
        "        return df, group_feats, global_mean_log\n",
        "\n",
        "    # ============================================\n",
        "    # User features (for \"all\" only)\n",
        "    # ============================================\n",
        "    # User history columns were already computed by add_causal_histories_no_leak;\n",
        "    # only the missing values need filling here.\n",
        "    for c in [\"use_hist_mean\", \"use_ewm\"]:\n",
        "        df[c] = df[c].fillna(global_mean_log)\n",
        "    for c in [\"use_hist_count\", \"use_dt_prev\"]:\n",
        "        df[c] = df[c].fillna(0.0)\n",
        "\n",
        "    # User / GPU-type encodings are fitted on train rows only\n",
        "    enc_user = fit_label_encoder_from_train(df.loc[idx_tr, \"user\"])\n",
        "    df[\"user_enc\"] = enc_user(df[\"user\"])\n",
        "    enc_gpu = fit_label_encoder_from_train(df.loc[idx_tr, \"gpu_type_spec\"])\n",
        "    df[\"gpu_type_spec_enc\"] = enc_gpu(df[\"gpu_type_spec\"])\n",
        "\n",
        "    all_feats = group_feats + [\n",
        "        # User histories (4)\n",
        "        \"use_hist_mean\", \"use_hist_count\", \"use_ewm\", \"use_dt_prev\",\n",
        "        # Encodings (2)\n",
        "        \"user_enc\", \"gpu_type_spec_enc\",\n",
        "    ]  # 27 + 4 + 2 = 33\n",
        "\n",
        "    return df, all_feats, global_mean_log\n",
        "\n",
        "\n",
        "def train_lgbm_simple(X_tr, y_tr_log, X_te, y_te):\n",
        "    \"\"\"\n",
        "    Fit a fixed-hyperparameter LGBM regressor on log targets and predict X_te.\n",
        "\n",
        "    No validation set or early stopping is used.\n",
        "\n",
        "    Args:\n",
        "        X_tr: Training feature matrix.\n",
        "        y_tr_log: Training targets, already log1p-transformed.\n",
        "        X_te: Feature matrix to predict for.\n",
        "        y_te: Unused; kept so existing call sites keep working.\n",
        "\n",
        "    Returns:\n",
        "        Predictions on the original scale (expm1 of the model output),\n",
        "        clipped below at 0.\n",
        "    \"\"\"\n",
        "    model = LGBMRegressor(\n",
        "        n_estimators=600,\n",
        "        learning_rate=0.05,\n",
        "        num_leaves=63,\n",
        "        min_child_samples=20,\n",
        "        # NOTE(review): LightGBM only bags rows when subsample_freq > 0, so\n",
        "        # subsample=0.8 is probably a no-op here. Left unchanged so the\n",
        "        # reported numbers stay reproducible - confirm before relying on it.\n",
        "        subsample=0.8,\n",
        "        colsample_bytree=0.8,\n",
        "        random_state=RANDOM_STATE,\n",
        "        n_jobs=-1,\n",
        "        # Silence the per-fit \"[LightGBM] [Info]\" lines that otherwise bury the\n",
        "        # analysis output; verbosity does not affect the fitted model.\n",
        "        verbose=-1,\n",
        "    )\n",
        "    model.fit(X_tr, y_tr_log)\n",
        "    yhat_log = model.predict(X_te)\n",
        "    # Back-transform from log1p space and forbid negative predictions\n",
        "    yhat = np.expm1(yhat_log).clip(min=0)\n",
        "    return yhat\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# OVERFITTING ANALYSIS FUNCTIONS\n",
        "# ============================================================\n",
        "\n",
        "def analysis_train_test_gap(df, idx_tr, idx_te):\n",
        "    \"\"\"\n",
        "    Analysis 1: Train-Test Performance Gap (5-fold CV)\n",
        "\n",
        "    Scores the all-features model on held-out folds of the training rows and\n",
        "    compares the mean fold Cov@25% with the Cov@25% of a model trained on all\n",
        "    training rows and evaluated on the test rows.\n",
        "\n",
        "    Args:\n",
        "        df: Raw job table (copied before every feature-engineering call).\n",
        "        idx_tr: Index labels of the training rows.\n",
        "        idx_te: Index labels of the test rows.\n",
        "\n",
        "    Returns:\n",
        "        Dict with \"train_mean\" / \"train_std\" (mean and std of the five fold\n",
        "        scores), \"test\" (test Cov@25%) and \"gap\" (train_mean - test).\n",
        "    \"\"\"\n",
        "    from sklearn.model_selection import KFold\n",
        "\n",
        "    print(\"\\n\" + \"=\"*70)\n",
        "    print(\"[Analysis 1] Train-Test Performance Gap ...\")\n",
        "    print(\"=\"*70)\n",
        "\n",
        "    def _to_matrix(frame, rows, cols):\n",
        "        # Shared cleaning step: +/-inf and NaN both become 0 before modelling.\n",
        "        return frame.loc[rows, cols].replace([np.inf, -np.inf], np.nan).fillna(0).values\n",
        "\n",
        "    # 5-fold CV on training data\n",
        "    # NOTE(review): shuffle=True interleaves early and late jobs across folds\n",
        "    # while the outer split is time-based - confirm this comparison is intended.\n",
        "    kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)\n",
        "    cv_scores = []\n",
        "\n",
        "    for train_pos, val_pos in kf.split(idx_tr):\n",
        "        fold_tr_idx = idx_tr[train_pos]\n",
        "        fold_va_idx = idx_tr[val_pos]\n",
        "\n",
        "        # Engineer features for this fold, treating the held-out fold as validation\n",
        "        df_fold, fold_feats, _ = engineer_features_by_config(\n",
        "            df.copy(), fold_tr_idx, fold_va_idx, pd.Index([]), config=\"all\"\n",
        "        )\n",
        "\n",
        "        X_fold_tr = _to_matrix(df_fold, fold_tr_idx, fold_feats)\n",
        "        y_fold_tr_log = np.log1p(df_fold.loc[fold_tr_idx, \"p_star\"].values)\n",
        "\n",
        "        X_fold_va = _to_matrix(df_fold, fold_va_idx, fold_feats)\n",
        "        y_fold_va = df_fold.loc[fold_va_idx, \"p_star\"].values\n",
        "\n",
        "        # Train and evaluate\n",
        "        yhat_va = train_lgbm_simple(X_fold_tr, y_fold_tr_log, X_fold_va, y_fold_va)\n",
        "        cv_scores.append(coverage_at(y_fold_va, yhat_va, 0.25))\n",
        "\n",
        "    # Test performance. Features are engineered once and shared by the train\n",
        "    # and test matrices (the identical call used to be made twice).\n",
        "    df_final, final_feats, _ = engineer_features_by_config(\n",
        "        df.copy(), idx_tr, pd.Index([]), idx_te, config=\"all\"\n",
        "    )\n",
        "    X_tr_final = _to_matrix(df_final, idx_tr, final_feats)\n",
        "    y_tr_final_log = np.log1p(df_final.loc[idx_tr, \"p_star\"].values)\n",
        "    X_te = _to_matrix(df_final, idx_te, final_feats)\n",
        "    y_te = df_final.loc[idx_te, \"p_star\"].values\n",
        "\n",
        "    yhat_te = train_lgbm_simple(X_tr_final, y_tr_final_log, X_te, y_te)\n",
        "    test_cov25 = coverage_at(y_te, yhat_te, 0.25)\n",
        "\n",
        "    # Summary\n",
        "    print(\"\\n\" + \"=\"*70)\n",
        "    print(\"TRAIN-TEST GAP SUMMARY\")\n",
        "    print(\"=\"*70)\n",
        "    cv_mean = np.mean(cv_scores)\n",
        "    cv_std = np.std(cv_scores)\n",
        "    # Positive gap = held-out folds score higher than the later test period\n",
        "    gap = cv_mean - test_cov25\n",
        "\n",
        "    print(f\"Training Cov@25% (5-fold CV): {cv_mean:.1f}% ± {cv_std:.1f}%\")\n",
        "    print(f\"Test Cov@25%:                 {test_cov25:.1f}%\")\n",
        "    print(f\"Gap:                          {gap:+.1f}%\")\n",
        "\n",
        "    if gap >= 5.0:\n",
        "        print(\"⚠ CHECK - Possible overfitting (gap >= 5%)\")\n",
        "    else:\n",
        "        print(\"✓ PASS - Minimal overfitting (gap < 5%)\")\n",
        "\n",
        "    return {\"train_mean\": cv_mean, \"train_std\": cv_std, \"test\": test_cov25, \"gap\": gap}\n",
        "\n",
        "\n",
        "def analysis_seen_unseen_users(df, idx_tr, idx_te):\n",
        "    \"\"\"\n",
        "    Analysis 2: Seen vs Unseen Users\n",
        "\n",
        "    Splits the test rows by whether their \"user\" value occurs among the\n",
        "    training rows, trains the all-features model once on the training rows,\n",
        "    and reports metrics separately for the two subsets.\n",
        "\n",
        "    Args:\n",
        "        df: Raw job table (copied before feature engineering).\n",
        "        idx_tr: Index labels of the training rows.\n",
        "        idx_te: Index labels of the test rows.\n",
        "\n",
        "    Returns:\n",
        "        Dict {\"seen\": metrics or None, \"unseen\": metrics or None}; an entry is\n",
        "        None when the corresponding subset of the test rows is empty.\n",
        "    \"\"\"\n",
        "    print(\"\\n\" + \"=\"*70)\n",
        "    print(\"[Analysis 2] Seen vs Unseen Users ...\")\n",
        "    print(\"=\"*70)\n",
        "\n",
        "    # Identify train users\n",
        "    train_users = set(df.loc[idx_tr, \"user\"].unique())\n",
        "\n",
        "    # Split test by user presence (only the user column is needed, so the\n",
        "    # test frame is not copied)\n",
        "    user_seen = df.loc[idx_te, \"user\"].isin(train_users)\n",
        "    idx_te_seen = user_seen.index[user_seen.values]\n",
        "    idx_te_unseen = user_seen.index[~user_seen.values]\n",
        "\n",
        "    n_te = len(idx_te)\n",
        "    denom = max(n_te, 1)  # keeps the percentages defined for an empty test split\n",
        "\n",
        "    print(f\"\\nTotal train users: {len(train_users):,}\")\n",
        "    print(f\"Total test jobs: {n_te:,}\")\n",
        "    print(f\"  - From SEEN users: {len(idx_te_seen):,} ({100*len(idx_te_seen)/denom:.1f}%)\")\n",
        "    print(f\"  - From UNSEEN users: {len(idx_te_unseen):,} ({100*len(idx_te_unseen)/denom:.1f}%)\")\n",
        "\n",
        "    # Train model\n",
        "    df_eng, feats, _ = engineer_features_by_config(\n",
        "        df.copy(), idx_tr, pd.Index([]), idx_te, config=\"all\"\n",
        "    )\n",
        "\n",
        "    X_tr = df_eng.loc[idx_tr, feats].replace([np.inf, -np.inf], np.nan).fillna(0).values\n",
        "    y_tr_log = np.log1p(df_eng.loc[idx_tr, \"p_star\"].values)\n",
        "\n",
        "    m_seen = None\n",
        "    m_unseen = None\n",
        "\n",
        "    if n_te > 0:\n",
        "        # Fit once and predict every test row. The training data is the same\n",
        "        # for both subsets, so fitting a separate model per subset (as was done\n",
        "        # before) only doubled the training time.\n",
        "        X_te = df_eng.loc[idx_te, feats].replace([np.inf, -np.inf], np.nan).fillna(0)\n",
        "        y_te = df_eng.loc[idx_te, \"p_star\"]\n",
        "        yhat_te = pd.Series(\n",
        "            train_lgbm_simple(X_tr, y_tr_log, X_te.values, y_te.values),\n",
        "            index=X_te.index,\n",
        "        )\n",
        "\n",
        "        # Evaluate on seen users (label-based selection keeps rows aligned)\n",
        "        if len(idx_te_seen) > 0:\n",
        "            m_seen = metrics(y_te.loc[idx_te_seen].values, yhat_te.loc[idx_te_seen].values)\n",
        "\n",
        "        # Evaluate on unseen users\n",
        "        if len(idx_te_unseen) > 0:\n",
        "            m_unseen = metrics(y_te.loc[idx_te_unseen].values, yhat_te.loc[idx_te_unseen].values)\n",
        "\n",
        "    # Summary\n",
        "    print(\"\\n\" + \"=\"*70)\n",
        "    print(\"SEEN vs UNSEEN USERS ANALYSIS\")\n",
        "    print(\"=\"*70)\n",
        "    print(\"\\nBest Model:\")\n",
        "    if m_seen:\n",
        "        print(f\"  Seen users    - Cov@25%: {m_seen['Cov25']:5.1f}% | Cov@50%: {m_seen['Cov50']:5.1f}% | RMSLE: {m_seen['RMSLE']:.3f}\")\n",
        "    if m_unseen:\n",
        "        print(f\"  Unseen users  - Cov@25%: {m_unseen['Cov25']:5.1f}% | Cov@50%: {m_unseen['Cov50']:5.1f}% | RMSLE: {m_unseen['RMSLE']:.3f}\")\n",
        "\n",
        "    if m_seen and m_unseen:\n",
        "        gap = m_seen['Cov25'] - m_unseen['Cov25']\n",
        "        print(f\"  Gap           - {gap:+.1f}% (positive = better on seen)\")\n",
        "\n",
        "    return {\"seen\": m_seen, \"unseen\": m_unseen}\n",
        "\n",
        "\n",
        "def analysis_feature_ablation(df, idx_tr, idx_te):\n",
        "    \"\"\"\n",
        "    Analysis 3: Feature Ablation Study\n",
        "\n",
        "    Trains one model per feature configuration (\"resources\", \"group\", \"all\")\n",
        "    on the training rows, scores each on the test rows, prints a comparison\n",
        "    table plus the incremental Cov@25% gains, and returns the metrics of each\n",
        "    configuration keyed by its name.\n",
        "    \"\"\"\n",
        "    banner = \"=\" * 70\n",
        "    print(\"\\n\" + banner)\n",
        "    print(\"[Analysis 3] Feature Ablation ...\")\n",
        "    print(banner)\n",
        "\n",
        "    ablation_configs = (\n",
        "        (\"resources\", \"Resources only (no User, no Group)\"),\n",
        "        (\"group\", \"Group + Resources (no User)\"),\n",
        "        (\"all\", \"All features (User + Group + Resources)\"),\n",
        "    )\n",
        "\n",
        "    results = {}\n",
        "    for config, desc in ablation_configs:\n",
        "        print(f\"\\n[Ablation] {desc} ...\")\n",
        "\n",
        "        # Fresh copy per configuration so engineered columns never carry over\n",
        "        df_eng, feats, _ = engineer_features_by_config(\n",
        "            df.copy(), idx_tr, pd.Index([]), idx_te, config=config\n",
        "        )\n",
        "        print(f\"  Features: {len(feats)}\")\n",
        "\n",
        "        # Design matrices: infinities and NaNs are both mapped to 0\n",
        "        train_frame = df_eng.loc[idx_tr, feats]\n",
        "        test_frame = df_eng.loc[idx_te, feats]\n",
        "        X_tr = train_frame.replace([np.inf, -np.inf], np.nan).fillna(0).values\n",
        "        X_te = test_frame.replace([np.inf, -np.inf], np.nan).fillna(0).values\n",
        "        y_tr_log = np.log1p(df_eng.loc[idx_tr, \"p_star\"].values)\n",
        "        y_te = df_eng.loc[idx_te, \"p_star\"].values\n",
        "\n",
        "        # Fit, predict, then report and store the metrics\n",
        "        yhat = train_lgbm_simple(X_tr, y_tr_log, X_te, y_te)\n",
        "        results[config] = print_metrics(config[:15], y_te, yhat)\n",
        "\n",
        "    # Summary table\n",
        "    print(\"\\n\" + banner)\n",
        "    print(\"ABLATION SUMMARY\")\n",
        "    print(banner)\n",
        "    print(f\"{'Configuration':<30} {'Cov@25%':<10} {'Cov@50%':<10} {'RMSLE':<10}\")\n",
        "    print(\"-\" * 70)\n",
        "    table_rows = (\n",
        "        (\"Resources only\", \"resources\"),\n",
        "        (\"Group + Resources\", \"group\"),\n",
        "        (\"All features\", \"all\"),\n",
        "    )\n",
        "    for label, key in table_rows:\n",
        "        row = results[key]\n",
        "        print(f\"{label:<30} {row['Cov25']:5.1f}%     {row['Cov50']:5.1f}%     {row['RMSLE']:.3f}\")\n",
        "\n",
        "    # Incremental gains of each feature family over the previous configuration\n",
        "    cov_resources = results[\"resources\"][\"Cov25\"]\n",
        "    cov_group = results[\"group\"][\"Cov25\"]\n",
        "    cov_all = results[\"all\"][\"Cov25\"]\n",
        "    group_gain = cov_group - cov_resources\n",
        "    user_gain = cov_all - cov_group\n",
        "\n",
        "    print(f\"\\n→ Group adds: {group_gain:+.1f}%\")\n",
        "    print(f\"→ User adds:  {user_gain:+.1f}%\")\n",
        "\n",
        "    primary = \"Group\" if group_gain > abs(user_gain) else \"User\"\n",
        "    print(f\"→ {primary} is the PRIMARY predictive signal\")\n",
        "\n",
        "    return results\n",
        "\n",
        "\n",
        "# ============================================================\n",
        "# MAIN OVERFITTING ANALYSIS\n",
        "# ============================================================\n",
        "\n",
        "def run_overfitting_analysis():\n",
        "    \"\"\"\n",
        "    Run the overfitting study end to end.\n",
        "\n",
        "    Steps: extract and load the trace, build the time-based split, engineer\n",
        "    the \"all\" feature set, train and score the baseline LGBM on the test rows,\n",
        "    run the three analyses, and print a consolidated summary.\n",
        "\n",
        "    Returns:\n",
        "        Tuple (df, idx_te, results) where results maps \"gap\", \"users\" and\n",
        "        \"ablation\" to the outputs of the three analyses.\n",
        "    \"\"\"\n",
        "    rule = \"=\" * 78\n",
        "\n",
        "    print(rule)\n",
        "    print(\"PAI TRACE — OVERFITTING ANALYSIS\")\n",
        "    print(rule)\n",
        "\n",
        "    t0 = time.time()\n",
        "\n",
        "    # Stage 1: load the data\n",
        "    print(\"\\n[1/5] Extracting & loading ...\")\n",
        "    extract_archives()\n",
        "    df = load_tables()\n",
        "    print(f\"  Loaded jobs: {len(df):,}\")\n",
        "\n",
        "    # Stage 2: chronological split (the cut-off values are not used below)\n",
        "    print(\"\\n[2/5] Time-based splits ...\")\n",
        "    (idx_tr, idx_va, idx_te), (_t_train, _t_val) = time_split(df, 0.70, 0.15)\n",
        "    for split_name, split_idx in ((\"Train\", idx_tr), (\"Val  \", idx_va), (\"Test \", idx_te)):\n",
        "        print(f\"  {split_name}: {len(split_idx):,}\")\n",
        "\n",
        "    # Stage 3: feature engineering for the baseline model\n",
        "    print(\"\\n[3/5] Feature engineering ...\")\n",
        "    df_eng, feats, gm = engineer_features_by_config(\n",
        "        df.copy(), idx_tr, pd.Index([]), idx_te, config=\"all\"\n",
        "    )\n",
        "    print(f\"  Feature columns: {df_eng.shape[1]}\")\n",
        "\n",
        "    def _design_matrix(rows):\n",
        "        # Infinities and NaNs are both mapped to 0 before modelling\n",
        "        return df_eng.loc[rows, feats].replace([np.inf, -np.inf], np.nan).fillna(0).values\n",
        "\n",
        "    # Stage 4: baseline model on the full training rows\n",
        "    print(\"\\n[4/5] Training best model (LGBM with all features) ...\")\n",
        "    X_tr = _design_matrix(idx_tr)\n",
        "    y_tr_log = np.log1p(df_eng.loc[idx_tr, \"p_star\"].values)\n",
        "    X_te = _design_matrix(idx_te)\n",
        "    y_te = df_eng.loc[idx_te, \"p_star\"].values\n",
        "\n",
        "    yhat_te = train_lgbm_simple(X_tr, y_tr_log, X_te, y_te)\n",
        "\n",
        "    print(\"\\nBest Model (All Features) on Test Set:\")\n",
        "    print_metrics(\"Test\", y_te, yhat_te)\n",
        "\n",
        "    # Stage 5: the three analyses, each working from the raw frame\n",
        "    print(\"\\n\" + rule)\n",
        "    print(\"[5/5] OVERFITTING ANALYSES\")\n",
        "    print(rule)\n",
        "\n",
        "    gap_report = analysis_train_test_gap(df, idx_tr, idx_te)\n",
        "    user_report = analysis_seen_unseen_users(df, idx_tr, idx_te)\n",
        "    ablation_report = analysis_feature_ablation(df, idx_tr, idx_te)\n",
        "\n",
        "    # Final summary\n",
        "    print(\"\\n\" + rule)\n",
        "    print(\"FINAL SUMMARY FOR REBUTTAL\")\n",
        "    print(rule)\n",
        "\n",
        "    severity = \"Minimal\" if gap_report[\"gap\"] < 5 else \"Moderate\"\n",
        "    print(f\"\\n1. Train-Test Gap ({severity} Overfitting):\")\n",
        "    print(f\"   Train: {gap_report['train_mean']:.1f}% | Test: {gap_report['test']:.1f}% | Gap: {gap_report['gap']:+.1f}%\")\n",
        "\n",
        "    print(\"\\n2. Seen vs Unseen Users (Generalization Test):\")\n",
        "    seen_m = user_report[\"seen\"]\n",
        "    unseen_m = user_report[\"unseen\"]\n",
        "    if seen_m and unseen_m:\n",
        "        user_gap = seen_m[\"Cov25\"] - unseen_m[\"Cov25\"]\n",
        "        print(\"   Best Model:\")\n",
        "        print(f\"     Seen: {seen_m['Cov25']:.1f}% | Unseen: {unseen_m['Cov25']:.1f}% | Gap: {user_gap:+.1f}%\")\n",
        "\n",
        "    print(\"\\n3. Ablation Study (Feature Importance Hierarchy):\")\n",
        "    cov_resources = ablation_report[\"resources\"][\"Cov25\"]\n",
        "    cov_group = ablation_report[\"group\"][\"Cov25\"]\n",
        "    cov_all = ablation_report[\"all\"][\"Cov25\"]\n",
        "    print(f\"   Resources only:     {cov_resources:.1f}%\")\n",
        "    print(f\"   + Group:            {cov_group:.1f}%\")\n",
        "    print(f\"   + Group + User:     {cov_all:.1f}%\")\n",
        "\n",
        "    group_gain = cov_group - cov_resources\n",
        "    user_gain = cov_all - cov_group\n",
        "    primary = \"Group\" if group_gain > abs(user_gain) else \"User\"\n",
        "    print(f\"\\n   → Group adds: {group_gain:+.1f}%\")\n",
        "    print(f\"   → User adds:  {user_gain:+.1f}%\")\n",
        "    print(f\"   → {primary} is the PRIMARY predictive signal\")\n",
        "\n",
        "    print(\"\\n\" + rule)\n",
        "    print(f\"Total wall time: {time.time()-t0:.1f}s\")\n",
        "    print(rule)\n",
        "\n",
        "    return df, idx_te, {\"gap\": gap_report, \"users\": user_report, \"ablation\": ablation_report}\n",
        "\n",
        "\n",
        "if __name__ == \"__main__\":\n",
        "    # Entry point: run the whole study and keep the raw frame, the test\n",
        "    # index and the analysis results available for later cells.\n",
        "    (df, idx_te, results) = run_overfitting_analysis()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4EaA2oCQTsD1",
        "outputId": "7afc2fc6-6612-4317-92cf-897ffef6a5cb"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "==============================================================================\n",
            "PAI TRACE — OVERFITTING ANALYSIS\n",
            "==============================================================================\n",
            "\n",
            "[1/5] Extracting & loading ...\n",
            "[IO] Extracted files to /content/extracted\n",
            "  Loaded jobs: 732,355\n",
            "\n",
            "[2/5] Time-based splits ...\n",
            "  Train: 512,647\n",
            "  Val  : 109,854\n",
            "  Test : 109,854\n",
            "\n",
            "[3/5] Feature engineering ...\n",
            "  Feature columns: 55\n",
            "\n",
            "[4/5] Training best model (LGBM with all features) ...\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027580 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.602821\n",
            "\n",
            "Best Model (All Features) on Test Set:\n",
            "[Test            ]  Cov@25%  45.8% | Cov@50%  66.7% | RMSLE 0.942 | MAE  2471.2 | ρ 0.900\n",
            "\n",
            "==============================================================================\n",
            "[5/5] OVERFITTING ANALYSES\n",
            "==============================================================================\n",
            "\n",
            "======================================================================\n",
            "[Analysis 1] Train-Test Performance Gap ...\n",
            "======================================================================\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021549 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5558\n",
            "[LightGBM] [Info] Number of data points in the train set: 410117, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.601439\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021258 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5577\n",
            "[LightGBM] [Info] Number of data points in the train set: 410117, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.603714\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016995 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5576\n",
            "[LightGBM] [Info] Number of data points in the train set: 410118, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.598918\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021440 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5577\n",
            "[LightGBM] [Info] Number of data points in the train set: 410118, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.605653\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022093 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5572\n",
            "[LightGBM] [Info] Number of data points in the train set: 410118, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.604379\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027481 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.602821\n",
            "\n",
            "======================================================================\n",
            "TRAIN-TEST GAP SUMMARY\n",
            "======================================================================\n",
            "Training Cov@25% (5-fold CV): 46.7% ± 1.3%\n",
            "Test Cov@25%:                 45.8%\n",
            "Gap:                          +1.0%\n",
            "✓ PASS - Minimal overfitting (gap < 5%)\n",
            "\n",
            "======================================================================\n",
            "[Analysis 2] Seen vs Unseen Users ...\n",
            "======================================================================\n",
            "\n",
            "Total train users: 1,194\n",
            "Total test jobs: 109,854\n",
            "  - From SEEN users: 107,877 (98.2%)\n",
            "  - From UNSEEN users: 1,977 (1.8%)\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022634 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.602821\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022225 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.602821\n",
            "\n",
            "======================================================================\n",
            "SEEN vs UNSEEN USERS ANALYSIS\n",
            "======================================================================\n",
            "\n",
            "Best Model:\n",
            "  Seen users    - Cov@25%:  45.9% | Cov@50%:  66.8% | RMSLE: 0.941\n",
            "  Unseen users  - Cov@25%:  40.1% | Cov@50%:  58.8% | RMSLE: 0.972\n",
            "  Gap           - +5.7% (positive = better on seen)\n",
            "\n",
            "======================================================================\n",
            "[Analysis 3] Feature Ablation ...\n",
            "======================================================================\n",
            "\n",
            "[Ablation] Resources only (no User, no Group) ...\n",
            "  Features: 14\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018189 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 1276\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 14\n",
            "[LightGBM] [Info] Start training from score 6.645564\n",
            "[resources       ]  Cov@25%  14.9% | Cov@50%  28.8% | RMSLE 1.857 | MAE  3656.4 | ρ 0.452\n",
            "\n",
            "[Ablation] Group + Resources (no User) ...\n",
            "  Features: 27\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022649 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 4304\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 27\n",
            "[LightGBM] [Info] Start training from score 6.602821\n",
            "[group           ]  Cov@25%  35.1% | Cov@50%  53.1% | RMSLE 1.075 | MAE  2943.2 | ρ 0.861\n",
            "\n",
            "[Ablation] All features (User + Group + Resources) ...\n",
            "  Features: 33\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019939 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 5583\n",
            "[LightGBM] [Info] Number of data points in the train set: 512647, number of used features: 33\n",
            "[LightGBM] [Info] Start training from score 6.602821\n",
            "[all             ]  Cov@25%  45.8% | Cov@50%  66.7% | RMSLE 0.942 | MAE  2471.2 | ρ 0.900\n",
            "\n",
            "======================================================================\n",
            "ABLATION SUMMARY\n",
            "======================================================================\n",
            "Configuration                  Cov@25%    Cov@50%    RMSLE     \n",
            "----------------------------------------------------------------------\n",
            "Resources only                  14.9%      28.8%     1.857\n",
            "Group + Resources               35.1%      53.1%     1.075\n",
            "All features                    45.8%      66.7%     0.942\n",
            "\n",
            "→ Group adds: +20.2%\n",
            "→ User adds:  +10.6%\n",
            "→ Group is the PRIMARY predictive signal\n",
            "\n",
            "==============================================================================\n",
            "FINAL SUMMARY FOR REBUTTAL\n",
            "==============================================================================\n",
            "\n",
            "1. Train-Test Gap (Minimal Overfitting):\n",
            "   Train: 46.7% | Test: 45.8% | Gap: +1.0%\n",
            "\n",
            "2. Seen vs Unseen Users (Generalization Test):\n",
            "   Best Model:\n",
            "     Seen: 45.9% | Unseen: 40.1% | Gap: +5.7%\n",
            "\n",
            "3. Ablation Study (Feature Importance Hierarchy):\n",
            "   Resources only:     14.9%\n",
            "   + Group:            35.1%\n",
            "   + Group + User:     45.8%\n",
            "\n",
            "   → Group adds: +20.2%\n",
            "   → User adds:  +10.6%\n",
            "   → Group is the PRIMARY predictive signal\n",
            "\n",
            "==============================================================================\n",
            "Total wall time: 2715.2s\n",
            "==============================================================================\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# =====================================================================\n",
        "# TOTAL COMPLETION TIME SCHEDULING EVALUATION\n",
        "# =====================================================================\n",
        "\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import heapq\n",
        "from dataclasses import dataclass\n",
        "from typing import List, Dict, Tuple, Optional\n",
        "from scipy.stats import spearmanr\n",
        "import time\n",
        "\n",
        "# Numerical tolerance shared by the simulators below: used when admitting arrivals\n",
        "# (arrival_time <= t + EPS), when testing remaining work against zero, and as a\n",
        "# floor for the slowdown denominator.\n",
        "EPS = 1e-12\n",
        "\n",
        "\n",
        "@dataclass\n",
        "class Job:\n",
        "    \"\"\"One job as consumed by the single-machine scheduling simulators in this cell.\"\"\"\n",
        "    job_id: int           # identifier; also the last numeric tie-breaker in the simulators' heap keys\n",
        "    arrival_time: float   # r_j: release time; simulators admit the job once the clock reaches it\n",
        "    true_size: float      # p*_j: actual amount of work the job needs\n",
        "    predicted_size: float # \\hat p_j: predicted size; used for ordering by SPJF and PRR\n",
        "\n",
        "def build_jobs_from_df(scheduling_df: pd.DataFrame) -> List[Job]:\n",
        "    \"\"\"Turn every row of ``scheduling_df`` into a ``Job``.\n",
        "\n",
        "    Reads the columns ``job_id``, ``submit_time``, ``p_star`` and ``y_pred`` and\n",
        "    casts them to plain ``int`` / ``float`` values. Jobs are returned in\n",
        "    dataframe row order.\n",
        "    \"\"\"\n",
        "    converted: List[Job] = []\n",
        "    for record in scheduling_df.itertuples(index=False):\n",
        "        converted.append(\n",
        "            Job(\n",
        "                job_id=int(record.job_id),\n",
        "                arrival_time=float(record.submit_time),\n",
        "                true_size=float(record.p_star),\n",
        "                predicted_size=float(record.y_pred),\n",
        "            )\n",
        "        )\n",
        "    return converted\n",
        "\n",
        "\n",
        "def simulate_srpt(jobs: List[Job]) -> Tuple[float, pd.DataFrame]:\n",
        "    \"\"\"Optimal ΣC_j with release times (preemptive SRPT on true remaining time).\n",
        "\n",
        "    Event-driven simulation on one machine: the ready job with the least true\n",
        "    remaining work runs until it finishes or the next job arrives, whichever\n",
        "    comes first. A job whose remaining work is already <= EPS when it is\n",
        "    selected (e.g. ``true_size == 0``) completes immediately at the current time.\n",
        "\n",
        "    Args:\n",
        "        jobs: Jobs to schedule. ``predicted_size`` is ignored.\n",
        "\n",
        "    Returns:\n",
        "        ``(total_completion, metrics)``: the sum of completion times and a\n",
        "        DataFrame with one row per job and the columns ``job_id``,\n",
        "        ``arrival_time``, ``completion_time``, ``flow_time``, ``wait_time``\n",
        "        and ``slowdown``. Rows are appended in completion order.\n",
        "    \"\"\"\n",
        "    J = sorted(jobs, key=lambda x: x.arrival_time)\n",
        "    n, i, t, totalC = len(J), 0, 0.0, 0.0\n",
        "    metrics = []\n",
        "    # Heap entries are (remaining, arrival, job_id, job): smallest remaining work\n",
        "    # first, with arrival time and job_id as tie-breakers.\n",
        "    ready: List[Tuple[float, float, int, Job]] = []\n",
        "\n",
        "    while i < n or ready:\n",
        "        # Machine idle: jump the clock to the next release time.\n",
        "        if not ready and i < n and t < J[i].arrival_time:\n",
        "            t = J[i].arrival_time\n",
        "        # Admit everything that has arrived by now.\n",
        "        while i < n and J[i].arrival_time <= t + EPS:\n",
        "            heapq.heappush(ready, (J[i].true_size, J[i].arrival_time, J[i].job_id, J[i]))\n",
        "            i += 1\n",
        "        if not ready:\n",
        "            continue\n",
        "        rem, rj, jid, job = heapq.heappop(ready)\n",
        "        next_arrival = J[i].arrival_time if i < n else float('inf')\n",
        "        # Run until this job finishes or the next arrival may preempt it.\n",
        "        run = min(rem, next_arrival - t)\n",
        "        if run > EPS:\n",
        "            t += run\n",
        "            rem -= run\n",
        "        elif rem > EPS:\n",
        "            # Unfinished job but no usable time before the next arrival:\n",
        "            # put it back and move the clock to that arrival.\n",
        "            heapq.heappush(ready, (rem, rj, jid, job))\n",
        "            t = next_arrival\n",
        "            continue\n",
        "        # Otherwise rem <= EPS already (e.g. a zero-size job): fall through and\n",
        "        # complete it at t. Re-queuing such a job would never make progress and\n",
        "        # would push the clock to infinity once no arrivals are left.\n",
        "        if rem <= EPS:\n",
        "            Cj = t\n",
        "            totalC += Cj\n",
        "            metrics.append({\n",
        "                'job_id': jid, 'arrival_time': rj, 'completion_time': Cj,\n",
        "                'flow_time': Cj - rj, 'wait_time': Cj - rj - job.true_size,\n",
        "                'slowdown': (Cj - rj) / max(job.true_size, EPS),\n",
        "            })\n",
        "        else:\n",
        "            heapq.heappush(ready, (rem, rj, jid, job))\n",
        "\n",
        "    return totalC, pd.DataFrame(metrics)\n",
        "\n",
        "def simulate_fifo(jobs: List[Job]) -> Tuple[float, pd.DataFrame]:\n",
        "    \"\"\"First-come-first-served on one machine, without preemption.\n",
        "\n",
        "    Jobs are served in order of ``arrival_time``; each one runs for its\n",
        "    ``true_size`` once the machine is free and the job has arrived.\n",
        "\n",
        "    Returns:\n",
        "        ``(total_completion, metrics)``: the sum of completion times and a\n",
        "        per-job DataFrame (job_id, arrival_time, completion_time, flow_time,\n",
        "        wait_time, slowdown) in service order.\n",
        "    \"\"\"\n",
        "    clock = 0.0\n",
        "    completion_sum = 0.0\n",
        "    rows = []\n",
        "\n",
        "    for job in sorted(jobs, key=lambda j: j.arrival_time):\n",
        "        release = job.arrival_time\n",
        "        # The machine may sit idle until the job is released.\n",
        "        begin = release if clock < release else clock\n",
        "        clock = begin + job.true_size\n",
        "        completion_sum += clock\n",
        "        flow = clock - release\n",
        "        rows.append({\n",
        "            'job_id': job.job_id,\n",
        "            'arrival_time': release,\n",
        "            'completion_time': clock,\n",
        "            'flow_time': flow,\n",
        "            'wait_time': begin - release,\n",
        "            'slowdown': flow / max(job.true_size, EPS),\n",
        "        })\n",
        "\n",
        "    return completion_sum, pd.DataFrame(rows)\n",
        "\n",
        "def simulate_sjf_or_spjf(jobs: List[Job], use_predictions: bool) -> Tuple[float, pd.DataFrame]:\n",
        "    \"\"\"Non-preemptive shortest-job-first on one machine.\n",
        "\n",
        "    With ``use_predictions=False`` the waiting job with the smallest\n",
        "    ``true_size`` is started next (SJF); with ``use_predictions=True`` the\n",
        "    smallest ``predicted_size`` is used instead (SPJF). A started job always\n",
        "    runs for its ``true_size``.\n",
        "\n",
        "    Returns:\n",
        "        ``(total_completion, metrics)``: the sum of completion times and a\n",
        "        per-job DataFrame (job_id, arrival_time, completion_time, flow_time,\n",
        "        wait_time, slowdown) in service order.\n",
        "    \"\"\"\n",
        "    ordered = sorted(jobs, key=lambda j: j.arrival_time)\n",
        "    total_jobs = len(ordered)\n",
        "    cursor = 0\n",
        "    clock = 0.0\n",
        "    completion_sum = 0.0\n",
        "    rows = []\n",
        "    # Heap entries: (ordering key, arrival, job_id, job).\n",
        "    queue: List[Tuple[float, float, int, Job]] = []\n",
        "    size_attr = 'predicted_size' if use_predictions else 'true_size'\n",
        "\n",
        "    while cursor < total_jobs or queue:\n",
        "        # Nothing waiting: skip ahead to the next release.\n",
        "        if not queue and cursor < total_jobs and clock < ordered[cursor].arrival_time:\n",
        "            clock = ordered[cursor].arrival_time\n",
        "        while cursor < total_jobs and ordered[cursor].arrival_time <= clock + EPS:\n",
        "            arriving = ordered[cursor]\n",
        "            heapq.heappush(\n",
        "                queue,\n",
        "                (getattr(arriving, size_attr), arriving.arrival_time, arriving.job_id, arriving),\n",
        "            )\n",
        "            cursor += 1\n",
        "        if queue:\n",
        "            _, release, jid, job = heapq.heappop(queue)\n",
        "            begin = clock\n",
        "            clock = begin + job.true_size\n",
        "            completion_sum += clock\n",
        "            flow = clock - release\n",
        "            rows.append({\n",
        "                'job_id': jid,\n",
        "                'arrival_time': release,\n",
        "                'completion_time': clock,\n",
        "                'flow_time': flow,\n",
        "                'wait_time': begin - release,\n",
        "                'slowdown': flow / max(job.true_size, EPS),\n",
        "            })\n",
        "\n",
        "    return completion_sum, pd.DataFrame(rows)\n",
        "\n",
        "def simulate_rr(jobs: List[Job]) -> Tuple[float, pd.DataFrame]:\n",
        "    \"\"\"Round Robin modelled as equal processor sharing (preemptive).\n",
        "\n",
        "    While ``k`` jobs are active each of them progresses at rate ``1/k``. The\n",
        "    clock advances from event to event, an event being either the earliest\n",
        "    completion among the active jobs or the next arrival.\n",
        "\n",
        "    Returns:\n",
        "        ``(total_completion, metrics)``: the sum of completion times and a\n",
        "        per-job DataFrame (job_id, arrival_time, completion_time, flow_time,\n",
        "        wait_time, slowdown).\n",
        "    \"\"\"\n",
        "    ordered = sorted(jobs, key=lambda j: j.arrival_time)\n",
        "    total_jobs = len(ordered)\n",
        "    cursor = 0\n",
        "    clock = 0.0\n",
        "    completion_sum = 0.0\n",
        "    rows = []\n",
        "    # (job, remaining work) for every job currently sharing the machine.\n",
        "    running: List[Tuple[Job, float]] = []\n",
        "\n",
        "    while cursor < total_jobs or running:\n",
        "        if not running and cursor < total_jobs and clock < ordered[cursor].arrival_time:\n",
        "            clock = ordered[cursor].arrival_time\n",
        "        while cursor < total_jobs and ordered[cursor].arrival_time <= clock + EPS:\n",
        "            running.append((ordered[cursor], ordered[cursor].true_size))\n",
        "            cursor += 1\n",
        "        if not running:\n",
        "            continue\n",
        "\n",
        "        share = 1.0 / len(running)\n",
        "        earliest_finish = min(left / share for _, left in running)\n",
        "        upcoming = ordered[cursor].arrival_time if cursor < total_jobs else float('inf')\n",
        "        step = min(earliest_finish, upcoming - clock)\n",
        "        clock += step\n",
        "\n",
        "        # Completions are only recorded when the step ran up to a finish event.\n",
        "        ends_with_completion = step == earliest_finish\n",
        "        running = [(job, left - share * step) for job, left in running]\n",
        "        done = [\n",
        "            pos for pos, (_, left) in enumerate(running)\n",
        "            if left <= EPS and ends_with_completion\n",
        "        ]\n",
        "        # Pop from the back so earlier positions stay valid.\n",
        "        for pos in reversed(done):\n",
        "            job, _ = running.pop(pos)\n",
        "            flow = clock - job.arrival_time\n",
        "            completion_sum += clock\n",
        "            rows.append({\n",
        "                'job_id': job.job_id,\n",
        "                'arrival_time': job.arrival_time,\n",
        "                'completion_time': clock,\n",
        "                'flow_time': flow,\n",
        "                'wait_time': flow - job.true_size,\n",
        "                'slowdown': flow / max(job.true_size, EPS),\n",
        "            })\n",
        "\n",
        "    return completion_sum, pd.DataFrame(rows)\n",
        "\n",
        "def simulate_prr(jobs: List[Job], lam: float = 0.7) -> Tuple[float, pd.DataFrame]:\n",
        "    \"\"\"Preferential Round Robin: processor sharing biased towards one job.\n",
        "\n",
        "    A fraction ``1 - lam`` of the machine is split evenly over all active\n",
        "    jobs; the remaining ``lam`` goes to the active job with the smallest\n",
        "    ``predicted_size``. Progress is always measured against ``true_size``.\n",
        "\n",
        "    Args:\n",
        "        jobs: Jobs to schedule.\n",
        "        lam: Share given to the preferred job; must satisfy ``0 < lam < 1``\n",
        "            (checked with ``assert``).\n",
        "\n",
        "    Returns:\n",
        "        ``(total_completion, metrics)``: the sum of completion times and a\n",
        "        per-job DataFrame (job_id, arrival_time, completion_time, flow_time,\n",
        "        wait_time, slowdown).\n",
        "    \"\"\"\n",
        "    assert 0.0 < lam < 1.0\n",
        "    ordered = sorted(jobs, key=lambda j: j.arrival_time)\n",
        "    total_jobs = len(ordered)\n",
        "    cursor = 0\n",
        "    clock = 0.0\n",
        "    completion_sum = 0.0\n",
        "    rows = []\n",
        "    # (job, remaining work) for every job currently sharing the machine.\n",
        "    running: List[Tuple[Job, float]] = []\n",
        "\n",
        "    while cursor < total_jobs or running:\n",
        "        if not running and cursor < total_jobs and clock < ordered[cursor].arrival_time:\n",
        "            clock = ordered[cursor].arrival_time\n",
        "        while cursor < total_jobs and ordered[cursor].arrival_time <= clock + EPS:\n",
        "            running.append((ordered[cursor], ordered[cursor].true_size))\n",
        "            cursor += 1\n",
        "        if not running:\n",
        "            continue\n",
        "\n",
        "        count = len(running)\n",
        "        # First active job with the smallest prediction receives the extra share.\n",
        "        favoured = min(range(count), key=lambda pos: running[pos][0].predicted_size)\n",
        "        even_share = (1.0 - lam) / count\n",
        "        speeds = [even_share + lam if pos == favoured else even_share for pos in range(count)]\n",
        "\n",
        "        earliest_finish = min(\n",
        "            left / max(speed, EPS) for (_, left), speed in zip(running, speeds)\n",
        "        )\n",
        "        upcoming = ordered[cursor].arrival_time if cursor < total_jobs else float('inf')\n",
        "        step = min(earliest_finish, upcoming - clock)\n",
        "        clock += step\n",
        "\n",
        "        # Completions are only recorded when the step ran up to a finish event.\n",
        "        ends_with_completion = step == earliest_finish\n",
        "        running = [\n",
        "            (job, left - speed * step) for (job, left), speed in zip(running, speeds)\n",
        "        ]\n",
        "        done = [\n",
        "            pos for pos, (_, left) in enumerate(running)\n",
        "            if left <= EPS and ends_with_completion\n",
        "        ]\n",
        "        # Pop from the back so earlier positions stay valid.\n",
        "        for pos in reversed(done):\n",
        "            job, _ = running.pop(pos)\n",
        "            flow = clock - job.arrival_time\n",
        "            completion_sum += clock\n",
        "            rows.append({\n",
        "                'job_id': job.job_id,\n",
        "                'arrival_time': job.arrival_time,\n",
        "                'completion_time': clock,\n",
        "                'flow_time': flow,\n",
        "                'wait_time': flow - job.true_size,\n",
        "                'slowdown': flow / max(job.true_size, EPS),\n",
        "            })\n",
        "\n",
        "    return completion_sum, pd.DataFrame(rows)\n",
        "\n",
        "# ======================== MAIN EVALUATION ========================\n",
        "\n",
        "def evaluate_total_completion_all_methods(\n",
        "    df: pd.DataFrame,\n",
        "    idx_te: pd.Index,\n",
        "    predictions: Dict[str, np.ndarray],\n",
        "    sample_size: Optional[int] = 10000,\n",
        "    prr_lambda: float = 0.7,\n",
        "    seed: int = 42,\n",
        "    verbose: bool = True\n",
        ") -> Dict:\n",
        "    \"\"\"\n",
        "    Complete evaluation of total completion time for all prediction methods.\n",
        "\n",
        "    The test jobs are sorted by submit time, shifted so that the earliest\n",
        "    submission happens at t = 0 and, when ``sample_size`` is smaller than the\n",
        "    test set, truncated to the first ``sample_size`` jobs. SRPT, FIFO, SJF and\n",
        "    RR are simulated once as baselines; SPJF and PRR are simulated once per\n",
        "    prediction method.\n",
        "\n",
        "    Args:\n",
        "        df: Full dataframe with columns [submit_time, p_star, ...]\n",
        "        idx_te: Test set indices into df\n",
        "        predictions: Dict mapping method name -> prediction array (aligned with idx_te)\n",
        "        sample_size: Number of jobs to evaluate (None = all). The subset is the\n",
        "            earliest-submitted jobs, not a random draw.\n",
        "        prr_lambda: Lambda parameter for PRR\n",
        "        seed: Passed to ``np.random.seed`` when the test set is truncated. The\n",
        "            truncation itself is deterministic and does not depend on it.\n",
        "        verbose: Print detailed output\n",
        "\n",
        "    Returns:\n",
        "        Dictionary with the keys 'meta', 'baselines' (SRPT/SJF/RR/FIFO) and\n",
        "        'methods' (per prediction method: 'prediction_quality', 'SPJF', 'PRR').\n",
        "\n",
        "    Raises:\n",
        "        ValueError: If a prediction array does not have one entry per test row,\n",
        "            or if no jobs are left to evaluate.\n",
        "\n",
        "    Side effects:\n",
        "        Writes ``total_completion_results_{sample_size}.csv`` to the working\n",
        "        directory (overwriting any previous file of that name).\n",
        "    \"\"\"\n",
        "\n",
        "    if verbose:\n",
        "        # sample_size may be None (\"all jobs\"); the ',' format spec would fail on it.\n",
        "        size_label = f\"{sample_size:,}\" if sample_size is not None else \"all\"\n",
        "        print(\"=\"*80)\n",
        "        print(\"TOTAL COMPLETION TIME SCHEDULING EVALUATION\")\n",
        "        print(f\"Sample size: {size_label} | PRR λ: {prr_lambda:.2f} | Seed: {seed}\")\n",
        "        print(\"=\"*80)\n",
        "\n",
        "    start_time = time.time()\n",
        "\n",
        "    # ============ Data preparation ============\n",
        "    test_df = df.loc[idx_te].copy().reset_index(drop=True)\n",
        "    n_test = len(test_df)\n",
        "    # Position inside the test set == position inside every prediction array.\n",
        "    test_df['original_idx'] = np.arange(n_test)\n",
        "\n",
        "    # Validate and normalise predictions before the expensive simulations start.\n",
        "    # np.asarray makes the later lookup positional even if a pandas Series is given.\n",
        "    prediction_arrays: Dict[str, np.ndarray] = {}\n",
        "    for method_name, values in predictions.items():\n",
        "        values = np.asarray(values)\n",
        "        if len(values) != n_test:\n",
        "            raise ValueError(\n",
        "                f\"predictions['{method_name}'] has {len(values)} entries, \"\n",
        "                f\"expected {n_test} (one per test row)\"\n",
        "            )\n",
        "        prediction_arrays[method_name] = values\n",
        "\n",
        "    test_df = test_df.sort_values('submit_time').reset_index(drop=True)\n",
        "\n",
        "    # Shift time so that the earliest submission happens at t = 0.\n",
        "    t0 = float(test_df['submit_time'].min())\n",
        "    test_df['submit_time'] = test_df['submit_time'] - t0\n",
        "\n",
        "    if sample_size is not None and sample_size < len(test_df):\n",
        "        # Kept for backward compatibility: reseeds NumPy's global RNG. The subset\n",
        "        # itself is simply the first `sample_size` jobs by submit time.\n",
        "        np.random.seed(seed)\n",
        "        test_df = test_df.iloc[:sample_size].copy()\n",
        "\n",
        "    if len(test_df) == 0:\n",
        "        raise ValueError(\"No jobs selected for evaluation (empty idx_te or sample_size=0).\")\n",
        "\n",
        "    selected_original_indices = test_df['original_idx'].values\n",
        "\n",
        "    base_df = test_df[['submit_time', 'p_star']].copy()\n",
        "    base_df['job_id'] = np.arange(len(base_df))\n",
        "\n",
        "    if verbose:\n",
        "        print(f\"\\nActual jobs being evaluated: {len(base_df):,}\")\n",
        "        print(\"Computing baselines...\")\n",
        "\n",
        "    # ============ Baselines ============\n",
        "    baseline_df = base_df.copy()\n",
        "    baseline_df['y_pred'] = baseline_df['p_star']\n",
        "    baseline_jobs = build_jobs_from_df(baseline_df)\n",
        "\n",
        "    srpt_total, srpt_metrics = simulate_srpt(baseline_jobs)\n",
        "    fifo_total, fifo_metrics = simulate_fifo(baseline_jobs)\n",
        "    sjf_total, sjf_metrics = simulate_sjf_or_spjf(baseline_jobs, use_predictions=False)\n",
        "    rr_total, rr_metrics = simulate_rr(baseline_jobs)\n",
        "\n",
        "    def _summarize(total: float, metrics: pd.DataFrame) -> Dict:\n",
        "        \"\"\"Summary of one policy run relative to SRPT (optimal) and FIFO.\"\"\"\n",
        "        return {\n",
        "            'total_completion': total,\n",
        "            'avg_flow_time': float(metrics['flow_time'].mean()),\n",
        "            'ratio_vs_opt': total / srpt_total,\n",
        "            'ratio_vs_fifo': total / fifo_total\n",
        "        }\n",
        "\n",
        "    results = {\n",
        "        'meta': {\n",
        "            'n_jobs': len(base_df),\n",
        "            'sample_size': sample_size,\n",
        "            'prr_lambda': prr_lambda,\n",
        "            'seed': seed\n",
        "        },\n",
        "        'baselines': {\n",
        "            # The reference policies are pinned to exactly 1.0 against themselves.\n",
        "            'SRPT': {**_summarize(srpt_total, srpt_metrics), 'ratio_vs_opt': 1.0},\n",
        "            'SJF': _summarize(sjf_total, sjf_metrics),\n",
        "            'RR': _summarize(rr_total, rr_metrics),\n",
        "            'FIFO': {**_summarize(fifo_total, fifo_metrics), 'ratio_vs_fifo': 1.0}\n",
        "        },\n",
        "        'methods': {}\n",
        "    }\n",
        "\n",
        "    # ============ Evaluate Each Prediction Method ============\n",
        "    methods_to_eval = list(prediction_arrays.keys())\n",
        "\n",
        "    if verbose:\n",
        "        print(f\"\\nEvaluating {len(methods_to_eval)} prediction methods...\")\n",
        "        print(\"-\"*80)\n",
        "\n",
        "    for i, method_name in enumerate(methods_to_eval):\n",
        "        method_start = time.time()\n",
        "\n",
        "        y_pred = prediction_arrays[method_name][selected_original_indices]\n",
        "\n",
        "        # 0.5 is the placeholder reported when Spearman's ρ is undefined (NaN)\n",
        "        # or cannot be computed at all.\n",
        "        try:\n",
        "            rho, _ = spearmanr(base_df['p_star'].values, y_pred)\n",
        "            pred_quality = float(rho) if not np.isnan(rho) else 0.5\n",
        "        except Exception:\n",
        "            pred_quality = 0.5\n",
        "\n",
        "        sched_df = base_df.copy()\n",
        "        sched_df['y_pred'] = y_pred\n",
        "        jobs = build_jobs_from_df(sched_df)\n",
        "\n",
        "        spjf_total, spjf_metrics = simulate_sjf_or_spjf(jobs, use_predictions=True)\n",
        "        prr_total, prr_metrics = simulate_prr(jobs, lam=prr_lambda)\n",
        "\n",
        "        prr_summary = _summarize(prr_total, prr_metrics)\n",
        "        prr_summary['lambda'] = prr_lambda\n",
        "\n",
        "        results['methods'][method_name] = {\n",
        "            'prediction_quality': pred_quality,\n",
        "            'SPJF': _summarize(spjf_total, spjf_metrics),\n",
        "            'PRR': prr_summary\n",
        "        }\n",
        "\n",
        "        if verbose:\n",
        "            print(f\"[{i+1}/{len(methods_to_eval)}] {method_name:8s}: \"\n",
        "                  f\"ρ={pred_quality:.3f}, \"\n",
        "                  f\"SPJF/OPT={spjf_total/srpt_total:.4f}, \"\n",
        "                  f\"PRR/OPT={prr_total/srpt_total:.4f}, \"\n",
        "                  f\"time={time.time() - method_start:.1f}s\")\n",
        "\n",
        "    total_time = time.time() - start_time\n",
        "\n",
        "    # ============ Print Summary ============\n",
        "    if verbose:\n",
        "        print(\"\\n\" + \"=\"*80)\n",
        "        print(f\"SUMMARY (n={len(base_df):,} jobs, runtime={total_time:.1f}s)\")\n",
        "        print(\"=\"*80)\n",
        "\n",
        "        print(\"\\nBASELINES:\")\n",
        "        print(f\"{'Policy':12s} {'ΣC_j':>16s} {'Avg Flow':>12s} {'ρ_TC':>10s} {'ratio/FIFO':>11s}\")\n",
        "        print(\"-\"*80)\n",
        "        for name in ['SRPT', 'SJF', 'RR', 'FIFO']:\n",
        "            r = results['baselines'][name]\n",
        "            print(f\"{name:12s} {r['total_completion']:16.3f} {r['avg_flow_time']:12.3f} \"\n",
        "                  f\"{r['ratio_vs_opt']:10.4f} {r['ratio_vs_fifo']:11.4f}\")\n",
        "\n",
        "        print(f\"\\nPREDICTION METHODS (PRR with λ={prr_lambda:.2f}):\")\n",
        "        print(f\"{'Method':8s} {'ρ':>6s} {'SPJF ρ_TC':>10s} {'PRR ρ_TC':>10s} {'Improvement':>12s}\")\n",
        "        print(\"-\"*80)\n",
        "\n",
        "        # Ranked by PRR ratio (lower is better).\n",
        "        sorted_methods = sorted(results['methods'].items(),\n",
        "                               key=lambda x: x[1]['PRR']['ratio_vs_opt'])\n",
        "\n",
        "        for method_name, method_data in sorted_methods:\n",
        "            spjf_ratio = method_data['SPJF']['ratio_vs_opt']\n",
        "            prr_ratio = method_data['PRR']['ratio_vs_opt']\n",
        "            # Positive means PRR beats SPJF; negative means PRR is worse.\n",
        "            improvement = (spjf_ratio - prr_ratio) / spjf_ratio * 100\n",
        "\n",
        "            print(f\"{method_name:8s} {method_data['prediction_quality']:6.3f} \"\n",
        "                  f\"{spjf_ratio:10.4f} \"\n",
        "                  f\"{prr_ratio:10.4f} \"\n",
        "                  f\"{improvement:11.1f}%\")\n",
        "\n",
        "        if sorted_methods:\n",
        "            best = sorted_methods[0]\n",
        "            print(\"\\n\" + \"=\"*80)\n",
        "            print(f\"*** BEST: {best[0]} achieves ρ_TC={best[1]['PRR']['ratio_vs_opt']:.4f} \"\n",
        "                  f\"with PRR(λ={prr_lambda:.2f}) ***\")\n",
        "            print(\"=\"*80)\n",
        "\n",
        "    # Save results\n",
        "    summary_rows = []\n",
        "    for method_name, method_data in results['methods'].items():\n",
        "        summary_rows.append({\n",
        "            'method': method_name,\n",
        "            'spearman_rho': method_data['prediction_quality'],\n",
        "            'spjf_total': method_data['SPJF']['total_completion'],\n",
        "            'spjf_ratio_tc': method_data['SPJF']['ratio_vs_opt'],\n",
        "            'prr_total': method_data['PRR']['total_completion'],\n",
        "            'prr_ratio_tc': method_data['PRR']['ratio_vs_opt'],\n",
        "            'prr_lambda': prr_lambda\n",
        "        })\n",
        "\n",
        "    summary_df = pd.DataFrame(summary_rows)\n",
        "    summary_df.to_csv(f\"total_completion_results_{sample_size}.csv\", index=False)\n",
        "\n",
        "    if verbose:\n",
        "        print(f\"\\n[INFO] Results saved to: total_completion_results_{sample_size}.csv\")\n",
        "\n",
        "    return results\n",
        "\n",
        "# ======================== LAMBDA SWEEP ========================\n",
        "\n",
        "def sweep_prr_lambda_range(\n",
        "    df: pd.DataFrame,\n",
        "    idx_te: pd.Index,\n",
        "    predictions: Dict[str, np.ndarray],\n",
        "    sample_size: int = 10000,\n",
        "    lambda_values: Optional[List[float]] = None,\n",
        "    seed: int = 42,\n",
        "    method: str = 'M5'\n",
        ") -> pd.DataFrame:\n",
        "    \"\"\"Sweep PRR λ values to find optimal setting. Uses M5 predictions by default.\n",
        "\n",
        "    For every λ the full evaluation is re-run (with ``verbose=False``) and the\n",
        "    PRR result of ``method`` is recorded. Note that each run rewrites\n",
        "    ``total_completion_results_{sample_size}.csv``.\n",
        "\n",
        "    Args:\n",
        "        df: Full dataframe, forwarded to the evaluation.\n",
        "        idx_te: Test set indices into df.\n",
        "        predictions: Dict mapping method name -> prediction array.\n",
        "        sample_size: Number of jobs to evaluate per run.\n",
        "        lambda_values: λ values to try; defaults to [0.5, 0.6, 0.7, 0.8, 0.9, 0.95].\n",
        "        seed: Forwarded to the evaluation.\n",
        "        method: Key in ``predictions`` whose PRR performance is tracked.\n",
        "\n",
        "    Returns:\n",
        "        DataFrame with one row per λ and the columns ``lambda``,\n",
        "        ``ratio_vs_opt``, ``total_completion`` and ``avg_flow_time``\n",
        "        (empty if no λ produced a result).\n",
        "\n",
        "    Raises:\n",
        "        KeyError: If ``method`` is not a key of ``predictions``.\n",
        "    \"\"\"\n",
        "\n",
        "    if lambda_values is None:\n",
        "        lambda_values = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95]\n",
        "\n",
        "    # Fail before the (expensive) simulations instead of after them.\n",
        "    if method not in predictions:\n",
        "        raise KeyError(\n",
        "            f\"'{method}' not found in predictions; available: {list(predictions)}\"\n",
        "        )\n",
        "\n",
        "    print(f\"\\nRunning λ sweep: {lambda_values}\")\n",
        "    print(\"-\"*50)\n",
        "\n",
        "    sweep_results = []\n",
        "\n",
        "    for lam in lambda_values:\n",
        "        results = evaluate_total_completion_all_methods(\n",
        "            df=df, idx_te=idx_te, predictions=predictions,\n",
        "            sample_size=sample_size, prr_lambda=lam, seed=seed, verbose=False\n",
        "        )\n",
        "\n",
        "        if results and method in results['methods']:\n",
        "            method_prr = results['methods'][method]['PRR']\n",
        "            sweep_results.append({\n",
        "                'lambda': lam,\n",
        "                'ratio_vs_opt': method_prr['ratio_vs_opt'],\n",
        "                'total_completion': method_prr['total_completion'],\n",
        "                'avg_flow_time': method_prr['avg_flow_time']\n",
        "            })\n",
        "            print(f\"λ={lam:.2f}: ρ_TC={method_prr['ratio_vs_opt']:.4f}\")\n",
        "\n",
        "    # Explicit columns keep the frame well-formed even when nothing was collected.\n",
        "    sweep_df = pd.DataFrame(\n",
        "        sweep_results,\n",
        "        columns=['lambda', 'ratio_vs_opt', 'total_completion', 'avg_flow_time']\n",
        "    )\n",
        "    if sweep_df.empty:\n",
        "        print(f\"\\n[WARN] λ sweep produced no results for {method}\")\n",
        "        return sweep_df\n",
        "\n",
        "    best_lambda = sweep_df.loc[sweep_df['ratio_vs_opt'].idxmin(), 'lambda']\n",
        "\n",
        "    print(f\"\\n*** Optimal λ={best_lambda:.2f} for {method} ***\")\n",
        "\n",
        "    return sweep_df\n",
        "\n",
        "# ======================== MAIN EXECUTION ========================\n",
        "\n",
        "if __name__ == \"__main__\":\n",
        "    # Running this cell only defines the simulators; remind the reader how to start.\n",
        "    print(\n",
        "        \"\\nStarting TOTAL COMPLETION TIME evaluation...\\n\",\n",
        "        \"Usage: run the prediction pipeline first, then call\",\n",
        "        \"  evaluate_total_completion_all_methods(df, idx_te, predictions)\",\n",
        "        sep=\"\\n\",\n",
        "    )"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "DyUyyoVsUClZ",
        "outputId": "c0e685d0-7b90-4bba-e4aa-28e9beb40972"
      },
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "Starting TOTAL COMPLETION TIME evaluation...\n",
            "\n",
            "Usage: run the prediction pipeline first, then call\n",
            "  evaluate_total_completion_all_methods(df, idx_te, predictions)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Evaluate every prediction method on the first 10,000 test jobs (by submit time).\n",
        "results = evaluate_total_completion_all_methods(\n",
        "    df=df,\n",
        "    idx_te=idx_te,\n",
        "    predictions=predictions,\n",
        "    sample_size=10000,\n",
        "    prr_lambda=0.7,\n",
        "    verbose=True,\n",
        ")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "pH5FMhO9UHNi",
        "outputId": "3d69a5eb-4f53-49d9-c76b-4b2eec34a237"
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "================================================================================\n",
            "TOTAL COMPLETION TIME SCHEDULING EVALUATION\n",
            "Sample size: 10,000 | PRR λ: 0.70 | Seed: 42\n",
            "================================================================================\n",
            "\n",
            "Actual jobs being evaluated: 10,000\n",
            "Computing baselines...\n",
            "\n",
            "Evaluating 6 prediction methods...\n",
            "--------------------------------------------------------------------------------\n",
            "[1/6] M1      : ρ=0.965, SPJF/OPT=1.1354, PRR/OPT=1.3278, time=22.7s\n",
            "[2/6] M3      : ρ=0.860, SPJF/OPT=1.6859, PRR/OPT=1.8866, time=24.0s\n",
            "[3/6] M4      : ρ=0.965, SPJF/OPT=1.1605, PRR/OPT=1.3536, time=23.6s\n",
            "[4/6] M5      : ρ=0.968, SPJF/OPT=1.1064, PRR/OPT=1.2920, time=24.6s\n",
            "[5/6] M6      : ρ=0.967, SPJF/OPT=1.1241, PRR/OPT=1.3100, time=23.5s\n",
            "[6/6] M7      : ρ=0.958, SPJF/OPT=1.1723, PRR/OPT=1.3690, time=23.2s\n",
            "\n",
            "================================================================================\n",
            "SUMMARY (n=10,000 jobs, runtime=153.1s)\n",
            "================================================================================\n",
            "\n",
            "BASELINES:\n",
            "Policy                   ΣC_j     Avg Flow       ρ_TC  ratio/FIFO\n",
            "--------------------------------------------------------------------------------\n",
            "SRPT          49103772143.000  4578908.143     1.0000      0.1862\n",
            "SJF           49166586215.000  4585189.551     1.0013      0.1864\n",
            "RR            96964142202.892  9364945.149     1.9747      0.3676\n",
            "FIFO         263785184516.000 26047049.381     5.3720      1.0000\n",
            "\n",
            "PREDICTION METHODS (PRR with λ=0.70):\n",
            "Method        ρ  SPJF ρ_TC   PRR ρ_TC  Improvement\n",
            "--------------------------------------------------------------------------------\n",
            "M5        0.968     1.1064     1.2920       -16.8%\n",
            "M6        0.967     1.1241     1.3100       -16.5%\n",
            "M1        0.965     1.1354     1.3278       -16.9%\n",
            "M4        0.965     1.1605     1.3536       -16.6%\n",
            "M7        0.958     1.1723     1.3690       -16.8%\n",
            "M3        0.860     1.6859     1.8866       -11.9%\n",
            "\n",
            "================================================================================\n",
            "*** BEST: M5 achieves ρ_TC=1.2920 with PRR(λ=0.70) ***\n",
            "================================================================================\n",
            "\n",
            "[INFO] Results saved to: total_completion_results_10000.csv\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# =====================================================================\n",
        "# MAX-STRETCH (1|r,pmtn|S_max)\n",
        "# =====================================================================\n",
        "\n",
        "from typing import Dict, Tuple, Optional, List\n",
        "import numpy as np, heapq, time, math\n",
        "\n",
        "EPS = 1e-9\n",
        "\n",
        "# --------------------------- Preparation ----------------------------\n",
        "def _prepare_jobs(\n",
        "    df,\n",
        "    idx_te,\n",
        "    y_pred: np.ndarray,\n",
        "    sample_size: Optional[int] = 5000,\n",
        "    seed: int = 42,\n",
        "    clip_ratio: Optional[Tuple[float, float]] = (0.1, 10.0),\n",
        "):\n",
        "    cols = [\"submit_time\", \"p_star\"]\n",
        "    te = df.loc[idx_te, cols].reset_index(drop=True).copy()\n",
        "\n",
        "    n_all = len(te)\n",
        "    y_pred = np.asarray(y_pred[:n_all], dtype=np.float64)\n",
        "\n",
        "    if sample_size and sample_size < n_all:\n",
        "        rng = np.random.RandomState(seed)\n",
        "        pick = np.sort(rng.choice(n_all, size=sample_size, replace=False))\n",
        "        te = te.iloc[pick].copy()\n",
        "        y_pred = y_pred[pick]\n",
        "\n",
        "    sort_idx = np.argsort(te[\"submit_time\"].values)\n",
        "    te = te.iloc[sort_idx].reset_index(drop=True)\n",
        "    y_pred = y_pred[sort_idx]\n",
        "\n",
        "    r0 = float(te[\"submit_time\"].iloc[0])\n",
        "    r = te[\"submit_time\"].to_numpy(dtype=np.float64) - r0\n",
        "    p = np.maximum(te[\"p_star\"].to_numpy(dtype=np.float64), 1.0)\n",
        "    q = np.maximum(y_pred.astype(np.float64), 1.0)\n",
        "\n",
        "    if clip_ratio:\n",
        "        lo, hi = clip_ratio\n",
        "        q = np.clip(q, lo * p, hi * p)\n",
        "\n",
        "    return r, p, q\n",
        "\n",
        "# ------------------------------ Metrics -----------------------------\n",
        "def _stretch_array(r: np.ndarray, C: np.ndarray, p: np.ndarray) -> np.ndarray:\n",
        "    return np.maximum((C - r) / np.maximum(p, EPS), 1.0)\n",
        "\n",
        "def _compute_metrics(r: np.ndarray, C: np.ndarray, p: np.ndarray, normS: float) -> Dict:\n",
        "    s = _stretch_array(r, C, p)\n",
        "    tau = 10.0\n",
        "    bsld = (C - r) / np.maximum(p, tau)\n",
        "    return {\n",
        "        \"n\": int(s.size),\n",
        "        \"max\": float(np.max(s)),\n",
        "        \"p99\": float(np.percentile(s, 99)),\n",
        "        \"p95\": float(np.percentile(s, 95)),\n",
        "        \"p90\": float(np.percentile(s, 90)),\n",
        "        \"median\": float(np.median(s)),\n",
        "        \"mean\": float(np.mean(s)),\n",
        "        \"max_over_OPT\": float(np.max(s) / normS),\n",
        "        \"p99_over_OPT\": float(np.percentile(s, 99) / normS),\n",
        "        \"p95_over_OPT\": float(np.percentile(s, 95) / normS),\n",
        "        \"med_over_OPT\": float(np.median(s) / normS),\n",
        "        \"bsld10_max\": float(np.max(bsld)),\n",
        "        \"bsld10_p99\": float(np.percentile(bsld, 99)),\n",
        "        \"bsld10_med\": float(np.median(bsld)),\n",
        "    }\n",
        "\n",
        "# ---------------------- EDF feasibility & OPT(S*) --------------------\n",
        "def _edf_feasible_fast(r: np.ndarray, p: np.ndarray, d: np.ndarray) -> bool:\n",
        "    \"\"\"EDF is feasibility-optimal on a preemptive uniprocessor.\"\"\"\n",
        "    n = len(r)\n",
        "    rem = p.copy()\n",
        "    heap = []  # (deadline, j)\n",
        "    i, t = 0, 0.0\n",
        "\n",
        "    while i < n or heap:\n",
        "        if not heap and i < n:\n",
        "            t = max(t, r[i])\n",
        "        while i < n and r[i] <= t + EPS:\n",
        "            heapq.heappush(heap, (d[i], i)); i += 1\n",
        "        if not heap:\n",
        "            continue\n",
        "        dj, j = heapq.heappop(heap)\n",
        "        if t > dj + EPS:\n",
        "            return False\n",
        "        next_arr = r[i] if i < n else math.inf\n",
        "        dt = min(rem[j], next_arr - t, dj - t)\n",
        "        if dt <= 1e-12:\n",
        "            t = min(next_arr, dj)\n",
        "            if rem[j] > EPS:\n",
        "                heapq.heappush(heap, (dj, j))\n",
        "            continue\n",
        "        rem[j] -= dt\n",
        "        t += dt\n",
        "        if rem[j] > EPS:\n",
        "            if t >= dj - EPS:\n",
        "                return False\n",
        "            heapq.heappush(heap, (dj, j))\n",
        "    return True\n",
        "\n",
        "def _opt_max_stretch_fast(r: np.ndarray, p: np.ndarray, tol: float = 1e-3) -> float:\n",
        "    \"\"\"Exact S* via bisection on S with EDF-feasibility; O(log(1/tol) * n log n).\"\"\"\n",
        "    def quick_feasible(S: float) -> bool:\n",
        "        d = r + S * p\n",
        "        if np.sum(p) > (np.max(d) - r[0]) + 1e-12:\n",
        "            return False\n",
        "        return _edf_feasible_fast(r, p, d)\n",
        "\n",
        "    lo, hi = 1.0, 2.0\n",
        "    while not quick_feasible(hi):\n",
        "        hi *= 2.0\n",
        "        if hi > 1e12:\n",
        "            raise RuntimeError(\"Failed to bracket S*; check inputs.\")\n",
        "    for _ in range(60):\n",
        "        mid = 0.5 * (lo + hi)\n",
        "        if quick_feasible(mid):\n",
        "            hi = mid\n",
        "        else:\n",
        "            lo = mid\n",
        "        if hi - lo <= tol * max(1.0, hi):\n",
        "            break\n",
        "    return hi\n",
        "\n",
        "def _edf_schedule_fast(r: np.ndarray, p: np.ndarray, S: float, slack: float = 0.0) -> np.ndarray:\n",
        "    \"\"\"Construct EDF schedule at S and return completion times.\"\"\"\n",
        "    d = r + (1.0 + slack) * S * p\n",
        "    n = len(r)\n",
        "    rem = p.copy()\n",
        "    C = np.full(n, np.nan, dtype=np.float64)\n",
        "    heap = []\n",
        "    i, t = 0, 0.0\n",
        "\n",
        "    while i < n or heap:\n",
        "        if not heap and i < n:\n",
        "            t = max(t, r[i])\n",
        "        while i < n and r[i] <= t + EPS:\n",
        "            heapq.heappush(heap, (d[i], i)); i += 1\n",
        "        if not heap:\n",
        "            continue\n",
        "        dj, j = heapq.heappop(heap)\n",
        "        next_arr = r[i] if i < n else math.inf\n",
        "        dt = min(rem[j], next_arr - t, dj - t)\n",
        "        if dt <= 1e-12:\n",
        "            t = min(next_arr, dj)\n",
        "            if rem[j] > EPS:\n",
        "                heapq.heappush(heap, (dj, j))\n",
        "            continue\n",
        "        rem[j] -= dt\n",
        "        t += dt\n",
        "        if rem[j] <= EPS:\n",
        "            C[j] = t\n",
        "        else:\n",
        "            heapq.heappush(heap, (dj, j))\n",
        "    return C\n",
        "\n",
        "# -------------------- Scheduling Algorithms --------------------\n",
        "def _srpt_fast(r: np.ndarray, p_true: np.ndarray, key: np.ndarray) -> np.ndarray:\n",
        "    \"\"\"SRPT/SPRPT: preemptive, event-driven. key=p for SRPT, key=q for SPRPT.\"\"\"\n",
        "    n = len(r)\n",
        "    rem_t = p_true.copy()\n",
        "    rem_k = key.copy()\n",
        "    C = np.full(n, np.nan, dtype=np.float64)\n",
        "    heap = []\n",
        "    i, t = 0, 0.0\n",
        "    active = -1\n",
        "\n",
        "    while i < n or active >= 0 or heap:\n",
        "        if active < 0 and not heap and i < n:\n",
        "            t = max(t, r[i])\n",
        "        while i < n and r[i] <= t + EPS:\n",
        "            if active < 0 or rem_k[i] < rem_k[active]:\n",
        "                if active >= 0:\n",
        "                    heapq.heappush(heap, (rem_k[active], active))\n",
        "                active = i\n",
        "            else:\n",
        "                heapq.heappush(heap, (rem_k[i], i))\n",
        "            i += 1\n",
        "        if active < 0:\n",
        "            if not heap:\n",
        "                continue\n",
        "            _, active = heapq.heappop(heap)\n",
        "        next_arr = r[i] if i < n else math.inf\n",
        "        dt = min(rem_t[active], next_arr - t)\n",
        "        if dt <= 1e-12:\n",
        "            t = next_arr\n",
        "            continue\n",
        "        rem_t[active] -= dt\n",
        "        rem_k[active] = max(0.0, rem_k[active] - dt)\n",
        "        t += dt\n",
        "        if rem_t[active] <= EPS:\n",
        "            C[active] = t\n",
        "            active = -1\n",
        "    return C\n",
        "\n",
        "def _fifo_fast(r: np.ndarray, p: np.ndarray) -> np.ndarray:\n",
        "    \"\"\"FIFO: non-preemptive, process in arrival order.\"\"\"\n",
        "    n = len(r)\n",
        "    C = np.full(n, np.nan, dtype=np.float64)\n",
        "    t = 0.0\n",
        "    for j in range(n):\n",
        "        if t < r[j]:\n",
        "            t = r[j]\n",
        "        t += p[j]\n",
        "        C[j] = t\n",
        "    return C\n",
        "\n",
        "def _las_fast(r: np.ndarray, p: np.ndarray) -> np.ndarray:\n",
        "    \"\"\"Least-Attained-Service / Foreground-Background baseline.\"\"\"\n",
        "    n = len(r)\n",
        "    rem = p.copy()\n",
        "    att = np.zeros(n, dtype=np.float64)\n",
        "    C = np.full(n, np.nan, dtype=np.float64)\n",
        "    active = np.zeros(n, dtype=bool)\n",
        "    i, t = 0, 0.0\n",
        "    done = 0\n",
        "\n",
        "    while done < n:\n",
        "        if not active.any() and i < n:\n",
        "            t = max(t, r[i])\n",
        "        while i < n and r[i] <= t + EPS:\n",
        "            active[i] = True; i += 1\n",
        "        if not active.any():\n",
        "            continue\n",
        "        idx = np.flatnonzero(active)\n",
        "        j = idx[np.argmin(att[idx])]\n",
        "        next_arr = r[i] if i < n else math.inf\n",
        "        dt = min(rem[j], next_arr - t)\n",
        "        if dt <= 1e-12:\n",
        "            t = next_arr; continue\n",
        "        rem[j] -= dt; att[j] += dt; t += dt\n",
        "        if rem[j] <= EPS:\n",
        "            C[j] = t; active[j] = False; done += 1\n",
        "    return C\n",
        "\n",
        "def _edf_pred_deadlines(r: np.ndarray, p_true: np.ndarray, p_pred: np.ndarray,\n",
        "                        factor: float) -> np.ndarray:\n",
        "    \"\"\"EDF-P: EDF with predicted deadlines d_j = r_j + factor * p_hat_j.\"\"\"\n",
        "    n = len(r)\n",
        "    d = r + factor * p_pred\n",
        "    rem = p_true.copy()\n",
        "    C = np.full(n, np.nan, dtype=np.float64)\n",
        "    heap = []; i, t = 0, 0.0\n",
        "    while i < n or heap:\n",
        "        if not heap and i < n:\n",
        "            t = max(t, r[i])\n",
        "        while i < n and r[i] <= t + EPS:\n",
        "            heapq.heappush(heap, (d[i], i)); i += 1\n",
        "        if not heap:\n",
        "            continue\n",
        "        dj, j = heapq.heappop(heap)\n",
        "        next_arr = r[i] if i < n else math.inf\n",
        "        dt = min(rem[j], next_arr - t)\n",
        "        if dt <= 1e-12:\n",
        "            t = next_arr; heapq.heappush(heap, (dj, j)); continue\n",
        "        rem[j] -= dt; t += dt\n",
        "        if rem[j] <= EPS:\n",
        "            C[j] = t\n",
        "        else:\n",
        "            heapq.heappush(heap, (dj, j))\n",
        "    return C\n",
        "\n",
        "# ------------------------- Main Runner ------------------------------\n",
        "def run_max_stretch(\n",
        "    df,\n",
        "    idx_te,\n",
        "    y_pred: np.ndarray,\n",
        "    sample_size: Optional[int] = 5000,\n",
        "    clip_ratio: Optional[Tuple[float, float]] = (0.1, 10.0),\n",
        "    verbose: bool = True,\n",
        ") -> Dict[str, Dict]:\n",
        "    \"\"\"\n",
        "    Evaluate max-stretch for all baselines and prediction-based algorithms.\n",
        "\n",
        "    Args:\n",
        "        df: Full dataframe with columns [submit_time, p_star, ...]\n",
        "        idx_te: Test set indices into df\n",
        "        y_pred: Prediction array aligned with idx_te\n",
        "        sample_size: Number of jobs to evaluate\n",
        "        clip_ratio: (lo, hi) to clip prediction/true ratio for robustness\n",
        "        verbose: Print results\n",
        "    \"\"\"\n",
        "    r, p, q = _prepare_jobs(df, idx_te, y_pred,\n",
        "                            sample_size=sample_size, clip_ratio=clip_ratio)\n",
        "    n = len(p)\n",
        "    t0 = time.time()\n",
        "\n",
        "    # 1) Bisection to find S*\n",
        "    S_bisect = _opt_max_stretch_fast(r, p, tol=1e-3)\n",
        "\n",
        "    # 2) Build EDF schedule at S* and get realized max stretch\n",
        "    C_opt = _edf_schedule_fast(r, p, S_bisect, slack=0.0)\n",
        "    s_opt = _stretch_array(r, C_opt, p)\n",
        "    S_emp = float(np.max(s_opt))\n",
        "\n",
        "    # 3) Normalization anchor so OPT's max/OPT = 1.000\n",
        "    normS = S_emp\n",
        "\n",
        "    results: Dict[str, Dict] = {}\n",
        "\n",
        "    # OPT\n",
        "    results[\"OPT (EDF at S*)\"] = _compute_metrics(r, C_opt, p, normS)\n",
        "\n",
        "    # Clairvoyant\n",
        "    C_srpt = _srpt_fast(r, p, p.copy())\n",
        "    results[\"SRPT (true)\"] = _compute_metrics(r, C_srpt, p, normS)\n",
        "\n",
        "    # Non-clairvoyant baselines\n",
        "    C_fifo = _fifo_fast(r, p)\n",
        "    results[\"FIFO\"] = _compute_metrics(r, C_fifo, p, normS)\n",
        "\n",
        "    C_las = _las_fast(r, p)\n",
        "    results[\"LAS/FB\"] = _compute_metrics(r, C_las, p, normS)\n",
        "\n",
        "    # Prediction-based\n",
        "    C_sprpt = _srpt_fast(r, p, q.copy())\n",
        "    results[\"SPRPT (pred)\"] = _compute_metrics(r, C_sprpt, p, normS)\n",
        "\n",
        "    # FIX: use S* directly as factor, not clamped to [5, 20]\n",
        "    C_edfp = _edf_pred_deadlines(r, p, q, factor=float(S_bisect))\n",
        "    results[\"EDF-P (pred)\"] = _compute_metrics(r, C_edfp, p, normS)\n",
        "\n",
        "    elapsed = time.time() - t0\n",
        "    results[\"_meta\"] = {\n",
        "        \"n\": n,\n",
        "        \"S_bisect\": float(S_bisect),\n",
        "        \"S_emp\": float(S_emp),\n",
        "        \"elapsed_sec\": float(elapsed),\n",
        "    }\n",
        "\n",
        "    if verbose:\n",
        "        print(\"\\n\" + \"=\" * 96)\n",
        "        print(f\"MAX-STRETCH  1|r_j,pmtn|S_max  (n={n:,}, time={elapsed:.2f}s)\")\n",
        "        print(f\"S* (bisection) = {S_bisect:.4f}   |   S_emp (EDF realized) = {S_emp:.4f}\")\n",
        "        print(\"-\" * 96)\n",
        "        hdr = f\"{'Algorithm':22s} {'S_max':>10s} {'S_99':>10s} {'S_med':>10s} {'ρ_max':>9s} {'ρ_99':>9s} {'ρ_med':>9s}\"\n",
        "        print(hdr)\n",
        "        print(\"-\" * 96)\n",
        "        order = [\"OPT (EDF at S*)\", \"SRPT (true)\", \"FIFO\", \"LAS/FB\",\n",
        "                 \"SPRPT (pred)\", \"EDF-P (pred)\"]\n",
        "        for name in order:\n",
        "            m = results[name]\n",
        "            print(f\"{name:22s} {m['max']:10.2f} {m['p99']:10.2f} {m['median']:10.2f} \"\n",
        "                  f\"{m['max_over_OPT']:9.3f} {m['p99_over_OPT']:9.3f} {m['med_over_OPT']:9.3f}\")\n",
        "\n",
        "        print(f\"\\nBounded Slowdown (tau=10):  {'max':>10s} {'p99':>10s} {'median':>10s}\")\n",
        "        for name in order:\n",
        "            m = results[name]\n",
        "            print(f\"  {name:22s} {m['bsld10_max']:10.2f} {m['bsld10_p99']:10.2f} {m['bsld10_med']:10.2f}\")\n",
        "        print(\"=\" * 96)\n",
        "\n",
        "    return results\n",
        "\n",
        "# -------------------- Multi-method evaluation -----------------------\n",
        "def run_max_stretch_all_methods(\n",
        "    df,\n",
        "    idx_te,\n",
        "    predictions: Dict[str, np.ndarray],\n",
        "    sample_size: Optional[int] = 5000,\n",
        "    clip_ratio: Optional[Tuple[float, float]] = (0.1, 10.0),\n",
        "    verbose: bool = True,\n",
        ") -> Dict[str, Dict]:\n",
        "    \"\"\"\n",
        "    Evaluate max-stretch across all prediction methods.\n",
        "\n",
        "    Args:\n",
        "        df: Full dataframe\n",
        "        idx_te: Test indices\n",
        "        predictions: Dict mapping method name -> prediction array\n",
        "        sample_size: Number of jobs\n",
        "        clip_ratio: Prediction clipping bounds\n",
        "        verbose: Print results\n",
        "    \"\"\"\n",
        "    # Prepare shared job data (same sample for all methods)\n",
        "    r, p, _ = _prepare_jobs(df, idx_te,\n",
        "                            np.zeros(len(df.loc[idx_te])),  # dummy\n",
        "                            sample_size=sample_size, clip_ratio=None)\n",
        "    n = len(p)\n",
        "\n",
        "    # Shared OPT computation (only depends on r, p)\n",
        "    t0 = time.time()\n",
        "    S_bisect = _opt_max_stretch_fast(r, p, tol=1e-3)\n",
        "    C_opt = _edf_schedule_fast(r, p, S_bisect, slack=0.0)\n",
        "    s_opt = _stretch_array(r, C_opt, p)\n",
        "    S_emp = float(np.max(s_opt))\n",
        "    normS = S_emp\n",
        "\n",
        "    # Shared baselines\n",
        "    C_srpt = _srpt_fast(r, p, p.copy())\n",
        "    C_fifo = _fifo_fast(r, p)\n",
        "    C_las = _las_fast(r, p)\n",
        "\n",
        "    all_results = {\n",
        "        \"_meta\": {\n",
        "            \"n\": n,\n",
        "            \"S_bisect\": float(S_bisect),\n",
        "            \"S_emp\": float(S_emp),\n",
        "            \"sample_size\": sample_size,\n",
        "        },\n",
        "        \"baselines\": {\n",
        "            \"OPT (EDF at S*)\": _compute_metrics(r, C_opt, p, normS),\n",
        "            \"SRPT (true)\": _compute_metrics(r, C_srpt, p, normS),\n",
        "            \"FIFO\": _compute_metrics(r, C_fifo, p, normS),\n",
        "            \"LAS/FB\": _compute_metrics(r, C_las, p, normS),\n",
        "        },\n",
        "        \"methods\": {},\n",
        "    }\n",
        "\n",
        "    if verbose:\n",
        "        print(\"=\" * 96)\n",
        "        print(f\"MAX-STRETCH  1|r_j,pmtn|S_max  (n={n:,})\")\n",
        "        print(f\"S* = {S_bisect:.4f}   |   S_emp = {S_emp:.4f}\")\n",
        "        print(\"-\" * 96)\n",
        "\n",
        "    # Per-method evaluation\n",
        "    for method_name, y_pred_raw in predictions.items():\n",
        "        # Re-prepare with this method's predictions (same sample via same seed)\n",
        "        _, _, q = _prepare_jobs(df, idx_te, y_pred_raw,\n",
        "                                sample_size=sample_size, clip_ratio=clip_ratio)\n",
        "\n",
        "        C_sprpt = _srpt_fast(r, p, q.copy())\n",
        "        C_edfp = _edf_pred_deadlines(r, p, q, factor=float(S_bisect))\n",
        "\n",
        "        all_results[\"methods\"][method_name] = {\n",
        "            \"SPRPT\": _compute_metrics(r, C_sprpt, p, normS),\n",
        "            \"EDF-P\": _compute_metrics(r, C_edfp, p, normS),\n",
        "        }\n",
        "\n",
        "        if verbose:\n",
        "            m_sprpt = all_results[\"methods\"][method_name][\"SPRPT\"]\n",
        "            m_edfp = all_results[\"methods\"][method_name][\"EDF-P\"]\n",
        "            print(f\"{method_name:8s}  SPRPT ρ_max={m_sprpt['max_over_OPT']:.3f}  \"\n",
        "                  f\"ρ_99={m_sprpt['p99_over_OPT']:.3f}  ρ_med={m_sprpt['med_over_OPT']:.3f}  |  \"\n",
        "                  f\"EDF-P ρ_max={m_edfp['max_over_OPT']:.3f}  \"\n",
        "                  f\"ρ_99={m_edfp['p99_over_OPT']:.3f}  ρ_med={m_edfp['med_over_OPT']:.3f}\")\n",
        "\n",
        "    elapsed = time.time() - t0\n",
        "    all_results[\"_meta\"][\"elapsed_sec\"] = float(elapsed)\n",
        "\n",
        "    if verbose:\n",
        "        print(\"-\" * 96)\n",
        "        print(f\"\\nBASELINES:\")\n",
        "        print(f\"{'Algorithm':22s} {'ρ_max':>9s} {'ρ_99':>9s} {'ρ_med':>9s}\")\n",
        "        for name in [\"OPT (EDF at S*)\", \"SRPT (true)\", \"FIFO\", \"LAS/FB\"]:\n",
        "            m = all_results[\"baselines\"][name]\n",
        "            print(f\"{name:22s} {m['max_over_OPT']:9.3f} {m['p99_over_OPT']:9.3f} {m['med_over_OPT']:9.3f}\")\n",
        "        print(f\"\\nTotal time: {elapsed:.1f}s\")\n",
        "        print(\"=\" * 96)\n",
        "\n",
        "    return all_results\n",
        "\n",
        "# -------------------------- Usage -----------------------------------\n",
        "if __name__ == \"__main__\":\n",
        "\n",
        "\n",
        "    print(\"Usage: run the prediction pipeline first, then call\")\n",
        "    print(\"  run_max_stretch(df, idx_te, y_pred)\")\n",
        "    print(\"  run_max_stretch_all_methods(df, idx_te, predictions)\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Fn9TKyh2mhGg",
        "outputId": "05e1d22c-8454-4691-b57f-0d0e95f1b732"
      },
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Usage: run the prediction pipeline first, then call\n",
            "  run_max_stretch(df, idx_te, y_pred)\n",
            "  run_max_stretch_all_methods(df, idx_te, predictions)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Single method:\n",
        "results = run_max_stretch(df, idx_te, predictions[\"M5\"], sample_size=5000)\n",
        "\n",
        "# All methods at once (shared OPT + baselines, efficient):\n",
        "results = run_max_stretch_all_methods(df, idx_te, predictions, sample_size=5000)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "TFEmqQ3ImjeD",
        "outputId": "b9306f56-4b6b-4588-d3f0-f0af94dad8c1"
      },
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "================================================================================================\n",
            "MAX-STRETCH  1|r_j,pmtn|S_max  (n=5,000, time=0.47s)\n",
            "S* (bisection) = 380.7568   |   S_emp (EDF realized) = 380.6676\n",
            "------------------------------------------------------------------------------------------------\n",
            "Algorithm                   S_max       S_99      S_med     ρ_max      ρ_99     ρ_med\n",
            "------------------------------------------------------------------------------------------------\n",
            "OPT (EDF at S*)            380.67     376.80       3.55     1.000     0.990     0.009\n",
            "SRPT (true)                601.73     491.43       3.31     1.581     1.291     0.009\n",
            "FIFO                   2320341.00  803169.77   12222.86  6095.452  2109.898    32.109\n",
            "LAS/FB                   78144.79   50560.14     664.96   205.284   132.820     1.747\n",
            "SPRPT (pred)              6903.22    1749.63      16.89    18.135     4.596     0.044\n",
            "EDF-P (pred)             77907.56   12858.91      16.01   204.660    33.780     0.042\n",
            "\n",
            "Bounded Slowdown (tau=10):         max        p99     median\n",
            "  OPT (EDF at S*)            380.67     376.80       3.55\n",
            "  SRPT (true)                601.73     491.43       3.31\n",
            "  FIFO                   2088306.90  803169.77   12222.86\n",
            "  LAS/FB                   78144.79   50560.14     664.96\n",
            "  SPRPT (pred)              6903.22    1749.63      16.89\n",
            "  EDF-P (pred)             70116.80   12858.91      16.01\n",
            "================================================================================================\n",
            "================================================================================================\n",
            "MAX-STRETCH  1|r_j,pmtn|S_max  (n=5,000)\n",
            "S* = 380.7568   |   S_emp = 380.6676\n",
            "------------------------------------------------------------------------------------------------\n",
            "M1        SPRPT ρ_max=21.446  ρ_99=5.743  ρ_med=0.058  |  EDF-P ρ_max=400.064  ρ_99=75.010  ρ_med=0.124\n",
            "M3        SPRPT ρ_max=51.101  ρ_99=7.441  ρ_med=0.131  |  EDF-P ρ_max=1700.468  ρ_99=509.601  ρ_med=3.780\n",
            "M4        SPRPT ρ_max=21.665  ρ_99=5.110  ρ_med=0.059  |  EDF-P ρ_max=400.362  ρ_99=85.239  ρ_med=0.211\n",
            "M5        SPRPT ρ_max=18.135  ρ_99=4.596  ρ_med=0.044  |  EDF-P ρ_max=204.660  ρ_99=33.780  ρ_med=0.042\n",
            "M6        SPRPT ρ_max=20.771  ρ_99=4.424  ρ_med=0.051  |  EDF-P ρ_max=411.240  ρ_99=87.718  ρ_med=0.088\n",
            "M7        SPRPT ρ_max=31.115  ρ_99=5.514  ρ_med=0.079  |  EDF-P ρ_max=572.268  ρ_99=134.646  ρ_med=0.726\n",
            "------------------------------------------------------------------------------------------------\n",
            "\n",
            "BASELINES:\n",
            "Algorithm                  ρ_max      ρ_99     ρ_med\n",
            "OPT (EDF at S*)            1.000     0.990     0.009\n",
            "SRPT (true)                1.581     1.291     0.009\n",
            "FIFO                    6095.452  2109.898    32.109\n",
            "LAS/FB                   205.284   132.820     1.747\n",
            "\n",
            "Total time: 0.7s\n",
            "================================================================================================\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# =====================================================================\n",
        "# MAKESPAN EVALUATION (P||C_max)\n",
        "# =====================================================================\n",
        "\n",
        "\n",
        "from dataclasses import dataclass\n",
        "from typing import List, Dict, Tuple, Optional\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import time\n",
        "\n",
        "@dataclass\n",
        "class Job:\n",
        "    job_id: int\n",
        "    true_size: float\n",
        "    pred_size: float\n",
        "\n",
        "    def __post_init__(self):\n",
        "        self.pred_size = max(self.pred_size, 1e-9)\n",
        "        self.true_size = max(self.true_size, 1e-9)\n",
        "\n",
        "# ======================== Core Algorithms ========================\n",
        "\n",
        "def makespan_LPT(jobs: List[Job], m: int, use_predictions: bool = False) -> Tuple[float, List[List[int]]]:\n",
        "    \"\"\"Longest Processing Time First. LPPT when use_predictions=True.\"\"\"\n",
        "    if not jobs or m <= 0:\n",
        "        return 0.0, []\n",
        "\n",
        "    key = (lambda j: j.pred_size) if use_predictions else (lambda j: j.true_size)\n",
        "    sorted_jobs = sorted(jobs, key=key, reverse=True)\n",
        "\n",
        "    if use_predictions:\n",
        "        perceived_loads = [0.0] * m\n",
        "        actual_loads = [0.0] * m\n",
        "        assignments = [[] for _ in range(m)]\n",
        "        for job in sorted_jobs:\n",
        "            min_idx = int(np.argmin(perceived_loads))\n",
        "            perceived_loads[min_idx] += job.pred_size\n",
        "            actual_loads[min_idx] += job.true_size\n",
        "            assignments[min_idx].append(job.job_id)\n",
        "        return max(actual_loads), assignments\n",
        "    else:\n",
        "        loads = [0.0] * m\n",
        "        assignments = [[] for _ in range(m)]\n",
        "        for job in sorted_jobs:\n",
        "            min_idx = int(np.argmin(loads))\n",
        "            loads[min_idx] += job.true_size\n",
        "            assignments[min_idx].append(job.job_id)\n",
        "        return max(loads), assignments\n",
        "\n",
        "def makespan_SPT(jobs: List[Job], m: int, use_predictions: bool = False) -> Tuple[float, List[List[int]]]:\n",
        "    \"\"\"Shortest Processing Time First. SPPT when use_predictions=True.\"\"\"\n",
        "    if not jobs or m <= 0:\n",
        "        return 0.0, []\n",
        "\n",
        "    key = (lambda j: j.pred_size) if use_predictions else (lambda j: j.true_size)\n",
        "    sorted_jobs = sorted(jobs, key=key)\n",
        "\n",
        "    if use_predictions:\n",
        "        perceived_loads = [0.0] * m\n",
        "        actual_loads = [0.0] * m\n",
        "        assignments = [[] for _ in range(m)]\n",
        "        for job in sorted_jobs:\n",
        "            min_idx = int(np.argmin(perceived_loads))\n",
        "            perceived_loads[min_idx] += job.pred_size\n",
        "            actual_loads[min_idx] += job.true_size\n",
        "            assignments[min_idx].append(job.job_id)\n",
        "        return max(actual_loads), assignments\n",
        "    else:\n",
        "        loads = [0.0] * m\n",
        "        assignments = [[] for _ in range(m)]\n",
        "        for job in sorted_jobs:\n",
        "            min_idx = int(np.argmin(loads))\n",
        "            loads[min_idx] += job.true_size\n",
        "            assignments[min_idx].append(job.job_id)\n",
        "        return max(loads), assignments\n",
        "\n",
        "def makespan_random(jobs: List[Job], m: int, seed: int = 42) -> Tuple[float, List[List[int]]]:\n",
        "    \"\"\"Random assignment baseline.\"\"\"\n",
        "    if not jobs or m <= 0:\n",
        "        return 0.0, []\n",
        "    rng = np.random.RandomState(seed)\n",
        "    loads = [0.0] * m\n",
        "    assignments = [[] for _ in range(m)]\n",
        "    for job in jobs:\n",
        "        idx = rng.randint(m)\n",
        "        loads[idx] += job.true_size\n",
        "        assignments[idx].append(job.job_id)\n",
        "    return max(loads), assignments\n",
        "\n",
        "def compute_lower_bound(jobs: List[Job], m: int) -> float:\n",
        "    \"\"\"McNaughton's preemptive bound: OPT_pre = max(Σp/m, max_j p_j).\"\"\"\n",
        "    if not jobs or m <= 0:\n",
        "        return 0.0\n",
        "    total = sum(j.true_size for j in jobs)\n",
        "    mx = max(j.true_size for j in jobs)\n",
        "    return max(total / m, mx)\n",
        "\n",
        "# ======================== Data Preparation ========================\n",
        "\n",
        "def _sample_jobs(df: pd.DataFrame, idx_te, y_pred: np.ndarray,\n",
        "                 m: int, n_samples: Optional[int] = None,\n",
        "                 seed: int = 42) -> Tuple[List[Job], np.ndarray]:\n",
        "    \"\"\"\n",
        "    Sample jobs from test set. Returns (jobs_with_oracle_pred, true_sizes).\n",
        "    Jobs are created with true_size only; predictions are set per-method later.\n",
        "    \"\"\"\n",
        "    test_df = df.loc[idx_te].copy().reset_index(drop=True)\n",
        "\n",
        "    if n_samples is None:\n",
        "        n_samples = min(len(test_df), m * max(10, int(np.log2(m) * 10)))\n",
        "\n",
        "    if len(test_df) > n_samples:\n",
        "        rng = np.random.RandomState(seed)\n",
        "        pick = rng.choice(len(test_df), size=n_samples, replace=False)\n",
        "        test_df = test_df.iloc[pick].copy().reset_index(drop=True)\n",
        "    else:\n",
        "        pick = np.arange(len(test_df))\n",
        "\n",
        "    true_sizes = test_df['p_star'].values.astype(np.float64)\n",
        "    return pick, true_sizes\n",
        "\n",
        "# ======================== Main Evaluation ========================\n",
        "\n",
        "def evaluate_makespan(\n",
        "    df: pd.DataFrame,\n",
        "    idx_te,\n",
        "    predictions: Dict[str, np.ndarray],\n",
        "    machine_counts: List[int] = [5, 10, 20, 50, 100],\n",
        "    n_random_trials: int = 10,\n",
        "    seed: int = 42,\n",
        "    verbose: bool = True,\n",
        ") -> pd.DataFrame:\n",
        "    \"\"\"\n",
        "    Evaluate makespan (P||C_max) for all prediction methods.\n",
        "\n",
        "    Args:\n",
        "        df: Full dataframe with column p_star\n",
        "        idx_te: Test set indices\n",
        "        predictions: Dict mapping method name -> prediction array\n",
        "        machine_counts: List of m values to evaluate\n",
        "        n_random_trials: Number of random assignment trials for averaging\n",
        "        seed: Random seed for sampling\n",
        "        verbose: Print results\n",
        "\n",
        "    Returns:\n",
        "        DataFrame with results for all methods and machine counts.\n",
        "    \"\"\"\n",
        "    if verbose:\n",
        "        print(\"=\" * 80)\n",
        "        print(\"MAKESPAN EVALUATION  P||C_max\")\n",
        "        print(\"=\" * 80)\n",
        "\n",
        "    all_rows = []\n",
        "\n",
        "    for m in machine_counts:\n",
        "        if verbose:\n",
        "            print(f\"\\n--- m = {m} machines ---\")\n",
        "\n",
        "        # Sample ONCE per m (same jobs for all methods)\n",
        "        pick, true_sizes = _sample_jobs(df, idx_te, None, m, seed=seed)\n",
        "        n = len(true_sizes)\n",
        "\n",
        "        # Oracle baselines (same for all prediction methods)\n",
        "        oracle_jobs = [Job(i, true_sizes[i], true_sizes[i]) for i in range(n)]\n",
        "        opt_pre = compute_lower_bound(oracle_jobs, m)\n",
        "        lpt_ms, _ = makespan_LPT(oracle_jobs, m, use_predictions=False)\n",
        "        spt_ms, _ = makespan_SPT(oracle_jobs, m, use_predictions=False)\n",
        "\n",
        "        # Random baseline (averaged)\n",
        "        random_ms = np.mean([makespan_random(oracle_jobs, m, seed=seed + i)[0]\n",
        "                             for i in range(n_random_trials)])\n",
        "\n",
        "        if verbose:\n",
        "            print(f\"  n={n}, OPT_pre={opt_pre:.1f}\")\n",
        "            print(f\"  LPT={lpt_ms:.1f} (ρ={lpt_ms/opt_pre:.4f}), \"\n",
        "                  f\"SPT={spt_ms:.1f} (ρ={spt_ms/opt_pre:.4f}), \"\n",
        "                  f\"Random={random_ms:.1f} (ρ={random_ms/opt_pre:.4f})\")\n",
        "\n",
        "        # Per-method evaluation\n",
        "        for method_name, y_pred_all in predictions.items():\n",
        "            y_pred = y_pred_all[:len(df.loc[idx_te])][pick]\n",
        "\n",
        "            # Build jobs with this method's predictions\n",
        "            jobs = [Job(i, true_sizes[i], float(y_pred[i])) for i in range(n)]\n",
        "\n",
        "            lppt_ms, _ = makespan_LPT(jobs, m, use_predictions=True)\n",
        "            sppt_ms, _ = makespan_SPT(jobs, m, use_predictions=True)\n",
        "\n",
        "            row = {\n",
        "                'method': method_name,\n",
        "                'm': m,\n",
        "                'n_jobs': n,\n",
        "                'OPT_pre': opt_pre,\n",
        "                'LPT': lpt_ms,\n",
        "                'LPT_ratio': lpt_ms / opt_pre,\n",
        "                'SPT': spt_ms,\n",
        "                'SPT_ratio': spt_ms / opt_pre,\n",
        "                'LPPT': lppt_ms,\n",
        "                'LPPT_ratio': lppt_ms / opt_pre,\n",
        "                'SPPT': sppt_ms,\n",
        "                'SPPT_ratio': sppt_ms / opt_pre,\n",
        "                'Random': random_ms,\n",
        "                'Random_ratio': random_ms / opt_pre,\n",
        "            }\n",
        "            all_rows.append(row)\n",
        "\n",
        "            if verbose:\n",
        "                print(f\"  {method_name:8s}: LPPT ρ={lppt_ms/opt_pre:.4f}, \"\n",
        "                      f\"SPPT ρ={sppt_ms/opt_pre:.4f}\")\n",
        "\n",
        "    results_df = pd.DataFrame(all_rows)\n",
        "\n",
        "    # Print summary table\n",
        "    if verbose:\n",
        "        print(\"\\n\" + \"=\" * 80)\n",
        "        print(\"SUMMARY TABLE\")\n",
        "        print(\"=\" * 80)\n",
        "\n",
        "        # Baselines (same across methods, print once per m)\n",
        "        print(f\"\\n{'m':>5s}  {'OPT_pre':>10s}  {'LPT ρ':>8s}  {'SPT ρ':>8s}  {'Rand ρ':>8s}\")\n",
        "        print(\"-\" * 50)\n",
        "        for m in machine_counts:\n",
        "            sub = results_df[results_df['m'] == m].iloc[0]\n",
        "            print(f\"{m:5d}  {sub['OPT_pre']:10.1f}  {sub['LPT_ratio']:8.4f}  \"\n",
        "                  f\"{sub['SPT_ratio']:8.4f}  {sub['Random_ratio']:8.4f}\")\n",
        "\n",
        "        # Per-method prediction results\n",
        "        print(f\"\\n{'Method':>8s} {'m':>5s}  {'LPPT ρ':>8s}  {'SPPT ρ':>8s}\")\n",
        "        print(\"-\" * 40)\n",
        "        for method_name in predictions.keys():\n",
        "            for m in machine_counts:\n",
        "                sub = results_df[(results_df['method'] == method_name) &\n",
        "                                 (results_df['m'] == m)]\n",
        "                if len(sub) > 0:\n",
        "                    row = sub.iloc[0]\n",
        "                    print(f\"{method_name:>8s} {m:5d}  {row['LPPT_ratio']:8.4f}  \"\n",
        "                          f\"{row['SPPT_ratio']:8.4f}\")\n",
        "        print(\"=\" * 80)\n",
        "\n",
        "    # Save\n",
        "    results_df.to_csv('makespan_results.csv', index=False)\n",
        "    if verbose:\n",
        "        print(f\"\\n[INFO] Results saved to makespan_results.csv\")\n",
        "\n",
        "    return results_df\n",
        "\n",
        "# ======================== Usage ========================\n",
        "\n",
        "if __name__ == \"__main__\":\n",
        "\n",
        "    print(\"Usage: run the prediction pipeline first, then call\")\n",
        "    print(\"  evaluate_makespan(df, idx_te, predictions)\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "j-qV9IGaoqzA",
        "outputId": "1d3156a0-65cb-4155-d0ac-579392b9ac10"
      },
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Usage: run the prediction pipeline first, then call\n",
            "  evaluate_makespan(df, idx_te, predictions)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "results_df = evaluate_makespan(\n",
        "    df=df, idx_te=idx_te, predictions=predictions,\n",
        "    machine_counts=[5, 10, 20, 50, 100],\n",
        "    verbose=True\n",
        ")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "TEeVpD9rosrI",
        "outputId": "1d8b5a35-60c2-4cb5-aa0f-e4adfad62b18"
      },
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "================================================================================\n",
            "MAKESPAN EVALUATION  P||C_max\n",
            "================================================================================\n",
            "\n",
            "--- m = 5 machines ---\n",
            "  n=115, OPT_pre=130642.0\n",
            "  LPT=130642.0 (ρ=1.0000), SPT=179764.0 (ρ=1.3760), Random=180856.9 (ρ=1.3844)\n",
            "  M1      : LPPT ρ=1.5554, SPPT ρ=1.6472\n",
            "  M3      : LPPT ρ=1.3640, SPPT ρ=1.7030\n",
            "  M4      : LPPT ρ=1.4301, SPPT ρ=1.3737\n",
            "  M5      : LPPT ρ=1.4537, SPPT ρ=1.3134\n",
            "  M6      : LPPT ρ=1.5305, SPPT ρ=1.6557\n",
            "  M7      : LPPT ρ=1.4591, SPPT ρ=1.6234\n",
            "\n",
            "--- m = 10 machines ---\n",
            "  n=330, OPT_pre=156477.0\n",
            "  LPT=156487.0 (ρ=1.0001), SPT=230515.0 (ρ=1.4732), Random=300771.4 (ρ=1.9221)\n",
            "  M1      : LPPT ρ=1.7477, SPPT ρ=1.5129\n",
            "  M3      : LPPT ρ=2.2087, SPPT ρ=1.5956\n",
            "  M4      : LPPT ρ=1.7376, SPPT ρ=1.8655\n",
            "  M5      : LPPT ρ=1.7663, SPPT ρ=1.9572\n",
            "  M6      : LPPT ρ=1.6004, SPPT ρ=1.7986\n",
            "  M7      : LPPT ρ=1.6230, SPPT ρ=1.7049\n",
            "\n",
            "--- m = 20 machines ---\n",
            "  n=860, OPT_pre=221916.2\n",
            "  LPT=221922.0 (ρ=1.0000), SPT=303654.0 (ρ=1.3683), Random=426937.6 (ρ=1.9239)\n",
            "  M1      : LPPT ρ=1.6691, SPPT ρ=1.9555\n",
            "  M3      : LPPT ρ=2.0607, SPPT ρ=1.6392\n",
            "  M4      : LPPT ρ=1.6992, SPPT ρ=1.4648\n",
            "  M5      : LPPT ρ=1.7800, SPPT ρ=1.7767\n",
            "  M6      : LPPT ρ=1.7059, SPPT ρ=1.5573\n",
            "  M7      : LPPT ρ=2.5211, SPPT ρ=2.4727\n",
            "\n",
            "--- m = 50 machines ---\n",
            "  n=2800, OPT_pre=266545.1\n",
            "  LPT=266552.0 (ρ=1.0000), SPT=468377.0 (ρ=1.7572), Random=524303.2 (ρ=1.9670)\n",
            "  M1      : LPPT ρ=1.8037, SPPT ρ=1.7973\n",
            "  M3      : LPPT ρ=2.0729, SPPT ρ=1.6846\n",
            "  M4      : LPPT ρ=1.5253, SPPT ρ=2.0523\n",
            "  M5      : LPPT ρ=1.6497, SPPT ρ=1.7061\n",
            "  M6      : LPPT ρ=1.4298, SPPT ρ=1.6102\n",
            "  M7      : LPPT ρ=1.9049, SPPT ρ=1.9772\n",
            "\n",
            "--- m = 100 machines ---\n",
            "  n=6600, OPT_pre=405970.0\n",
            "  LPT=405970.0 (ρ=1.0000), SPT=698010.0 (ρ=1.7194), Random=929710.3 (ρ=2.2901)\n",
            "  M1      : LPPT ρ=1.8282, SPPT ρ=1.5910\n",
            "  M3      : LPPT ρ=1.9770, SPPT ρ=2.1409\n",
            "  M4      : LPPT ρ=1.4756, SPPT ρ=1.8279\n",
            "  M5      : LPPT ρ=1.7928, SPPT ρ=1.9009\n",
            "  M6      : LPPT ρ=1.9475, SPPT ρ=1.9903\n",
            "  M7      : LPPT ρ=1.7001, SPPT ρ=1.9370\n",
            "\n",
            "================================================================================\n",
            "SUMMARY TABLE\n",
            "================================================================================\n",
            "\n",
            "    m     OPT_pre     LPT ρ     SPT ρ    Rand ρ\n",
            "--------------------------------------------------\n",
            "    5    130642.0    1.0000    1.3760    1.3844\n",
            "   10    156477.0    1.0001    1.4732    1.9221\n",
            "   20    221916.2    1.0000    1.3683    1.9239\n",
            "   50    266545.1    1.0000    1.7572    1.9670\n",
            "  100    405970.0    1.0000    1.7194    2.2901\n",
            "\n",
            "  Method     m    LPPT ρ    SPPT ρ\n",
            "----------------------------------------\n",
            "      M1     5    1.5554    1.6472\n",
            "      M1    10    1.7477    1.5129\n",
            "      M1    20    1.6691    1.9555\n",
            "      M1    50    1.8037    1.7973\n",
            "      M1   100    1.8282    1.5910\n",
            "      M3     5    1.3640    1.7030\n",
            "      M3    10    2.2087    1.5956\n",
            "      M3    20    2.0607    1.6392\n",
            "      M3    50    2.0729    1.6846\n",
            "      M3   100    1.9770    2.1409\n",
            "      M4     5    1.4301    1.3737\n",
            "      M4    10    1.7376    1.8655\n",
            "      M4    20    1.6992    1.4648\n",
            "      M4    50    1.5253    2.0523\n",
            "      M4   100    1.4756    1.8279\n",
            "      M5     5    1.4537    1.3134\n",
            "      M5    10    1.7663    1.9572\n",
            "      M5    20    1.7800    1.7767\n",
            "      M5    50    1.6497    1.7061\n",
            "      M5   100    1.7928    1.9009\n",
            "      M6     5    1.5305    1.6557\n",
            "      M6    10    1.6004    1.7986\n",
            "      M6    20    1.7059    1.5573\n",
            "      M6    50    1.4298    1.6102\n",
            "      M6   100    1.9475    1.9903\n",
            "      M7     5    1.4591    1.6234\n",
            "      M7    10    1.6230    1.7049\n",
            "      M7    20    2.5211    2.4727\n",
            "      M7    50    1.9049    1.9772\n",
            "      M7   100    1.7001    1.9370\n",
            "================================================================================\n",
            "\n",
            "[INFO] Results saved to makespan_results.csv\n"
          ]
        }
      ]
    }
  ]
}