{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "068ee844",
   "metadata": {},
   "source": [
    "# 算法"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "68e3d97a",
   "metadata": {},
   "source": [
    "## Ours"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09ceb97f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from dataclasses import dataclass\n",
    "from typing import List, Optional, Tuple, Dict\n",
    "\n",
    "@dataclass\n",
    "class BPACConfig:\n",
    "    \"\"\"\n",
    "    B-PAC 算法超参数配置\n",
    "    \"\"\"\n",
    "    alpha: float = 0.1          # 容错概率 (1-Confidence), 例如 0.1 代表 90% 置信度\n",
    "    epsilon: float = 0.1      # 容忍的风险上限 (Error Tolerance), 例如 0.05 代表允许 5% 的性能损失\n",
    "    rho: float = 0.1           # 最小探索概率 (Minimum Exploration Probability),0到1\n",
    "    beta: float = 1.0           # FTRL 正则化参数,0到无穷\n",
    "    c_clip: float = 0.9         # 投注截断常数，0到1\n",
    "    num_thresholds: int = 1001   # 阈值搜索空间的精细度\n",
    "    warm_up: int = 50          # 初始预热步数\n",
    "    rho_0: float = 0.05\n",
    "    rho_1: float = 0.6\n",
    "    change_point: int = 200\n",
    "\n",
    "def compute_step_loss(y_correct_t: int, y_hat_correct_t: int) -> float:\n",
    "    \"\"\"\n",
    "    单步 Loss 计算，对应你提供的逻辑：\n",
    "    Loss = 1 当且仅当 (专家对 AND 小模型错)\n",
    "    \"\"\"\n",
    "    # y_correct_t: 1 if expert is correct, 0 else\n",
    "    # y_hat_correct_t: 1 if instant model is correct, 0 else\n",
    "    \n",
    "    weak_wrong = 1 - y_hat_correct_t\n",
    "    # 只有 strong 正确 & weak 错误 时才记为 1\n",
    "    loss = float(y_correct_t * weak_wrong)\n",
    "    return loss\n",
    "\n",
    "def compute_step_loss_open(y_correct_t: int, y_hat_correct_t: int, max_val: float) -> float:\n",
    "    \"\"\"\n",
    "    单步 Loss 计算（修改版）\n",
    "    \n",
    "    新公式：\n",
    "    loss = (y_correct_t - y_hat_correct_t) / max_val\n",
    "    \n",
    "    参数:\n",
    "        y_correct_t: int, 1-10, 表示专家的分数\n",
    "        y_hat_correct_t: int, 1-10 表示小模型的分数\n",
    "        max_val: float, 用于归一化的分母（通常 > 0），当前最大的可能分数差值。\n",
    "    \n",
    "    返回:\n",
    "        float, 计算得到的单步 loss 值\n",
    "    \"\"\"\n",
    "    if max_val == 0:\n",
    "        raise ValueError(\"max_val cannot be zero to avoid division by zero\")\n",
    "    y_correct_t = np.asanyarray(y_correct_t,dtype=float)\n",
    "    y_hat_correct_t = np.asanyarray(y_hat_correct_t,dtype=float)\n",
    "    loss = np.sqrt((y_correct_t - y_hat_correct_t) / max_val)\n",
    "\n",
    "    return loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc3cc3ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "class BPAC:\n",
    "    def __init__(self, config: BPACConfig):\n",
    "        self.cfg = config\n",
    "        self.threshold_candidates = np.linspace(0, 1, self.cfg.num_thresholds)\n",
    "\n",
    "        # 状态初始化\n",
    "        self.current_u_idx = 0 \n",
    "        self.current_u = self.threshold_candidates[self.current_u_idx]\n",
    "        self.wealth = np.ones(self.cfg.num_thresholds) # K_0 = 1\n",
    "        \n",
    "        # FTRL 统计量\n",
    "        self.sum_D = np.zeros(self.cfg.num_thresholds)\n",
    "        self.sum_D_sq = np.zeros(self.cfg.num_thresholds)\n",
    "\n",
    "    def get_action(self, uncertainty_score: float):\n",
    "        \"\"\"\n",
    "        Returns:\n",
    "            action (int): 1 (Expert), 0 (Instant)\n",
    "            propensity (float): The probability of choosing Expert (pi_t)\n",
    "        \"\"\"\n",
    "        # 策略: pi_t = I(U >= u) + rho * I(U < u)\n",
    "        if uncertainty_score >= self.current_u:\n",
    "            # 必须调用专家\n",
    "            return 1, 1.0\n",
    "        else:\n",
    "            # 尝试使用小模型，但有 rho 的概率探索\n",
    "            # prop 是指“在这个不确定性下，算法设计上调用专家的概率”\n",
    "            propensity = self.cfg.rho \n",
    "            \n",
    "            # 实际采样动作\n",
    "            is_exploring = np.random.rand() < self.cfg.rho\n",
    "            action = 1 if is_exploring else 0\n",
    "            \n",
    "            return action, propensity\n",
    "\n",
    "    def update(self, uncertainty_score: float, action: int, observed_loss: Optional[float]):\n",
    "        \"\"\"\n",
    "        核心更新逻辑 (Bandit Feedback)\n",
    "        论文中的 update 仅依赖于 'observed' 数据\n",
    "        \"\"\"\n",
    "        # 1. 数据准备\n",
    "        # 如果 action=0 (没调专家)，则 observed_loss 为 None，但在公式中 l_t * xi_t 会变成 0\n",
    "        l_t = observed_loss if observed_loss is not None else 0.0\n",
    "        xi_t = action\n",
    "        \n",
    "        # 2. 计算 Propensity 向量 (Vectorized for all u)\n",
    "        # indicator_less: I(U_t < u)\n",
    "        indicator_less = (uncertainty_score < self.threshold_candidates).astype(float)\n",
    "        # pi_t(u)\n",
    "        if uncertainty_score < self.current_u:\n",
    "            pi_t = self.cfg.rho\n",
    "        else:\n",
    "            pi_t = 1.0\n",
    "        \n",
    "        # 3. 计算 Payoff D_t(u)\n",
    "        # D_t = epsilon - (l_t * xi_t * I(U < u)) / pi_t\n",
    "        weighted_loss = (1-self.cfg.rho_0)*(l_t * xi_t * indicator_less) / pi_t\n",
    "\n",
    "        # epsilon = epsilon / (1 - rho) 调整\n",
    "        # epsilon = self.cfg.epsilon / (1.0 - self.cfg.rho)\n",
    "        D_t = self.cfg.epsilon - weighted_loss\n",
    "        \n",
    "        # 4. FTRL Lambda 更新 [cite: 199]\n",
    "        denom = self.sum_D_sq + self.cfg.beta\n",
    "        denom[denom == 0] = 1e-9 # 避免除零\n",
    "        lambda_raw = self.sum_D / denom\n",
    "        \n",
    "        M_t = max(self.cfg.epsilon,((1.0-self.cfg.rho_0)/self.cfg.rho)-self.cfg.epsilon)\n",
    "        upper_bound = self.cfg.c_clip / M_t\n",
    "        lambda_t = np.clip(lambda_raw, 0, upper_bound)\n",
    "        \n",
    "        # 5. 财富更新\n",
    "        self.wealth = self.wealth * (1.0 + lambda_t * D_t)\n",
    "        self.sum_D += D_t\n",
    "        self.sum_D_sq += (D_t ** 2)\n",
    "\n",
    "        # # 6. 阈值选择 [cite: 166]\n",
    "        # valid_indices = np.where(self.wealth >= (1.0 / self.cfg.alpha))[0]\n",
    "        \n",
    "        is_safe_mask = (self.wealth >= (1.0 / self.cfg.alpha))\n",
    "        prefix_safe_mask = np.logical_and.accumulate(is_safe_mask)\n",
    "        valid_indices = np.where(prefix_safe_mask)[0]\n",
    "\n",
    "        if len(valid_indices) > 0:\n",
    "            self.current_u_idx = valid_indices[-1]\n",
    "            self.current_u = self.threshold_candidates[self.current_u_idx]\n",
    "        else:\n",
    "            # Fallback to safest (all expert)\n",
    "            self.current_u_idx = 0\n",
    "            self.current_u = 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66c3095c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_simulation(data_sequence: List[Dict], config: BPACConfig):\n",
    "    \"\"\"\n",
    "    data_sequence: List of item dicts\n",
    "    item keys: \"uncertainty\", \"instant_correct\", \"expert_correct\", \"instant_token\", \"expert_token\"\n",
    "    \"\"\"\n",
    "    model = BPAC(config)\n",
    "    logs = []\n",
    "    warm_up = config.warm_up\n",
    "    # print(f\"Start Simulation with {len(data_sequence)} samples...\")\n",
    "    # print(f\"Config: Epsilon={config.epsilon}, Alpha={config.alpha}, Rho={config.rho}\")\n",
    "    magpie_seq = [item for item in data_sequence if item[\"dataset\"]==\"magpie\"]\n",
    "    if len(magpie_seq)>0:\n",
    "        diffs = []\n",
    "        for i in magpie_seq:\n",
    "            diff = i['expert_score'] - i['instant_score']\n",
    "            diffs.append(diff)\n",
    "        max_diff = max(diffs)\n",
    "    \n",
    "    # 累积变量\n",
    "    total_actual_tokens = 0\n",
    "    total_baseline_tokens = 0\n",
    "    cumulative_loss = 0\n",
    "    expert_calls = 0\n",
    "\n",
    "    for t, item in enumerate(data_sequence):\n",
    "        # 1. 提取特征\n",
    "        u_t = item['uncertainty']\n",
    "        inst_corr = item['instant_correct'] if item[\"dataset\"] != \"magpie\" else item['instant_score']\n",
    "        exp_corr = item['expert_correct'] if item[\"dataset\"] != \"magpie\" else item['expert_score']\n",
    "        inst_tok = item['instant_token']\n",
    "        exp_tok = item['expert_token']\n",
    "        data = item[\"dataset\"]\n",
    "        # 2. 算法决策\n",
    "        \n",
    "        if t< model.cfg.change_point:\n",
    "            model.cfg.rho = model.cfg.rho_1\n",
    "        else:\n",
    "            model.cfg.rho = model.cfg.rho_0\n",
    "            \n",
    "        action, propensity = model.get_action(u_t)\n",
    "        \n",
    "        # 3. 计算 Loss\n",
    "        # (A) True Loss: 上帝视角，用于评估和画图\n",
    "        # 即使 action=0，如果小模型错了专家对了，这里也是 1\n",
    "        if data ==\"magpie\":\n",
    "            true_loss = compute_step_loss_open(exp_corr, inst_corr, max_val=max_diff)\n",
    "        else:\n",
    "            true_loss = compute_step_loss(exp_corr, inst_corr)\n",
    "        \n",
    "        # (B) Observed Loss: 算法视角 (Bandit Feedback) \n",
    "        # 只有调用了专家 (action=1)，算法才能看到 loss\n",
    "        # 如果 action=0，算法不知道 loss，传入 None (内部处理为0)\n",
    "        observed_loss = true_loss if action == 1 else None\n",
    "        \n",
    "        # 4. 算法更新\n",
    "        model.update(u_t, action, observed_loss)\n",
    "        if t < warm_up:\n",
    "            continue\n",
    "\n",
    "        # 5. Token 消耗计算\n",
    "        # Baseline: 假设全用 Expert\n",
    "        step_baseline_tokens = exp_tok\n",
    "        \n",
    "        # Actual: \n",
    "        # Action 0 -> instant\n",
    "        # Action 1 -> instant + expert (Cascade)\n",
    "        if action == 1:\n",
    "            step_actual_tokens = inst_tok + exp_tok\n",
    "            expert_calls += 1\n",
    "        else:\n",
    "            step_actual_tokens = inst_tok\n",
    "\n",
    "        total_actual_tokens += step_actual_tokens\n",
    "        total_baseline_tokens += step_baseline_tokens\n",
    "        \n",
    "        # 计算当前的 Token Ratio (Accumulated)\n",
    "        # 避免除以0\n",
    "        current_token_ratio = total_actual_tokens / total_baseline_tokens if total_baseline_tokens > 0 else 1.0\n",
    "        # 计算当前专家调用的比例\n",
    "        current_expert_ratio = expert_calls / (t - warm_up + 1)\n",
    "        # 计算当前的 Average Risk (Accumulated True Loss / t)\n",
    "        if action == 0:\n",
    "            cumulative_loss += true_loss # 当调用小模型时，才记录损失\n",
    "        else:\n",
    "            cumulative_loss += 0 # 注意调用专家时，损失永远为0.\n",
    "        current_avg_risk = cumulative_loss / (t + 1 - warm_up)\n",
    "\n",
    "        # 计算当前选的位置的财富\n",
    "        wealth = model.wealth[model.current_u_idx]\n",
    "        # 6. 记录日志\n",
    "        logs.append({\n",
    "            \"step\": t,\n",
    "            \"uncertainty\": u_t,\n",
    "            \"threshold\": model.current_u,      # 当前阈值\n",
    "            \"action\": action,                  # 1=Expert, 0=Instant\n",
    "            \"true_loss\": true_loss,            # 真实损失 (上帝视角)\n",
    "            \"observed_loss\": observed_loss if observed_loss is not None else np.nan,\n",
    "            \"avg_risk\": current_avg_risk,      # 累积平均风险\n",
    "            \"token_ratio\": current_token_ratio, # 累积 Token 消耗比\n",
    "            \"expert_call_ratio\": current_expert_ratio, # 累积专家调用比\n",
    "            \"wealth\": wealth                   # 当前财富水平\n",
    "        })\n",
    "\n",
    "    return pd.DataFrame(logs), model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e4045b12",
   "metadata": {},
   "source": [
    "## Base"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b37372b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy.stats import norm\n",
    "\n",
    "def compute_pac_loss(y_correct, y_hat_correct):\n",
    "    \"\"\"\n",
    "    计算二元 Loss (Eq. 9 in paper)\n",
    "    gt_anss 在这里只是占位，因为 y_correct 和 y_hat_correct 已经是 0/1 了\n",
    "    \"\"\"\n",
    "    y_correct = np.array(y_correct, dtype=float)\n",
    "    y_hat_correct = np.array(y_hat_correct, dtype=float)\n",
    "    \n",
    "    # 只有当：专家对(1) 且 小模型错(0) 时，Loss = 1\n",
    "    weak_wrong = 1.0 - y_hat_correct\n",
    "    loss = (y_correct * weak_wrong)\n",
    "    return loss\n",
    "\n",
    "def compute_pac_loss_open(y_correct_t: int, y_hat_correct_t: int, max_val: float) -> float:\n",
    "    \"\"\"\n",
    "    单步 Loss 计算（修改版）\n",
    "    \n",
    "    新公式：\n",
    "    loss = (y_correct_t - y_hat_correct_t) / max_val\n",
    "    \n",
    "    参数:\n",
    "        y_correct_t: int, 1-10, 表示专家的分数\n",
    "        y_hat_correct_t: int, 1-10 表示小模型的分数\n",
    "        max_val: float, 用于归一化的分母（通常 > 0），当前最大的可能分数差值。\n",
    "    \n",
    "    返回:\n",
    "        float, 计算得到的单步 loss 值\n",
    "    \"\"\"\n",
    "    if max_val == 0:\n",
    "        raise ValueError(\"max_val cannot be zero to avoid division by zero\")\n",
    "    y_correct_t = np.asanyarray(y_correct_t,dtype=float)\n",
    "    y_hat_correct_t = np.asanyarray(y_hat_correct_t,dtype=float)\n",
    "    loss = np.sqrt((y_correct_t - y_hat_correct_t) / max_val)\n",
    "    # k = 1\n",
    "    # loss = (1-np.exp(-k*(y_correct_t - y_hat_correct_t))) / (1 - np.exp(-k*max_val))\n",
    "\n",
    "    return loss\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "def split_calib_test(y, calib_ratio=None, calib_num=None, seed=None):\n",
    "    \"\"\"\n",
    "    将数据随机划分为修正集 (calibration set) 和测试集 (test set)\n",
    "\n",
    "    参数:\n",
    "      y           : 数据 (长度 n)\n",
    "      calib_ratio : 修正集占比 (0, 1)\n",
    "      calib_num   : 修正集样本数 (int)\n",
    "      seed        : 随机种子\n",
    "\n",
    "    返回:\n",
    "      calib_idx, test_idx\n",
    "    \"\"\"\n",
    "    # -------- 参数合法性检查 --------\n",
    "    if calib_ratio is None and calib_num is None:\n",
    "        raise ValueError(\"必须指定 calib_ratio 或 calib_num 其中之一\")\n",
    "\n",
    "    if calib_ratio is not None and calib_num is not None:\n",
    "        raise ValueError(\"calib_ratio 和 calib_num 不能同时指定\")\n",
    "\n",
    "    if calib_ratio is not None:\n",
    "        if not (0 < calib_ratio < 1):\n",
    "            raise ValueError(\"calib_ratio 必须在 (0, 1) 之间\")\n",
    "\n",
    "    if calib_num is not None:\n",
    "        if not isinstance(calib_num, int) or calib_num <= 0:\n",
    "            raise ValueError(\"calib_num 必须是正整数\")\n",
    "\n",
    "    # -------- 随机数生成器 --------\n",
    "    rng = np.random.default_rng(seed)\n",
    "\n",
    "    n = len(y)\n",
    "\n",
    "    # -------- 计算 calib_size --------\n",
    "    if calib_ratio is not None:\n",
    "        calib_size = int(n * calib_ratio)\n",
    "    else:\n",
    "        calib_size = calib_num\n",
    "\n",
    "    if calib_size >= n:\n",
    "        raise ValueError(\"calibration set 的大小必须小于数据总量\")\n",
    "\n",
    "    # -------- 打乱并划分 --------\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "\n",
    "    calib_idx = indices[:calib_size]\n",
    "    test_idx = indices[calib_size:]\n",
    "\n",
    "    return calib_idx, test_idx\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy.stats import norm\n",
    "\n",
    "def run_baseline_continuous(\n",
    "    data_seq,\n",
    "    calib_ratio=0.5, epsilon=0.05, alpha=0.05, seed=42,\n",
    "    pi=0.5, m=None , shuffle=False\n",
    "):\n",
    "    \"\"\"\n",
    "    连续模拟 Two-Stage Baseline，其中计算 u_hat 的逻辑严格复刻原始代码。\n",
    "    \"\"\"\n",
    "    # -------------------------------------------------------------------------\n",
    "    # 1. 数据准备与重排 (Calibration -> Test)\n",
    "    # -------------------------------------------------------------------------\n",
    "\n",
    "    y_solved = [d['expert_correct'] if d['dataset'] != 'magpie' else d['expert_score'] for d in data_seq]\n",
    "    y_hat_solved = [d['instant_correct'] if d['dataset'] != 'magpie' else d['instant_score'] for d in data_seq]\n",
    "    \n",
    "    U = [d['uncertainty'] for d in data_seq]\n",
    "    y_token = [d['expert_token'] for d in data_seq]\n",
    "    y_hat_token = [d['instant_token'] for d in data_seq]\n",
    "    data_name = [d['dataset'] for d in data_seq]\n",
    "\n",
    "    if \"magpie\" in data_name:\n",
    "        magpie_seq = [item for item in data_seq if item[\"dataset\"]==\"magpie\"]\n",
    "        diffs = []\n",
    "        for i in magpie_seq:\n",
    "            diff = i['expert_score'] - i['instant_score']\n",
    "            diffs.append(diff)\n",
    "        max_diff = max(diffs)\n",
    "\n",
    "    n = len(y_solved)\n",
    "    calib_size = int(n * calib_ratio)\n",
    "    \n",
    "    # 设定随机种子\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # 打乱索引\n",
    "    indices = np.arange(n)\n",
    "    if shuffle:\n",
    "        rng.shuffle(indices)\n",
    "    \n",
    "    # 重排索引：前 calib_size 放前面，剩下的放后面\n",
    "    sorted_indices = np.concatenate([indices[:calib_size], indices[calib_size:]])\n",
    "    \n",
    "    # 辅助重排函数\n",
    "    def reorder(arr):\n",
    "        return [arr[i] for i in sorted_indices]\n",
    "    \n",
    "    # 重排数据\n",
    "    y_s = reorder(y_solved)       # 专家正确性\n",
    "    y_h = reorder(y_hat_solved)   # 小模型正确性\n",
    "    u_s_list = reorder(U)         # 不确定性 (list)\n",
    "    tok_s = reorder(y_token)      # 专家Token\n",
    "    tok_h = reorder(y_hat_token)  # 小模型Token\n",
    "    \n",
    "    # -------------------------------------------------------------------------\n",
    "    # 2. 计算固定阈值 u_hat (完全保留原始逻辑)\n",
    "    # -------------------------------------------------------------------------\n",
    "    \n",
    "    # (A) 提取校准数据 (前 calib_size 个)\n",
    "    y_solved_c = np.array(y_s[:calib_size], dtype=object)\n",
    "    y_hat_solved_c = np.array(y_h[:calib_size], dtype=object)\n",
    "    U_c = np.asarray(u_s_list[:calib_size], dtype=float)\n",
    "    \n",
    "    n_c = len(y_solved_c)\n",
    "    \n",
    "    # 如果没指定 m，默认使用校准集大小 (和原始代码中 calib_idx.shape[0] 对应)\n",
    "    if m is None:\n",
    "        m = n_c\n",
    "\n",
    "    # (B) 计算 Full Loss\n",
    "    loss_full_c = []\n",
    "    for i in range(n_c):\n",
    "        if data_name[i] ==\"magpie\":\n",
    "            loss = compute_pac_loss_open(y_solved_c[i], y_hat_solved_c[i], max_val=max_diff)\n",
    "        else:\n",
    "            loss = compute_pac_loss(y_solved_c[i], y_hat_solved_c[i])\n",
    "        loss_full_c.append(loss)\n",
    "    \n",
    "    loss_full_c = np.array(loss_full_c, dtype=float)\n",
    "\n",
    "    # (C) 放回抽样 (Bootstrap) --- [原始逻辑]\n",
    "    sample_idx = rng.choice(n_c, size=m, replace=True)\n",
    "    U_s_sampled = U_c[sample_idx] # 注意变量名避免冲突\n",
    "    loss_s_sampled = loss_full_c[sample_idx]\n",
    "\n",
    "    # (D) Bernoulli(pi) 审计 --- [原始逻辑]\n",
    "    phi = (rng.random(m) < pi).astype(float)\n",
    "    weights = phi / pi\n",
    "\n",
    "    # (E) 定义 UCB 函数 --- [原始逻辑]\n",
    "    z = norm.ppf(1 - alpha)\n",
    "    \n",
    "    def upper_conf_bound(u):\n",
    "        mask = (U_s_sampled <= u).astype(float)\n",
    "        X = loss_s_sampled * mask * weights\n",
    "        mean = X.mean()\n",
    "        std = X.std(ddof=1) if X.size > 1 else 0.0\n",
    "        return mean + z * std / np.sqrt(X.size)\n",
    "\n",
    "    # (F) Grid Search 与 u_hat 选择 --- [原始逻辑]\n",
    "    # 原始代码使用 np.sort(U_c) 作为网格\n",
    "    u_grid = np.sort(U_c)\n",
    "    \n",
    "    # 计算所有点的 UCB\n",
    "    ucbs = np.array([upper_conf_bound(u) for u in u_grid])\n",
    "    \n",
    "    # 找到满足条件的最大的 u\n",
    "    ok_idx = np.where(ucbs <= epsilon)[0]\n",
    "    if ok_idx.size > 0:\n",
    "        u_hat = float(u_grid[ok_idx.max()])\n",
    "    else:\n",
    "        # 如果都不满足，取最小的（最保守）\n",
    "        u_hat = float(u_grid[0])\n",
    "        \n",
    "    # print(f\"Run Seed={seed}: u_hat = {u_hat:.4f}\")\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    # 3. 连续流模拟 (生成 Logs)\n",
    "    # -------------------------------------------------------------------------\n",
    "    logs = []\n",
    "    \n",
    "    total_actual_tokens = 0\n",
    "    total_baseline_tokens = 0\n",
    "    cumulative_loss = 0\n",
    "    expert_calls = 0\n",
    "    \n",
    "    for t in range(n):\n",
    "        cur_u = u_s_list[t]\n",
    "        cur_y_exp = y_s[t]\n",
    "        cur_y_inst = y_h[t]\n",
    "        cur_tok_exp = tok_s[t]\n",
    "        cur_tok_inst = tok_h[t]\n",
    "        \n",
    "        # 判断阶段\n",
    "        is_in_calibration = (t < calib_size)\n",
    "        \n",
    "        if is_in_calibration:\n",
    "            # === 校准阶段 ===\n",
    "            action = 1 \n",
    "            threshold = 0.0 # 占位\n",
    "            \n",
    "            # 校准阶段虽然强制调用专家，但为了对齐 Risk 曲线的起点，\n",
    "            # 我们通常认为此时 Loss=0 (因为获得了专家/真实标签)\n",
    "            true_loss = 0.0 \n",
    "            observed_loss = 0.0\n",
    "        else:\n",
    "            # === 测试阶段 ===\n",
    "            threshold = u_hat\n",
    "            # 应用计算出的 u_hat\n",
    "            if cur_u >= u_hat:\n",
    "                action = 1 # Expert\n",
    "            else:\n",
    "                action = 0 # Instant\n",
    "            \n",
    "            # Loss 计算\n",
    "            if action == 1:\n",
    "                true_loss = 0.0\n",
    "                observed_loss = 0.0\n",
    "            else:\n",
    "                # 没调专家：如果专家对(1)且小模型错(0)，则 loss=1\n",
    "                true_loss = float(cur_y_exp == 1 and cur_y_inst == 0)\n",
    "                observed_loss = None\n",
    "\n",
    "        # 统计\n",
    "        step_baseline = cur_tok_exp\n",
    "        if action == 1:\n",
    "            step_actual = cur_tok_inst + cur_tok_exp\n",
    "            expert_calls += 1\n",
    "        else:\n",
    "            step_actual = cur_tok_inst\n",
    "            \n",
    "        total_actual_tokens += step_actual\n",
    "        total_baseline_tokens += step_baseline\n",
    "        cumulative_loss += true_loss\n",
    "        \n",
    "        # 实时指标\n",
    "        current_token_ratio = total_actual_tokens / total_baseline_tokens if total_baseline_tokens > 0 else 1.0\n",
    "        current_expert_ratio = expert_calls / (t + 1)\n",
    "        current_avg_risk = cumulative_loss / (t + 1)\n",
    "        \n",
    "        logs.append({\n",
    "            \"step\": t,\n",
    "            \"phase\": \"Calibration\" if is_in_calibration else \"Test\",\n",
    "            \"uncertainty\": cur_u,\n",
    "            \"threshold\": threshold,\n",
    "            \"action\": action,\n",
    "            \"true_loss\": true_loss,\n",
    "            \"observed_loss\": observed_loss,\n",
    "            \"avg_risk\": current_avg_risk,\n",
    "            \"token_ratio\": current_token_ratio,\n",
    "            \"expert_call_ratio\": current_expert_ratio,\n",
    "            \"wealth\": 1.0 # 占位，方便画图\n",
    "        })\n",
    "        \n",
    "    return pd.DataFrame(logs), u_hat"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e535b4e",
   "metadata": {},
   "source": [
    "# use mmlupro and bbh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2c60cdc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "expert_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/mmlupro/qwen3-think.json\")\n",
    "expert_data2 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/mmlupro/qwen3-think2224.2224--1.json\")\n",
    "expert_data3 = pd.concat([expert_data1, expert_data2.iloc[1:, :]], ignore_index=True)\n",
    "expert_data_mmlu = expert_data3[expert_data3[\"matched\"] == True]\n",
    "session_ids = list(expert_data_mmlu[\"session_id\"])\n",
    "\n",
    "instant_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/mmlupro/qwen3-ins1.json\")\n",
    "instant_data_mmlu = instant_data1[instant_data1[\"session_id\"].isin(session_ids)]\n",
    "expert_data_mmlu.shape, instant_data_mmlu.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64559136",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "expert_data3 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-think.json\")\n",
    "expert_data_bbh = expert_data3[expert_data3[\"matched\"] == True]\n",
    "session_ids = list(expert_data_bbh[\"session_id\"])\n",
    "\n",
    "instant_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-ins.json\")\n",
    "instant_data_bbh = instant_data1[instant_data1[\"session_id\"].isin(session_ids)]\n",
    "expert_data_bbh.shape, instant_data_bbh.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a1fd5e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "ratio = 0.5          # 删除比例\n",
    "seed = 42            # 随机种子，保证可复现\n",
    "\n",
    "# 找到 matched == True 的索引\n",
    "matched_idx = instant_data_bbh[instant_data_bbh[\"matched\"] == True].index\n",
    "\n",
    "# 要删除的数量\n",
    "n_drop = int(len(matched_idx) * ratio)\n",
    "\n",
    "# 随机选取要删除的索引\n",
    "drop_idx = np.random.RandomState(seed).choice(\n",
    "    matched_idx,\n",
    "    size=n_drop,\n",
    "    replace=False\n",
    ")\n",
    "\n",
    "# 删除\n",
    "instant_data_bbh_reduced = instant_data_bbh.drop(drop_idx)\n",
    "\n",
    "print(\"原始大小:\", instant_data_bbh.shape)\n",
    "print(\"删除后大小:\", instant_data_bbh_reduced.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a617bed8",
   "metadata": {},
   "outputs": [],
   "source": [
    "instant_data_bbh[\"matched\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2f5f38b",
   "metadata": {},
   "outputs": [],
   "source": [
    "expert_data = pd.concat([expert_data_mmlu, expert_data_bbh], ignore_index=True)\n",
    "instant_data = pd.concat([instant_data_mmlu, instant_data_bbh_reduced], ignore_index=True)\n",
    "expert_data.shape, instant_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1eb53b9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "expert_data[\"dataset\"].unique(), instant_data[\"dataset\"].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1811947c",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list =[]\n",
    "selcet_ids =[]\n",
    "for i,row in expert_data.iterrows():\n",
    "    session_id = row['session_id']\n",
    "    if session_id not in instant_data['session_id'].values:\n",
    "        print(f\"Instant data missing at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict = {}\n",
    "    instant_row = instant_data[instant_data['session_id'] == session_id].iloc[0]\n",
    "    selcet_ids.append(session_id)\n",
    "    if instant_row[\"question\"] != row[\"question\"]:\n",
    "        print(f\"Question mismatch at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict['uncertainty'] = 1- instant_row['token_probs'][0]\n",
    "    tmp_dict['instant_correct'] = int(instant_row['matched']) if instant_row['matched'] != \"No answer extracted\" else 0\n",
    "    tmp_dict['expert_correct'] = int(expert_data.loc[i, 'matched'])\n",
    "    tmp_dict['instant_token'] = instant_row['gen_token_count']\n",
    "    tmp_dict['expert_token'] = expert_data.loc[i, 'gen_token_count']\n",
    "    tmp_dict['dataset'] = row['dataset']\n",
    "    data_list.append(tmp_dict)\n",
    "\n",
    "print(f\"Total samples prepared: {len(data_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "130948b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(data_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ed76282",
   "metadata": {},
   "outputs": [],
   "source": [
    "split_ind =0\n",
    "for i, item in enumerate(data_list):\n",
    "    if item['dataset'] == 'bbh':\n",
    "        split_ind = i\n",
    "        break\n",
    "split_ind"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7484c4f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list[3738:3739]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c9851b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm \n",
    "\n",
    "# data_list[:split_ind] 是 Dataset A (MMLU)\n",
    "# data_list[split_ind:] 是 Dataset B (BBH)\n",
    "data_list_A1 = data_list[:split_ind]\n",
    "data_list_A = data_list_A1[:1500]  # 取前 1500 个作为 Dataset A\n",
    "data_list_B = data_list[split_ind:]\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100         # 重复次数\n",
    "TARGET_EPSILON = 0.05   # 目标 Risk\n",
    "TARGET_ALPHA = 0.1     # 置信度\n",
    "CALIB_NUM = 500        # 校准集大小\n",
    "TOTAL_NUM = len(data_list_A) + len(data_list_B) # 所有的题目数量\n",
    "Calib_ratio = CALIB_NUM / TOTAL_NUM\n",
    "\n",
    "print(f\"Non-Stationary Setup: Calib on A ({len(data_list_A)}), Test on B ({len(data_list_B)})\")\n",
    "\n",
    "# -----------------------------------------------------------\n",
    "# 1. Baseline 循环 (非平稳数据流)\n",
    "# -----------------------------------------------------------\n",
    "all_risks, all_token_ratios, all_expert_ratios, all_u_hats = [], [], [], []\n",
    "\n",
    "print(f\"Starting Baseline Simulation ({NUM_RUNS} runs)...\")\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # === 关键：分别打乱 A 和 B，然后拼接 ===\n",
    "    # 打乱 A\n",
    "    idx_a = np.arange(len(data_list_A))\n",
    "    rng.shuffle(idx_a)\n",
    "    data_a_shuffled = [data_list_A[i] for i in idx_a]\n",
    "    \n",
    "    # 打乱 B\n",
    "    idx_b = np.arange(len(data_list_B))\n",
    "    rng.shuffle(idx_b)\n",
    "    data_b_shuffled = [data_list_B[i] for i in idx_b]\n",
    "    \n",
    "    # 拼接: 先 A 后 B (模拟分布偏移)\n",
    "    current_stream = data_a_shuffled + data_b_shuffled\n",
    "    \n",
    "    # 提取列表形式\n",
    "    # y_solved = [d['expert_correct'] for d in current_stream]\n",
    "    # y_hat_solved = [d['instant_correct'] for d in current_stream]\n",
    "    # U = [d['uncertainty'] for d in current_stream]\n",
    "    # y_tok = [d['expert_token'] for d in current_stream]\n",
    "    # y_hat_tok = [d['instant_token'] for d in current_stream]\n",
    "    \n",
    "    # 运行 Baseline (禁止内部 shuffle)\n",
    "    df, u_hat = run_baseline_continuous(\n",
    "        current_stream,\n",
    "        calib_ratio=Calib_ratio, # 确保刚好覆盖完 Dataset A\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=TARGET_ALPHA,\n",
    "        seed=seed,\n",
    "        shuffle=False # <--- 关键：保持 A->B 的顺序\n",
    "    )\n",
    "    \n",
    "    all_risks.append(df['avg_risk'].values)\n",
    "    all_token_ratios.append(df['token_ratio'].values)\n",
    "    all_expert_ratios.append(df['expert_call_ratio'].values)\n",
    "    all_u_hats.append(u_hat)\n",
    "\n",
    "arr_risks = np.array(all_risks)\n",
    "arr_token_ratios = np.array(all_token_ratios)\n",
    "arr_expert_ratios = np.array(all_expert_ratios)\n",
    "arr_thresholds = np.array(all_u_hats)\n",
    "\n",
    "print(\"\\nSimulation Finished!\")\n",
    "print(f\"Average u_hat across runs: {np.mean(all_u_hats):.4f} (std: {np.std(all_u_hats):.4f})\")\n",
    "\n",
    "# -----------------------------------------------------------\n",
    "# 2. BPAC 循环 (非平稳数据流)\n",
    "# -----------------------------------------------------------\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100        # 重复 100 次\n",
    "WARM_UP_STEPS = 0   # BPAC 的 Warm-up 步数 (对应 Baseline 的校准集大小)\n",
    "TARGET_EPSILON = 0.05  # 目标 Risk\n",
    "CFG_ALPHA= 0.1       # BPAC 参数\n",
    "CFG_RHO = 0      # BPAC 参数\n",
    "BPA_beta = 1\n",
    "CFG_RHO_0 = 0.05\n",
    "CFG_RHO_1 = 0.7\n",
    "CFG_CHANGE = 200\n",
    "\n",
    "bpac_risks, bpac_token_ratios, bpac_expert_ratios = [], [], []\n",
    "bpac_wealths, bpac_thresholds = [], []\n",
    "\n",
    "print(f\"Starting BPAC Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # === 关键：使用相同的逻辑构造数据流 (Data Alignment) ===\n",
    "    # 打乱 A\n",
    "    idx_a = np.arange(len(data_list_A))\n",
    "    rng.shuffle(idx_a)\n",
    "    data_a_shuffled = [data_list_A[i] for i in idx_a]\n",
    "    \n",
    "    # 打乱 B\n",
    "    idx_b = np.arange(len(data_list_B))\n",
    "    rng.shuffle(idx_b)\n",
    "    data_b_shuffled = [data_list_B[i] for i in idx_b]\n",
    "    \n",
    "    # 拼接\n",
    "    current_stream = data_a_shuffled + data_b_shuffled\n",
    "    \n",
    "    # 配置并运行 BPAC\n",
    "    cfg = BPACConfig(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS, # 前 A 个样本作为 Warm-up\n",
    "        num_thresholds=1001, \n",
    "        beta=BPA_beta, \n",
    "        c_clip=0.9,\n",
    "        rho_0=CFG_RHO_0,\n",
    "        rho_1=CFG_RHO_1,\n",
    "        change_point=CFG_CHANGE\n",
    "    )\n",
    "    \n",
    "    df_result, _ = run_simulation(current_stream, cfg)\n",
    "    \n",
    "    bpac_risks.append(df_result['avg_risk'].values)\n",
    "    bpac_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpac_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpac_wealths.append(df_result['wealth'].values)\n",
    "    bpac_thresholds.append(df_result['threshold'].values)\n",
    "\n",
    "bpac_risks_arr = np.array(bpac_risks)\n",
    "bpac_token_ratios_arr = np.array(bpac_token_ratios)\n",
    "bpac_expert_ratios_arr = np.array(bpac_expert_ratios)\n",
    "bpac_wealths_arr = np.array(bpac_wealths)\n",
    "bpac_thresholds_arr = np.array(bpac_thresholds)\n",
    "print(\"BPAC Simulation Finished!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "26a9c017",
   "metadata": {},
   "source": [
    "## 论文图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b270b0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "\n",
    "# ==========================================\n",
    "# 1. 设置阴影类型 (ICML 关键设置)\n",
    "# ==========================================\n",
    "# 选项: \n",
    "# 'std': 标准差 (Mean ± Std) -> 阴影最大，容易超线\n",
    "# 'sem': 标准误 (Mean ± Std/sqrt(N)) -> 阴影更窄，聚焦于均值的准确性 (推荐!)\n",
    "# 'ci':  95% 置信区间 (Bootstrap/t-distribution) -> 严谨的统计学区间\n",
    "# 'percentile': 分位数 (10% - 90%) -> 展示数据分布的真实范围\n",
    "\n",
    "SHADOW_TYPE = 'std' \n",
    "NUM_RUNS = 100  # 你的实验次数\n",
    "\n",
    "def get_error_bounds(data_array, type='sem',scale=1):\n",
    "    \"\"\"根据选择的类型计算上下界\"\"\"\n",
    "    mean_val = np.mean(data_array, axis=0)\n",
    "    \n",
    "    if type == 'std':\n",
    "        std_val = np.std(data_array, axis=0)\n",
    "        lower = mean_val - std_val\n",
    "        upper = mean_val + std_val\n",
    "        \n",
    "    return mean_val*scale, lower*scale, upper*scale\n",
    "\n",
    "# ==========================================\n",
    "# 2. 计算绘图数据\n",
    "# ==========================================\n",
    "steps = np.arange(bpac_risks_arr.shape[1])\n",
    "\n",
    "# --- 计算 Risk ---\n",
    "base_mean, base_low, base_high = get_error_bounds(arr_risks, SHADOW_TYPE)\n",
    "bpac_mean, bpac_low, bpac_high = get_error_bounds(bpac_risks_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 计算 ECP ---\n",
    "base_ecp_mean, base_ecp_low, base_ecp_high = get_error_bounds(arr_expert_ratios, SHADOW_TYPE,scale=100)\n",
    "bpac_ecp_mean, bpac_ecp_low, bpac_ecp_high = get_error_bounds(bpac_expert_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "\n",
    "# --- 计算 TCP ---\n",
    "base_tcp_mean, base_tcp_low, base_tcp_high = get_error_bounds(arr_token_ratios, SHADOW_TYPE,scale=100)\n",
    "bpac_tcp_mean, bpac_tcp_low, bpac_tcp_high = get_error_bounds(bpac_token_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "\n",
    "# --- 物理截断 (Clipping) ---\n",
    "# 无论用什么统计方法，Risk 和 Ratio 都不可能小于 0\n",
    "def clip_bounds(low, high, min_v=0, max_v=None):\n",
    "    low = np.maximum(low, min_v)\n",
    "    if max_v is not None:\n",
    "        high = np.minimum(high, max_v)\n",
    "    return low, high\n",
    "\n",
    "# base_low, base_high = clip_bounds(base_low, base_high, 0, 1.0) # Risk 通常<1，也可不设上限\n",
    "# bpac_low, bpac_high = clip_bounds(bpac_low, bpac_high, 0, 1.0)\n",
    "\n",
    "# base_ecp_low, base_ecp_high = clip_bounds(base_ecp_low, base_ecp_high, 0, 1.0)\n",
    "# bpac_ecp_low, bpac_ecp_high = clip_bounds(bpac_ecp_low, bpac_ecp_high, 0, 1.0)\n",
    "\n",
    "# base_tcp_low, base_tcp_high = clip_bounds(base_tcp_low, base_tcp_high, 0, None)\n",
    "# bpac_tcp_low, bpac_tcp_high = clip_bounds(bpac_tcp_low, bpac_tcp_high, 0, None)\n",
    "\n",
    "\n",
    "# ==========================================\n",
    "# 3. 绘图 (ICML Style)\n",
    "# ==========================================\n",
    "# 设置字体和线宽\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,                # 全局基础字号\n",
    "    'axes.labelsize': 20,           # 轴标签字号 (Risk, ECP, TCP) -> 加大到20\n",
    "    'axes.titlesize': 18,           # 标题字号 (如果你有标题的话)\n",
    "    'xtick.labelsize': 16,          # X轴刻度字号\n",
    "    'ytick.labelsize': 16,          # Y轴刻度字号\n",
    "    'legend.fontsize': 16,          # 图例字号 -> 加大到16\n",
    "    'lines.linewidth': 3.0,         # 线宽加粗，配合大图\n",
    "    'axes.grid': True, \n",
    "    'grid.alpha': 0.3, \n",
    "    'grid.linestyle': '--',\n",
    "    'figure.autolayout': True,\n",
    "    'savefig.dpi': 300\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "# 颜色\n",
    "C_BASE = '#1f77b4'  # Blue\n",
    "C_OURS = '#d62728'  # Red\n",
    "C_TGT = '#2ca02c'   # Green\n",
    "C_WARM = 'gray'\n",
    "\n",
    "# --- (a) Risk ---\n",
    "ax = axes[0]\n",
    "ax.plot(steps, base_mean, color=C_BASE, linestyle='--', label='PAC (Baseline)')\n",
    "ax.fill_between(steps, base_low, base_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_mean, color=C_OURS, linestyle='-', label='B-PAC (Ours)')\n",
    "ax.fill_between(steps, bpac_low, bpac_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=TARGET_EPSILON, color=C_TGT, linestyle='-', linewidth=2, label='Tolerance')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "# ax.set_title('(a) Cumulative Risk')\n",
    "ax.set_ylabel('ER',fontsize=22)\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 0.15) # 视野聚焦\n",
    "ax.yaxis.set_major_locator(MultipleLocator(0.05))\n",
    "\n",
    "# 图例中说明阴影含义\n",
    "if SHADOW_TYPE == 'sem':\n",
    "    shadow_label = 'Shaded: SEM'\n",
    "elif SHADOW_TYPE == 'std':\n",
    "    shadow_label = 'Shaded: Std. Dev.'\n",
    "elif SHADOW_TYPE == 'ci':\n",
    "    shadow_label = 'Shaded: 95% CI'\n",
    "else:\n",
    "    shadow_label = 'Shaded: 10-90% Pctl'\n",
    "    \n",
    "# 图例设置 (加大)\n",
    "from matplotlib.lines import Line2D\n",
    "handles, labels = ax.get_legend_handles_labels()\n",
    "handles.append(Line2D([0], [0], color='gray', alpha=0.3, linewidth=10)) # 如果不需要阴影说明可注释\n",
    "# labels.append(\"Shaded: SEM\")\n",
    "ax.legend(handles, labels, loc='upper right', frameon=True, framealpha=0.95, fontsize=16)\n",
    "\n",
    "# --- (b) ECP ---\n",
    "ax = axes[1]\n",
    "ax.plot(steps, base_ecp_mean, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_ecp_low, base_ecp_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_ecp_mean, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_ecp_low, bpac_ecp_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "# ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2, label='Warm-up End')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "# ax.set_title('(b) Expert Call Rate (ECP)')\n",
    "ax.set_ylabel('ECP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 100)\n",
    "# ax.legend(loc='upper right')\n",
    "\n",
    "# --- (c) TCP ---\n",
    "ax = axes[2]\n",
    "ax.plot(steps, base_tcp_mean, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_tcp_low, base_tcp_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_tcp_mean, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_tcp_low, bpac_tcp_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=100, color='black', linestyle='-.', linewidth=1.5, label='Full Expert')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "# ax.set_title('(c) Token Cost Ratio (TCP)')\n",
    "ax.set_ylabel('TP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('qwen3ins_mmlu_bbh.pdf', bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b265e289",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Baseline 数据\n",
    "base_mean_risk = np.mean(arr_risks, axis=0)\n",
    "base_std_risk = np.std(arr_risks, axis=0)\n",
    "\n",
    "base_mean_token = np.mean(arr_token_ratios, axis=0)\n",
    "base_std_token = np.std(arr_token_ratios, axis=0)\n",
    "\n",
    "base_mean_expert = np.mean(arr_expert_ratios, axis=0)\n",
    "base_std_expert = np.std(arr_expert_ratios, axis=0)\n",
    "\n",
    "# 获取总时间步数\n",
    "N_STEPS = bpac_risks_arr.shape[1] \n",
    "\n",
    "# A. 重构 Baseline Threshold 序列 (因为你只存了最终的一个 u_hat)\n",
    "# 逻辑：在校准期 (前 CALIB_NUM 步) 阈值设为 0，之后设为 u_hat\n",
    "# arr_thresholds 是 (100,) 的标量数组\n",
    "base_threshold_seqs = np.zeros((NUM_RUNS, N_STEPS))\n",
    "for i in range(NUM_RUNS):\n",
    "    base_threshold_seqs[i, :CALIB_NUM] = 0.0  # 校准期阈值通常视为0 (强制Action=1)\n",
    "    base_threshold_seqs[i, CALIB_NUM:] = arr_thresholds[i] # 测试期应用 u_hat\n",
    "\n",
    "base_mean_threshold_seq = np.mean(base_threshold_seqs, axis=0)\n",
    "base_std_threshold_seq = np.std(base_threshold_seqs, axis=0)\n",
    "\n",
    "\n",
    "# BPAC 数据\n",
    "bpac_mean_risk = np.mean(bpac_risks_arr, axis=0)\n",
    "bpac_std_risk = np.std(bpac_risks_arr, axis=0)\n",
    "\n",
    "bpac_mean_token = np.mean(bpac_token_ratios_arr, axis=0)\n",
    "bpac_std_token = np.std(bpac_token_ratios_arr, axis=0)\n",
    "\n",
    "bpac_mean_expert = np.mean(bpac_expert_ratios_arr, axis=0)\n",
    "bpac_std_expert = np.std(bpac_expert_ratios_arr, axis=0)\n",
    "\n",
    "# C. 确保其他 BPAC 统计量已准备好 (以防万一)\n",
    "bpac_mean_threshold = np.mean(bpac_thresholds_arr, axis=0)\n",
    "bpac_std_threshold = np.std(bpac_thresholds_arr, axis=0)\n",
    "\n",
    "# B. 计算 BPAC Wealth 统计 (你存了 bpac_wealths_arr 但还没算均值)\n",
    "bpac_mean_wealth = np.mean(bpac_wealths_arr, axis=0)\n",
    "bpac_std_wealth = np.std(bpac_wealths_arr, axis=0)\n",
    "\n",
    "\n",
    "fig, axes = plt.subplots(5, 1, figsize=(12, 24), sharex=True)\n",
    "(ax1, ax2, ax3, ax4, ax5) = axes\n",
    "\n",
    "# X轴\n",
    "steps = np.arange(N_STEPS)\n",
    "\n",
    "# === 子图 1: Average Risk (风险控制) ===\n",
    "ax1.plot(steps, base_mean_risk, label='Baseline', color='blue', linestyle='--')\n",
    "ax1.fill_between(steps, base_mean_risk - base_std_risk, base_mean_risk + base_std_risk, color='blue', alpha=0.1)\n",
    "\n",
    "ax1.plot(steps, bpac_mean_risk, label='BPAC (Ours)', color='red', linewidth=2)\n",
    "ax1.fill_between(steps, bpac_mean_risk - bpac_std_risk, bpac_mean_risk + bpac_std_risk, color='red', alpha=0.1)\n",
    "\n",
    "ax1.axhline(y=TARGET_EPSILON, color='green', linestyle='-', linewidth=2, label=f'Target $\\epsilon={TARGET_EPSILON}$')\n",
    "ax1.axvline(x=CALIB_NUM, color='gray', linestyle=':', label='Calibration End')\n",
    "\n",
    "ax1.set_ylabel('Avg Risk')\n",
    "ax1.set_title(f'1. Risk Control (Target $\\epsilon < {TARGET_EPSILON}$)')\n",
    "ax1.legend(loc='upper right')\n",
    "ax1.grid(True, alpha=0.3)\n",
    "ax1.set_ylim(0, TARGET_EPSILON * 3.0) # 聚焦在 Epsilon 附近\n",
    "\n",
    "# === 子图 2: Token Ratio (成本节省) ===\n",
    "ax2.plot(steps, base_mean_token, label='Baseline', color='blue', linestyle='--')\n",
    "ax2.fill_between(steps, base_mean_token - base_std_token, base_mean_token + base_std_token, color='blue', alpha=0.1)\n",
    "\n",
    "ax2.plot(steps, bpac_mean_token, label='BPAC', color='red', linewidth=2)\n",
    "ax2.fill_between(steps, bpac_mean_token - bpac_std_token, bpac_mean_token + bpac_std_token, color='red', alpha=0.1)\n",
    "\n",
    "ax2.axhline(y=1.0, color='black', linestyle=':')\n",
    "ax2.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax2.set_ylabel('Token Ratio')\n",
    "ax2.set_title('2. Cost Efficiency (Lower is Better)')\n",
    "ax2.grid(True, alpha=0.3)\n",
    "\n",
    "# === 子图 3: Expert Call Ratio (专家调用率) ===\n",
    "ax3.plot(steps, base_mean_expert, label='Baseline', color='blue', linestyle='--')\n",
    "ax3.fill_between(steps, base_mean_expert - base_std_expert, base_mean_expert + base_std_expert, color='blue', alpha=0.1)\n",
    "\n",
    "ax3.plot(steps, bpac_mean_expert, label='BPAC', color='red', linewidth=2)\n",
    "ax3.fill_between(steps, bpac_mean_expert - bpac_std_expert, bpac_mean_expert + bpac_std_expert, color='red', alpha=0.1)\n",
    "\n",
    "ax3.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax3.set_ylabel('Expert Call Ratio')\n",
    "ax3.set_title('3. Expert Usage Behavior')\n",
    "ax3.grid(True, alpha=0.3)\n",
    "ax3.set_ylim(0, 1.1)\n",
    "\n",
    "# === 子图 4: Threshold Evolution (阈值变化) ===\n",
    "# Baseline: 前期是0，后期是固定值\n",
    "ax4.plot(steps, base_mean_threshold_seq, label='Baseline $\\hat{u}$', color='blue', linestyle='--')\n",
    "ax4.fill_between(steps, \n",
    "                 base_mean_threshold_seq - base_std_threshold_seq, \n",
    "                 base_mean_threshold_seq + base_std_threshold_seq, \n",
    "                 color='blue', alpha=0.1)\n",
    "\n",
    "# BPAC: 动态变化\n",
    "ax4.plot(steps, bpac_mean_threshold, label='BPAC $u_t$', color='red', linewidth=1.5)\n",
    "ax4.fill_between(steps, \n",
    "                 bpac_mean_threshold - bpac_std_threshold, \n",
    "                 bpac_mean_threshold + bpac_std_threshold, \n",
    "                 color='red', alpha=0.1)\n",
    "\n",
    "ax4.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax4.set_ylabel('Uncertainty Threshold')\n",
    "ax4.set_title('4. Threshold Adaptation (Dynamic vs Fixed)')\n",
    "ax4.legend(loc='upper right')\n",
    "ax4.grid(True, alpha=0.3)\n",
    "ax4.set_ylim(0, 1.0) \n",
    "\n",
    "# === 子图 5: Wealth Evolution (财富积累) ===\n",
    "# 只有 BPAC 有 Wealth\n",
    "ax5.plot(steps, bpac_mean_wealth, label='BPAC Wealth', color='purple', linewidth=1.5)\n",
    "ax5.fill_between(steps, \n",
    "                 bpac_mean_wealth - bpac_std_wealth, \n",
    "                 bpac_mean_wealth + bpac_std_wealth, \n",
    "                 color='purple', alpha=0.1)\n",
    "\n",
    "ax5.axhline(y=1.0, color='black', linestyle='--', label='Initial Wealth (1.0)')\n",
    "ax5.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "# 如果财富增长非常快，建议开启对数坐标\n",
    "# ax5.set_yscale('log') \n",
    "\n",
    "ax5.set_ylabel('Wealth')\n",
    "ax5.set_xlabel('Time Step')\n",
    "ax5.set_title('5. BPAC Wealth Accumulation')\n",
    "ax5.legend(loc='upper left')\n",
    "ax5.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ac898230",
   "metadata": {},
   "source": [
    "# use math and bbh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9142004",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "expert_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/math/qwen3-think.json\")\n",
    "expert_data2 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/math/qwen3-think2648.2648--1.json\")\n",
    "expert_data3 = pd.concat([expert_data1, expert_data2.iloc[1:, :]], ignore_index=True)\n",
    "expert_data_math = expert_data3[expert_data3[\"matched\"] == True]\n",
    "session_ids = list(expert_data_math[\"session_id\"])\n",
    "expert_data_math[\"question\"] = expert_data_math[\"problem\"]\n",
    "\n",
    "instant_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/math/qwen3-ins1.json\")\n",
    "instant_data_math = instant_data1[instant_data1[\"session_id\"].isin(session_ids)]\n",
    "instant_data_math[\"question\"] = instant_data_math[\"problem\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "787b676c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "expert_data3 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-think.json\")\n",
    "expert_data_bbh = expert_data3[expert_data3[\"matched\"] == True]\n",
    "session_ids = list(expert_data_bbh[\"session_id\"])\n",
    "\n",
    "instant_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-ins.json\")\n",
    "instant_data_bbh = instant_data1[instant_data1[\"session_id\"].isin(session_ids)]\n",
    "expert_data_bbh.shape, instant_data_bbh.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efd02b9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "ratio = 0.5          # 删除比例\n",
    "seed = 42            # 随机种子，保证可复现\n",
    "\n",
    "# 找到 matched == True 的索引\n",
    "matched_idx = instant_data_bbh[instant_data_bbh[\"matched\"] == True].index\n",
    "\n",
    "# 要删除的数量\n",
    "n_drop = int(len(matched_idx) * ratio)\n",
    "\n",
    "# 随机选取要删除的索引\n",
    "drop_idx = np.random.RandomState(seed).choice(\n",
    "    matched_idx,\n",
    "    size=n_drop,\n",
    "    replace=False\n",
    ")\n",
    "\n",
    "# 删除\n",
    "instant_data_bbh_reduced = instant_data_bbh.drop(drop_idx)\n",
    "\n",
    "print(\"原始大小:\", instant_data_bbh.shape)\n",
    "print(\"删除后大小:\", instant_data_bbh_reduced.shape)\n",
    "instant_data_bbh[\"matched\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0bcbe407",
   "metadata": {},
   "outputs": [],
   "source": [
    "expert_data = pd.concat([expert_data_math, expert_data_bbh], ignore_index=True)\n",
    "instant_data = pd.concat([instant_data_math, instant_data_bbh_reduced], ignore_index=True)\n",
    "expert_data[\"dataset\"].unique(), instant_data[\"dataset\"].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "934774cd",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9eb86f79",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list =[]\n",
    "selcet_ids =[]\n",
    "for i,row in expert_data.iterrows():\n",
    "    session_id = row['session_id']\n",
    "    if session_id not in instant_data['session_id'].values:\n",
    "        print(f\"Instant data missing at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict = {}\n",
    "    instant_row = instant_data[instant_data['session_id'] == session_id].iloc[0]\n",
    "    selcet_ids.append(session_id)\n",
    "    if instant_row[\"question\"] != row[\"question\"]:\n",
    "        print(f\"Question mismatch at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict['uncertainty'] = 1- instant_row['token_probs'][0]\n",
    "    tmp_dict['instant_correct'] = int(instant_row['matched']) if instant_row['matched'] != \"No answer extracted\" else 0\n",
    "    tmp_dict['expert_correct'] = int(expert_data.loc[i, 'matched'])\n",
    "    tmp_dict['instant_token'] = instant_row['gen_token_count']\n",
    "    tmp_dict['expert_token'] = expert_data.loc[i, 'gen_token_count']\n",
    "    tmp_dict['dataset'] = row['dataset']\n",
    "    data_list.append(tmp_dict)\n",
    "\n",
    "print(f\"Total samples prepared: {len(data_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f40c30fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "split_ind =0\n",
    "for i, item in enumerate(data_list):\n",
    "    if item['dataset'] == 'bbh':\n",
    "        split_ind = i\n",
    "        break\n",
    "split_ind"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47ce6ff4",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list[4629]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10e0730b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm \n",
    "\n",
    "# data_list[:split_ind] 是 Dataset A (MMLU)\n",
    "# data_list[split_ind:] 是 Dataset B (BBH)\n",
    "data_list_A1 = data_list[:split_ind]\n",
    "data_list_A = data_list_A1[:2000]  # 取前 1500 个作为 Dataset A\n",
    "data_list_B = data_list[split_ind:]\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100         # 重复次数\n",
    "TARGET_EPSILON = 0.05   # 目标 Risk\n",
    "TARGET_ALPHA = 0.05     # 置信度\n",
    "CALIB_NUM = 500        # 校准集大小\n",
    "TOTAL_NUM = len(data_list_A) + len(data_list_B) # 所有的题目数量\n",
    "calib_ratio = CALIB_NUM / TOTAL_NUM\n",
    "\n",
    "print(f\"Non-Stationary Setup: Calib on A ({len(data_list_A)}), Test on B ({len(data_list_B)})\")\n",
    "\n",
    "# -----------------------------------------------------------\n",
    "# 1. Baseline 循环 (非平稳数据流)\n",
    "# -----------------------------------------------------------\n",
    "all_risks, all_token_ratios, all_expert_ratios, all_u_hats = [], [], [], []\n",
    "\n",
    "print(f\"Starting Baseline Simulation ({NUM_RUNS} runs)...\")\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # === 关键：分别打乱 A 和 B，然后拼接 ===\n",
    "    # 打乱 A\n",
    "    idx_a = np.arange(len(data_list_A))\n",
    "    rng.shuffle(idx_a)\n",
    "    data_a_shuffled = [data_list_A[i] for i in idx_a]\n",
    "    \n",
    "    # 打乱 B\n",
    "    idx_b = np.arange(len(data_list_B))\n",
    "    rng.shuffle(idx_b)\n",
    "    data_b_shuffled = [data_list_B[i] for i in idx_b]\n",
    "    \n",
    "    # 拼接: 先 A 后 B (模拟分布偏移)\n",
    "    current_stream = data_a_shuffled + data_b_shuffled\n",
    "    \n",
    "    # 提取列表形式\n",
    "    y_solved = [d['expert_correct'] for d in current_stream]\n",
    "    y_hat_solved = [d['instant_correct'] for d in current_stream]\n",
    "    U = [d['uncertainty'] for d in current_stream]\n",
    "    y_tok = [d['expert_token'] for d in current_stream]\n",
    "    y_hat_tok = [d['instant_token'] for d in current_stream]\n",
    "    \n",
    "    # 运行 Baseline (禁止内部 shuffle)\n",
    "    df, u_hat = run_baseline_continuous(\n",
    "        y_solved, y_hat_solved, U, y_tok, y_hat_tok,\n",
    "        calib_ratio=calib_ratio, # 确保刚好覆盖完 Dataset A\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=TARGET_ALPHA,\n",
    "        seed=seed,\n",
    "        shuffle=False # <--- 关键：保持 A->B 的顺序\n",
    "    )\n",
    "    \n",
    "    all_risks.append(df['avg_risk'].values)\n",
    "    all_token_ratios.append(df['token_ratio'].values)\n",
    "    all_expert_ratios.append(df['expert_call_ratio'].values)\n",
    "    all_u_hats.append(u_hat)\n",
    "\n",
    "arr_risks = np.array(all_risks)\n",
    "arr_token_ratios = np.array(all_token_ratios)\n",
    "arr_expert_ratios = np.array(all_expert_ratios)\n",
    "arr_thresholds = np.array(all_u_hats)\n",
    "\n",
    "print(\"\\nSimulation Finished!\")\n",
    "print(f\"Average u_hat across runs: {np.mean(all_u_hats):.4f} (std: {np.std(all_u_hats):.4f})\")\n",
    "\n",
    "# -----------------------------------------------------------\n",
    "# 2. BPAC 循环 (非平稳数据流)\n",
    "# -----------------------------------------------------------\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100        # 重复 100 次\n",
    "WARM_UP_STEPS = 0   # BPAC 的 Warm-up 步数 (对应 Baseline 的校准集大小)\n",
    "TARGET_EPSILON = 0.05  # 目标 Risk\n",
    "CFG_ALPHA= 0.1       # BPAC 参数\n",
    "CFG_RHO = 0      # BPAC 参数\n",
    "BPA_beta = 1\n",
    "CFG_RHO_0 = 0.05\n",
    "CFG_RHO_1 = 0.7\n",
    "CFG_CHANGE = 200\n",
    "\n",
    "bpac_risks, bpac_token_ratios, bpac_expert_ratios = [], [], []\n",
    "bpac_wealths, bpac_thresholds = [], []\n",
    "\n",
    "print(f\"Starting BPAC Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # === 关键：使用相同的逻辑构造数据流 (Data Alignment) ===\n",
    "    # 打乱 A\n",
    "    idx_a = np.arange(len(data_list_A))\n",
    "    rng.shuffle(idx_a)\n",
    "    data_a_shuffled = [data_list_A[i] for i in idx_a]\n",
    "    \n",
    "    # 打乱 B\n",
    "    idx_b = np.arange(len(data_list_B))\n",
    "    rng.shuffle(idx_b)\n",
    "    data_b_shuffled = [data_list_B[i] for i in idx_b]\n",
    "    \n",
    "    # 拼接\n",
    "    current_stream = data_a_shuffled + data_b_shuffled\n",
    "    \n",
    "    # 配置并运行 BPAC\n",
    "    cfg = BPACConfig(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS, # 前 A 个样本作为 Warm-up\n",
    "        num_thresholds=1001, \n",
    "        beta=BPA_beta, \n",
    "        c_clip=0.9,\n",
    "        rho_0=CFG_RHO_0,\n",
    "        rho_1=CFG_RHO_1,\n",
    "        change_point=CFG_CHANGE\n",
    "    )\n",
    "    \n",
    "    df_result, _ = run_simulation(current_stream, cfg)\n",
    "    \n",
    "    bpac_risks.append(df_result['avg_risk'].values)\n",
    "    bpac_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpac_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpac_wealths.append(df_result['wealth'].values)\n",
    "    bpac_thresholds.append(df_result['threshold'].values)\n",
    "\n",
    "bpac_risks_arr = np.array(bpac_risks)\n",
    "bpac_token_ratios_arr = np.array(bpac_token_ratios)\n",
    "bpac_expert_ratios_arr = np.array(bpac_expert_ratios)\n",
    "bpac_wealths_arr = np.array(bpac_wealths)\n",
    "bpac_thresholds_arr = np.array(bpac_thresholds)\n",
    "print(\"BPAC Simulation Finished!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89e3336b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "\n",
    "# ==========================================\n",
    "# 1. 设置阴影类型 (ICML 关键设置)\n",
    "# ==========================================\n",
    "# 选项: \n",
    "# 'std': 标准差 (Mean ± Std) -> 阴影最大，容易超线\n",
    "# 'sem': 标准误 (Mean ± Std/sqrt(N)) -> 阴影更窄，聚焦于均值的准确性 (推荐!)\n",
    "# 'ci':  95% 置信区间 (Bootstrap/t-distribution) -> 严谨的统计学区间\n",
    "# 'percentile': 分位数 (10% - 90%) -> 展示数据分布的真实范围\n",
    "\n",
    "SHADOW_TYPE = 'sem' \n",
    "NUM_RUNS = 100  # 你的实验次数\n",
    "\n",
    "def get_error_bounds(data_array, type='sem'):\n",
    "    \"\"\"根据选择的类型计算上下界\"\"\"\n",
    "    mean_val = np.mean(data_array, axis=0)\n",
    "    \n",
    "    if type == 'std':\n",
    "        std_val = np.std(data_array, axis=0)\n",
    "        lower = mean_val - std_val\n",
    "        upper = mean_val + std_val\n",
    "        \n",
    "    elif type == 'sem':\n",
    "        # 标准误 = Std / sqrt(N)\n",
    "        sem_val = stats.sem(data_array, axis=0)\n",
    "        lower = mean_val - sem_val\n",
    "        upper = mean_val + sem_val\n",
    "        \n",
    "    elif type == 'ci':\n",
    "        # 95% t-distribution 置信区间\n",
    "        sem_val = stats.sem(data_array, axis=0)\n",
    "        # ppf(0.975) 对应双侧 95%\n",
    "        ci_scale = stats.t.ppf(0.975, df=NUM_RUNS-1) \n",
    "        lower = mean_val - sem_val * ci_scale\n",
    "        upper = mean_val + sem_val * ci_scale\n",
    "        \n",
    "    elif type == 'percentile':\n",
    "        # 10% - 90% 分位数 (剔除最极端的异常值)\n",
    "        lower = np.percentile(data_array, 10, axis=0)\n",
    "        upper = np.percentile(data_array, 90, axis=0)\n",
    "        \n",
    "    else:\n",
    "        raise ValueError(\"Unknown shadow type\")\n",
    "        \n",
    "    return mean_val, lower, upper\n",
    "\n",
    "# ==========================================\n",
    "# 2. 计算绘图数据\n",
    "# ==========================================\n",
    "steps = np.arange(bpac_risks_arr.shape[1])\n",
    "\n",
    "# --- 计算 Risk ---\n",
    "base_mean, base_low, base_high = get_error_bounds(arr_risks, SHADOW_TYPE)\n",
    "bpac_mean, bpac_low, bpac_high = get_error_bounds(bpac_risks_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 计算 ECP ---\n",
    "base_ecp_mean, base_ecp_low, base_ecp_high = get_error_bounds(arr_expert_ratios, SHADOW_TYPE)\n",
    "bpac_ecp_mean, bpac_ecp_low, bpac_ecp_high = get_error_bounds(bpac_expert_ratios_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 计算 TCP ---\n",
    "base_tcp_mean, base_tcp_low, base_tcp_high = get_error_bounds(arr_token_ratios, SHADOW_TYPE)\n",
    "bpac_tcp_mean, bpac_tcp_low, bpac_tcp_high = get_error_bounds(bpac_token_ratios_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 物理截断 (Clipping) ---\n",
    "# 无论用什么统计方法，Risk 和 Ratio 都不可能小于 0\n",
    "def clip_bounds(low, high, min_v=0, max_v=None):\n",
    "    low = np.maximum(low, min_v)\n",
    "    if max_v is not None:\n",
    "        high = np.minimum(high, max_v)\n",
    "    return low, high\n",
    "\n",
    "base_low, base_high = clip_bounds(base_low, base_high, 0, 1.0) # Risk 通常<1，也可不设上限\n",
    "bpac_low, bpac_high = clip_bounds(bpac_low, bpac_high, 0, 1.0)\n",
    "\n",
    "base_ecp_low, base_ecp_high = clip_bounds(base_ecp_low, base_ecp_high, 0, 1.0)\n",
    "bpac_ecp_low, bpac_ecp_high = clip_bounds(bpac_ecp_low, bpac_ecp_high, 0, 1.0)\n",
    "\n",
    "base_tcp_low, base_tcp_high = clip_bounds(base_tcp_low, base_tcp_high, 0, None)\n",
    "bpac_tcp_low, bpac_tcp_high = clip_bounds(bpac_tcp_low, bpac_tcp_high, 0, None)\n",
    "\n",
    "\n",
    "# ==========================================\n",
    "# 3. 绘图 (ICML Style)\n",
    "# ==========================================\n",
    "# 设置字体和线宽\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,                # 全局基础字号\n",
    "    'axes.labelsize': 20,           # 轴标签字号 (Risk, ECP, TCP) -> 加大到20\n",
    "    'axes.titlesize': 18,           # 标题字号 (如果你有标题的话)\n",
    "    'xtick.labelsize': 16,          # X轴刻度字号\n",
    "    'ytick.labelsize': 16,          # Y轴刻度字号\n",
    "    'legend.fontsize': 16,          # 图例字号 -> 加大到16\n",
    "    'lines.linewidth': 3.0,         # 线宽加粗，配合大图\n",
    "    'axes.grid': True, \n",
    "    'grid.alpha': 0.3, \n",
    "    'grid.linestyle': '--',\n",
    "    'figure.autolayout': True,\n",
    "    'savefig.dpi': 300\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "# 颜色\n",
    "C_BASE = '#1f77b4'  # Blue\n",
    "C_OURS = '#d62728'  # Red\n",
    "C_TGT = '#2ca02c'   # Green\n",
    "C_WARM = 'gray'\n",
    "\n",
    "# --- (a) Risk ---\n",
    "ax = axes[0]\n",
    "ax.plot(steps, base_mean, color=C_BASE, linestyle='--', label='PAC (Baseline)')\n",
    "ax.fill_between(steps, base_low, base_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_mean, color=C_OURS, linestyle='-', label='B-PAC (Ours)')\n",
    "ax.fill_between(steps, bpac_low, bpac_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=TARGET_EPSILON, color=C_TGT, linestyle='-', linewidth=2, label='Tolerance')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "# ax.set_title('(a) Cumulative Risk')\n",
    "ax.set_ylabel('ER',fontsize=22)\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, TARGET_EPSILON * 2.0) # 视野聚焦\n",
    "# 图例中说明阴影含义\n",
    "if SHADOW_TYPE == 'sem':\n",
    "    shadow_label = 'Shaded: SEM'\n",
    "elif SHADOW_TYPE == 'std':\n",
    "    shadow_label = 'Shaded: Std. Dev.'\n",
    "elif SHADOW_TYPE == 'ci':\n",
    "    shadow_label = 'Shaded: 95% CI'\n",
    "else:\n",
    "    shadow_label = 'Shaded: 10-90% Pctl'\n",
    "    \n",
    "# 图例设置 (加大)\n",
    "from matplotlib.lines import Line2D\n",
    "handles, labels = ax.get_legend_handles_labels()\n",
    "# handles.append(Line2D([0], [0], color='gray', alpha=0.3, linewidth=10)) # 如果不需要阴影说明可注释\n",
    "# labels.append(\"Shaded: SEM\")\n",
    "ax.legend(handles, labels, loc='upper right', frameon=True, framealpha=0.95, fontsize=16)\n",
    "\n",
    "# --- (b) ECP ---\n",
    "ax = axes[1]\n",
    "ax.plot(steps, base_ecp_mean, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_ecp_low, base_ecp_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_ecp_mean, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_ecp_low, bpac_ecp_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "# ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2, label='Warm-up End')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "# ax.set_title('(b) Expert Call Rate (ECP)')\n",
    "ax.set_ylabel('ECP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 1.05)\n",
    "# ax.legend(loc='upper right')\n",
    "\n",
    "# --- (c) TCP ---\n",
    "ax = axes[2]\n",
    "ax.plot(steps, base_tcp_mean, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_tcp_low, base_tcp_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_tcp_mean, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_tcp_low, bpac_tcp_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=1.0, color='black', linestyle='-.', linewidth=1.5, label='Full Expert')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "# ax.set_title('(c) Token Cost Ratio (TCP)')\n",
    "ax.set_ylabel('TP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('unstationary_qwen3ins_logits.pdf', bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "478ea6f6",
   "metadata": {},
   "source": [
    "# use bbh and magpie"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9f67d85",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "expert_data3 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-think.json\")\n",
    "expert_data_bbh = expert_data3[expert_data3[\"matched\"] == True]\n",
    "session_ids = list(expert_data_bbh[\"session_id\"])\n",
    "\n",
    "instant_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-ins.json\")\n",
    "instant_data_bbh = instant_data1[instant_data1[\"session_id\"].isin(session_ids)]\n",
    "expert_data_bbh.shape, instant_data_bbh.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2bfb802f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "ratio = 0.5          # 删除比例\n",
    "seed = 42            # 随机种子，保证可复现\n",
    "\n",
    "# 找到 matched == True 的索引\n",
    "matched_idx = instant_data_bbh[instant_data_bbh[\"matched\"] == True].index\n",
    "\n",
    "# 要删除的数量\n",
    "n_drop = int(len(matched_idx) * ratio)\n",
    "\n",
    "# 随机选取要删除的索引\n",
    "drop_idx = np.random.RandomState(seed).choice(\n",
    "    matched_idx,\n",
    "    size=n_drop,\n",
    "    replace=False\n",
    ")\n",
    "\n",
    "# 删除\n",
    "instant_data_bbh_reduced = instant_data_bbh.drop(drop_idx)\n",
    "\n",
    "print(\"原始大小:\", instant_data_bbh.shape)\n",
    "print(\"删除后大小:\", instant_data_bbh_reduced.shape)\n",
    "instant_data_bbh[\"matched\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0527d28c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "expert  = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs/magpie/qwen3-think.json\")\n",
    "instant = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs/magpie/qwen3-ins.json\")\n",
    "\n",
    "# ======================\n",
    "# 2. 合并（按 index）\n",
    "# ======================\n",
    "df = expert.merge(\n",
    "    instant[[\"gpt4_score\"]],\n",
    "    left_index=True,\n",
    "    right_index=True,\n",
    "    suffixes=(\"_expert\", \"_instant\")\n",
    ")\n",
    "\n",
    "# ======================\n",
    "# 3. 条件 mask\n",
    "# ======================\n",
    "valid_mask = (\n",
    "    df[\"gpt4_score_expert\"].notna() &\n",
    "    df[\"gpt4_score_instant\"].notna()\n",
    ")\n",
    "\n",
    "better_mask = valid_mask & (\n",
    "    df[\"gpt4_score_expert\"] >= df[\"gpt4_score_instant\"]\n",
    ")\n",
    "\n",
    "# ======================\n",
    "# 4. 筛选\n",
    "# ======================\n",
    "df_selected = df.loc[better_mask]\n",
    "\n",
    "# 拆回原 dataframe（保留整行）\n",
    "expert_data_magpie  = expert.loc[df_selected.index]\n",
    "instant_data_magpie = instant.loc[df_selected.index]\n",
    "\n",
    "# ======================\n",
    "# 5. 统计信息\n",
    "# ======================\n",
    "total_valid = valid_mask.sum()\n",
    "num_better  = better_mask.sum()\n",
    "\n",
    "print(f\"Total valid paired samples (after remove NaN): {total_valid}\")\n",
    "print(f\"Cases where Think/Expert score >= Instant score: {num_better}\")\n",
    "print(f\"Ratio: {num_better / total_valid:.4f}\")\n",
    "print(f\"Percentage: {num_better / total_valid * 100:.1f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "177ad0c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "expert_data = pd.concat([expert_data_bbh,expert_data_magpie], ignore_index=True)\n",
    "instant_data = pd.concat([ instant_data_bbh_reduced, instant_data_magpie], ignore_index=True)\n",
    "expert_data[\"dataset\"].unique(), instant_data[\"dataset\"].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28d1b31f",
   "metadata": {},
   "outputs": [],
   "source": [
    "expert_data = pd.concat([expert_data_magpie,expert_data_bbh], ignore_index=True)\n",
    "instant_data = pd.concat([ instant_data_magpie, instant_data_bbh_reduced], ignore_index=True)\n",
    "expert_data[\"dataset\"].unique(), instant_data[\"dataset\"].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07289c6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "instant_data_magpie.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b29fc36",
   "metadata": {},
   "outputs": [],
   "source": [
    "instant_data[\"matched\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f2eaf95",
   "metadata": {},
   "outputs": [],
   "source": [
    "row = instant_data_magpie.iloc[0]\n",
    "row['gpt4_score']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6530f93a",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list =[]\n",
    "selcet_ids =[]\n",
    "for i,row in expert_data.iterrows():\n",
    "    session_id = row['session_id']\n",
    "    if session_id not in instant_data['session_id'].values:\n",
    "        print(f\"Instant data missing at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict = {}\n",
    "    instant_row = instant_data[instant_data['session_id'] == session_id].iloc[0]\n",
    "    selcet_ids.append(session_id)\n",
    "    if instant_row[\"question\"] != row[\"question\"]:\n",
    "        print(f\"Question mismatch at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict['uncertainty'] = 1- instant_row['token_probs'][0]\n",
    "    if instant_row['dataset'] == 'bbh':\n",
    "        tmp_dict['instant_correct'] = int(instant_row['matched']) if instant_row['matched'] != \"No answer extracted\" else 0\n",
    "        tmp_dict['expert_correct'] = int(expert_data.loc[i, 'matched'])\n",
    "    elif instant_row['dataset'] == 'magpie':\n",
    "        tmp_dict[\"instant_score\"] = instant_row['gpt4_score']\n",
    "        tmp_dict[\"expert_score\"] = expert_data.loc[i, 'gpt4_score']\n",
    "    tmp_dict['instant_token'] = instant_row['gen_token_count']\n",
    "    tmp_dict['expert_token'] = expert_data.loc[i, 'gen_token_count']\n",
    "    tmp_dict['dataset'] = row['dataset']\n",
    "    data_list.append(tmp_dict)\n",
    "\n",
    "print(f\"Total samples prepared: {len(data_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06734843",
   "metadata": {},
   "outputs": [],
   "source": [
    "split_ind =0\n",
    "for i, item in enumerate(data_list):\n",
    "    if item['dataset'] == 'magpie':\n",
    "        split_ind = i\n",
    "        break\n",
    "split_ind"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a822aae",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list[3022]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e8dde2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm \n",
    "\n",
    "# data_list[:split_ind] 是 Dataset A (MMLU)\n",
    "# data_list[split_ind:] 是 Dataset B (BBH)\n",
    "data_list_A1 = data_list[:split_ind]\n",
    "data_list_A = data_list_A1[:100]  # 取前 1500 个作为 Dataset A\n",
    "data_list_B = data_list[split_ind:]\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100         # 重复次数\n",
    "TARGET_EPSILON = 0.08   # 目标 Risk\n",
    "TARGET_ALPHA = 0.1     # 置信度\n",
    "CALIB_NUM = 100        # 校准集大小\n",
    "TOTAL_NUM = len(data_list_A) + len(data_list_B) # 所有的题目数量\n",
    "calib_ratio = CALIB_NUM / TOTAL_NUM\n",
    "\n",
    "print(f\"Non-Stationary Setup: Calib on A ({len(data_list_A)}), Test on B ({len(data_list_B)})\")\n",
    "\n",
    "# -----------------------------------------------------------\n",
    "# 1. Baseline 循环 (非平稳数据流)\n",
    "# -----------------------------------------------------------\n",
    "all_risks, all_token_ratios, all_expert_ratios, all_u_hats = [], [], [], []\n",
    "\n",
    "print(f\"Starting Baseline Simulation ({NUM_RUNS} runs)...\")\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # === 关键：分别打乱 A 和 B，然后拼接 ===\n",
    "    # 打乱 A\n",
    "    idx_a = np.arange(len(data_list_A))\n",
    "    rng.shuffle(idx_a)\n",
    "    data_a_shuffled = [data_list_A[i] for i in idx_a]\n",
    "    \n",
    "    # 打乱 B\n",
    "    idx_b = np.arange(len(data_list_B))\n",
    "    rng.shuffle(idx_b)\n",
    "    data_b_shuffled = [data_list_B[i] for i in idx_b]\n",
    "    \n",
    "    # 拼接: 先 A 后 B (模拟分布偏移)\n",
    "    current_stream = data_a_shuffled + data_b_shuffled\n",
    "    \n",
    "    # 提取列表形式\n",
    "    # y_solved = [d['expert_correct'] for d in current_stream]\n",
    "    # y_hat_solved = [d['instant_correct'] for d in current_stream]\n",
    "    # U = [d['uncertainty'] for d in current_stream]\n",
    "    # y_tok = [d['expert_token'] for d in current_stream]\n",
    "    # y_hat_tok = [d['instant_token'] for d in current_stream]\n",
    "    # print(current_stream[0])\n",
    "    # 运行 Baseline (禁止内部 shuffle)\n",
    "    df, u_hat = run_baseline_continuous(\n",
    "        current_stream,\n",
    "        calib_ratio=calib_ratio, # 确保刚好覆盖完 Dataset A\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=TARGET_ALPHA,\n",
    "        seed=seed,\n",
    "        shuffle=False # <--- 关键：保持 A->B 的顺序\n",
    "    )\n",
    "    \n",
    "    all_risks.append(df['avg_risk'].values)\n",
    "    all_token_ratios.append(df['token_ratio'].values)\n",
    "    all_expert_ratios.append(df['expert_call_ratio'].values)\n",
    "    all_u_hats.append(u_hat)\n",
    "\n",
    "arr_risks = np.array(all_risks)\n",
    "arr_token_ratios = np.array(all_token_ratios)\n",
    "arr_expert_ratios = np.array(all_expert_ratios)\n",
    "arr_thresholds = np.array(all_u_hats)\n",
    "\n",
    "print(\"\\nSimulation Finished!\")\n",
    "print(f\"Average u_hat across runs: {np.mean(all_u_hats):.4f} (std: {np.std(all_u_hats):.4f})\")\n",
    "\n",
    "# -----------------------------------------------------------\n",
    "# 2. BPAC 循环 (非平稳数据流)\n",
    "# -----------------------------------------------------------\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100        # 重复 100 次\n",
    "WARM_UP_STEPS = 0   # BPAC 的 Warm-up 步数 (对应 Baseline 的校准集大小)\n",
    "TARGET_EPSILON = 0.08 # 目标 Risk\n",
    "CFG_ALPHA= 0.1       # BPAC 参数\n",
    "CFG_RHO = 0      # BPAC 参数\n",
    "BPA_beta = 1\n",
    "CFG_RHO_0 = 0.05\n",
    "CFG_RHO_1 = 0.7\n",
    "CFG_CHANGE = 200\n",
    "\n",
    "bpac_risks, bpac_token_ratios, bpac_expert_ratios = [], [], []\n",
    "bpac_wealths, bpac_thresholds = [], []\n",
    "\n",
    "print(f\"Starting BPAC Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # === 关键：使用相同的逻辑构造数据流 (Data Alignment) ===\n",
    "    # 打乱 A\n",
    "    idx_a = np.arange(len(data_list_A))\n",
    "    rng.shuffle(idx_a)\n",
    "    data_a_shuffled = [data_list_A[i] for i in idx_a]\n",
    "    \n",
    "    # 打乱 B\n",
    "    idx_b = np.arange(len(data_list_B))\n",
    "    rng.shuffle(idx_b)\n",
    "    data_b_shuffled = [data_list_B[i] for i in idx_b]\n",
    "    \n",
    "    # 拼接\n",
    "    current_stream = data_a_shuffled + data_b_shuffled\n",
    "    \n",
    "    # 配置并运行 BPAC\n",
    "    cfg = BPACConfig(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS, # 前 A 个样本作为 Warm-up\n",
    "        num_thresholds=1001, \n",
    "        beta=BPA_beta, \n",
    "        c_clip=0.9,\n",
    "        rho_0=CFG_RHO_0,\n",
    "        rho_1=CFG_RHO_1,\n",
    "        change_point=CFG_CHANGE\n",
    "    )\n",
    "    \n",
    "    df_result, _ = run_simulation(current_stream, cfg)\n",
    "    \n",
    "    bpac_risks.append(df_result['avg_risk'].values)\n",
    "    bpac_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpac_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpac_wealths.append(df_result['wealth'].values)\n",
    "    bpac_thresholds.append(df_result['threshold'].values)\n",
    "\n",
    "bpac_risks_arr = np.array(bpac_risks)\n",
    "bpac_token_ratios_arr = np.array(bpac_token_ratios)\n",
    "bpac_expert_ratios_arr = np.array(bpac_expert_ratios)\n",
    "bpac_wealths_arr = np.array(bpac_wealths)\n",
    "bpac_thresholds_arr = np.array(bpac_thresholds)\n",
    "print(\"BPAC Simulation Finished!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f965ca7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Baseline 数据\n",
    "base_mean_risk = np.mean(arr_risks, axis=0)\n",
    "base_std_risk = np.std(arr_risks, axis=0)\n",
    "\n",
    "base_mean_token = np.mean(arr_token_ratios, axis=0)\n",
    "base_std_token = np.std(arr_token_ratios, axis=0)\n",
    "\n",
    "base_mean_expert = np.mean(arr_expert_ratios, axis=0)\n",
    "base_std_expert = np.std(arr_expert_ratios, axis=0)\n",
    "\n",
    "# 获取总时间步数\n",
    "N_STEPS = bpac_risks_arr.shape[1] \n",
    "\n",
    "# A. 重构 Baseline Threshold 序列 (因为你只存了最终的一个 u_hat)\n",
    "# 逻辑：在校准期 (前 CALIB_NUM 步) 阈值设为 0，之后设为 u_hat\n",
    "# arr_thresholds 是 (100,) 的标量数组\n",
    "base_threshold_seqs = np.zeros((NUM_RUNS, N_STEPS))\n",
    "for i in range(NUM_RUNS):\n",
    "    base_threshold_seqs[i, :CALIB_NUM] = 0.0  # 校准期阈值通常视为0 (强制Action=1)\n",
    "    base_threshold_seqs[i, CALIB_NUM:] = arr_thresholds[i] # 测试期应用 u_hat\n",
    "\n",
    "base_mean_threshold_seq = np.mean(base_threshold_seqs, axis=0)\n",
    "base_std_threshold_seq = np.std(base_threshold_seqs, axis=0)\n",
    "\n",
    "\n",
    "# BPAC 数据\n",
    "bpac_mean_risk = np.mean(bpac_risks_arr, axis=0)\n",
    "bpac_std_risk = np.std(bpac_risks_arr, axis=0)\n",
    "\n",
    "bpac_mean_token = np.mean(bpac_token_ratios_arr, axis=0)\n",
    "bpac_std_token = np.std(bpac_token_ratios_arr, axis=0)\n",
    "\n",
    "bpac_mean_expert = np.mean(bpac_expert_ratios_arr, axis=0)\n",
    "bpac_std_expert = np.std(bpac_expert_ratios_arr, axis=0)\n",
    "\n",
    "# C. 确保其他 BPAC 统计量已准备好 (以防万一)\n",
    "bpac_mean_threshold = np.mean(bpac_thresholds_arr, axis=0)\n",
    "bpac_std_threshold = np.std(bpac_thresholds_arr, axis=0)\n",
    "\n",
    "# B. 计算 BPAC Wealth 统计 (你存了 bpac_wealths_arr 但还没算均值)\n",
    "bpac_mean_wealth = np.mean(bpac_wealths_arr, axis=0)\n",
    "bpac_std_wealth = np.std(bpac_wealths_arr, axis=0)\n",
    "\n",
    "\n",
    "fig, axes = plt.subplots(5, 1, figsize=(12, 24), sharex=True)\n",
    "(ax1, ax2, ax3, ax4, ax5) = axes\n",
    "\n",
    "# X轴\n",
    "steps = np.arange(N_STEPS)\n",
    "\n",
    "# === 子图 1: Average Risk (风险控制) ===\n",
    "ax1.plot(steps, base_mean_risk, label='Baseline', color='blue', linestyle='--')\n",
    "ax1.fill_between(steps, base_mean_risk - base_std_risk, base_mean_risk + base_std_risk, color='blue', alpha=0.1)\n",
    "\n",
    "ax1.plot(steps, bpac_mean_risk, label='BPAC (Ours)', color='red', linewidth=2)\n",
    "ax1.fill_between(steps, bpac_mean_risk - bpac_std_risk, bpac_mean_risk + bpac_std_risk, color='red', alpha=0.1)\n",
    "\n",
    "ax1.axhline(y=TARGET_EPSILON, color='green', linestyle='-', linewidth=2, label=f'Target $\\epsilon={TARGET_EPSILON}$')\n",
    "ax1.axvline(x=CALIB_NUM, color='gray', linestyle=':', label='Calibration End')\n",
    "\n",
    "ax1.set_ylabel('Avg Risk')\n",
    "ax1.set_title(f'1. Risk Control (Target $\\epsilon < {TARGET_EPSILON}$)')\n",
    "ax1.legend(loc='upper right')\n",
    "ax1.grid(True, alpha=0.3)\n",
    "ax1.set_ylim(0, TARGET_EPSILON * 3.0) # 聚焦在 Epsilon 附近\n",
    "\n",
    "# === 子图 2: Token Ratio (成本节省) ===\n",
    "ax2.plot(steps, base_mean_token, label='Baseline', color='blue', linestyle='--')\n",
    "ax2.fill_between(steps, base_mean_token - base_std_token, base_mean_token + base_std_token, color='blue', alpha=0.1)\n",
    "\n",
    "ax2.plot(steps, bpac_mean_token, label='BPAC', color='red', linewidth=2)\n",
    "ax2.fill_between(steps, bpac_mean_token - bpac_std_token, bpac_mean_token + bpac_std_token, color='red', alpha=0.1)\n",
    "\n",
    "ax2.axhline(y=1.0, color='black', linestyle=':')\n",
    "ax2.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax2.set_ylabel('Token Ratio')\n",
    "ax2.set_title('2. Cost Efficiency (Lower is Better)')\n",
    "ax2.grid(True, alpha=0.3)\n",
    "\n",
    "# === 子图 3: Expert Call Ratio (专家调用率) ===\n",
    "ax3.plot(steps, base_mean_expert, label='Baseline', color='blue', linestyle='--')\n",
    "ax3.fill_between(steps, base_mean_expert - base_std_expert, base_mean_expert + base_std_expert, color='blue', alpha=0.1)\n",
    "\n",
    "ax3.plot(steps, bpac_mean_expert, label='BPAC', color='red', linewidth=2)\n",
    "ax3.fill_between(steps, bpac_mean_expert - bpac_std_expert, bpac_mean_expert + bpac_std_expert, color='red', alpha=0.1)\n",
    "\n",
    "ax3.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax3.set_ylabel('Expert Call Ratio')\n",
    "ax3.set_title('3. Expert Usage Behavior')\n",
    "ax3.grid(True, alpha=0.3)\n",
    "ax3.set_ylim(0, 1.1)\n",
    "\n",
    "# === 子图 4: Threshold Evolution (阈值变化) ===\n",
    "# Baseline: 前期是0，后期是固定值\n",
    "ax4.plot(steps, base_mean_threshold_seq, label='Baseline $\\hat{u}$', color='blue', linestyle='--')\n",
    "ax4.fill_between(steps, \n",
    "                 base_mean_threshold_seq - base_std_threshold_seq, \n",
    "                 base_mean_threshold_seq + base_std_threshold_seq, \n",
    "                 color='blue', alpha=0.1)\n",
    "\n",
    "# BPAC: 动态变化\n",
    "ax4.plot(steps, bpac_mean_threshold, label='BPAC $u_t$', color='red', linewidth=1.5)\n",
    "ax4.fill_between(steps, \n",
    "                 bpac_mean_threshold - bpac_std_threshold, \n",
    "                 bpac_mean_threshold + bpac_std_threshold, \n",
    "                 color='red', alpha=0.1)\n",
    "\n",
    "ax4.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax4.set_ylabel('Uncertainty Threshold')\n",
    "ax4.set_title('4. Threshold Adaptation (Dynamic vs Fixed)')\n",
    "ax4.legend(loc='upper right')\n",
    "ax4.grid(True, alpha=0.3)\n",
    "ax4.set_ylim(0, 1.0) \n",
    "\n",
    "# === 子图 5: Wealth Evolution (财富积累) ===\n",
    "# 只有 BPAC 有 Wealth\n",
    "ax5.plot(steps, bpac_mean_wealth, label='BPAC Wealth', color='purple', linewidth=1.5)\n",
    "ax5.fill_between(steps, \n",
    "                 bpac_mean_wealth - bpac_std_wealth, \n",
    "                 bpac_mean_wealth + bpac_std_wealth, \n",
    "                 color='purple', alpha=0.1)\n",
    "\n",
    "ax5.axhline(y=1.0, color='black', linestyle='--', label='Initial Wealth (1.0)')\n",
    "ax5.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "# 如果财富增长非常快，建议开启对数坐标\n",
    "# ax5.set_yscale('log') \n",
    "\n",
    "ax5.set_ylabel('Wealth')\n",
    "ax5.set_xlabel('Time Step')\n",
    "ax5.set_title('5. BPAC Wealth Accumulation')\n",
    "ax5.legend(loc='upper left')\n",
    "ax5.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27eeae96",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "\n",
    "# ==========================================\n",
    "# 1. 设置阴影类型 (ICML 关键设置)\n",
    "# ==========================================\n",
    "# 选项: \n",
    "# 'std': 标准差 (Mean ± Std) -> 阴影最大，容易超线\n",
    "# 'sem': 标准误 (Mean ± Std/sqrt(N)) -> 阴影更窄，聚焦于均值的准确性 (推荐!)\n",
    "# 'ci':  95% 置信区间 (Bootstrap/t-distribution) -> 严谨的统计学区间\n",
    "# 'percentile': 分位数 (10% - 90%) -> 展示数据分布的真实范围\n",
    "\n",
    "SHADOW_TYPE = 'std' \n",
    "NUM_RUNS = 100  # 你的实验次数\n",
    "\n",
    "def get_error_bounds(data_array, type='sem',scale=1):\n",
    "    \"\"\"根据选择的类型计算上下界\"\"\"\n",
    "    mean_val = np.mean(data_array, axis=0)\n",
    "    \n",
    "    if type == 'std':\n",
    "        std_val = np.std(data_array, axis=0)\n",
    "        lower = mean_val - std_val\n",
    "        upper = mean_val + std_val\n",
    "        \n",
    "    return mean_val*scale, lower*scale, upper*scale\n",
    "\n",
    "# ==========================================\n",
    "# 2. 计算绘图数据\n",
    "# ==========================================\n",
    "steps = np.arange(bpac_risks_arr.shape[1])\n",
    "\n",
    "# --- 计算 Risk ---\n",
    "base_mean, base_low, base_high = get_error_bounds(arr_risks, SHADOW_TYPE)\n",
    "bpac_mean, bpac_low, bpac_high = get_error_bounds(bpac_risks_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 计算 ECP ---\n",
    "base_ecp_mean, base_ecp_low, base_ecp_high = get_error_bounds(arr_expert_ratios, SHADOW_TYPE,scale=100)\n",
    "bpac_ecp_mean, bpac_ecp_low, bpac_ecp_high = get_error_bounds(bpac_expert_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "\n",
    "# --- 计算 TCP ---\n",
    "base_tcp_mean, base_tcp_low, base_tcp_high = get_error_bounds(arr_token_ratios, SHADOW_TYPE,scale=100)\n",
    "bpac_tcp_mean, bpac_tcp_low, bpac_tcp_high = get_error_bounds(bpac_token_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "\n",
    "# --- 物理截断 (Clipping) ---\n",
    "# 无论用什么统计方法，Risk 和 Ratio 都不可能小于 0\n",
    "def clip_bounds(low, high, min_v=0, max_v=None):\n",
    "    low = np.maximum(low, min_v)\n",
    "    if max_v is not None:\n",
    "        high = np.minimum(high, max_v)\n",
    "    return low, high\n",
    "\n",
    "# base_low, base_high = clip_bounds(base_low, base_high, 0, 1.0) # Risk 通常<1，也可不设上限\n",
    "# bpac_low, bpac_high = clip_bounds(bpac_low, bpac_high, 0, 1.0)\n",
    "\n",
    "# base_ecp_low, base_ecp_high = clip_bounds(base_ecp_low, base_ecp_high, 0, 1.0)\n",
    "# bpac_ecp_low, bpac_ecp_high = clip_bounds(bpac_ecp_low, bpac_ecp_high, 0, 1.0)\n",
    "\n",
    "# base_tcp_low, base_tcp_high = clip_bounds(base_tcp_low, base_tcp_high, 0, None)\n",
    "# bpac_tcp_low, bpac_tcp_high = clip_bounds(bpac_tcp_low, bpac_tcp_high, 0, None)\n",
    "\n",
    "\n",
    "# ==========================================\n",
    "# 3. 绘图 (ICML Style)\n",
    "# ==========================================\n",
    "# 设置字体和线宽\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,                # 全局基础字号\n",
    "    'axes.labelsize': 20,           # 轴标签字号 (Risk, ECP, TCP) -> 加大到20\n",
    "    'axes.titlesize': 18,           # 标题字号 (如果你有标题的话)\n",
    "    'xtick.labelsize': 16,          # X轴刻度字号\n",
    "    'ytick.labelsize': 16,          # Y轴刻度字号\n",
    "    'legend.fontsize': 16,          # 图例字号 -> 加大到16\n",
    "    'lines.linewidth': 3.0,         # 线宽加粗，配合大图\n",
    "    'axes.grid': True, \n",
    "    'grid.alpha': 0.3, \n",
    "    'grid.linestyle': '--',\n",
    "    'figure.autolayout': True,\n",
    "    'savefig.dpi': 300\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "# 颜色\n",
    "C_BASE = '#1f77b4'  # Blue\n",
    "C_OURS = '#d62728'  # Red\n",
    "C_TGT = '#2ca02c'   # Green\n",
    "C_WARM = 'gray'\n",
    "\n",
    "# --- (a) Risk ---\n",
    "ax = axes[0]\n",
    "ax.plot(steps, base_mean, color=C_BASE, linestyle='--', label='PAC (Baseline)')\n",
    "ax.fill_between(steps, base_low, base_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_mean, color=C_OURS, linestyle='-', label='B-PAC (Ours)')\n",
    "ax.fill_between(steps, bpac_low, bpac_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=TARGET_EPSILON, color=C_TGT, linestyle='-', linewidth=2, label='Tolerance')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "# ax.set_title('(a) Cumulative Risk')\n",
    "ax.set_ylabel('ER',fontsize=22)\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 0.16) # 视野聚焦\n",
    "ax.yaxis.set_major_locator(MultipleLocator(0.04))\n",
    "\n",
    "# 图例中说明阴影含义\n",
    "if SHADOW_TYPE == 'sem':\n",
    "    shadow_label = 'Shaded: SEM'\n",
    "elif SHADOW_TYPE == 'std':\n",
    "    shadow_label = 'Shaded: Std. Dev.'\n",
    "elif SHADOW_TYPE == 'ci':\n",
    "    shadow_label = 'Shaded: 95% CI'\n",
    "else:\n",
    "    shadow_label = 'Shaded: 10-90% Pctl'\n",
    "    \n",
    "# 图例设置 (加大)\n",
    "from matplotlib.lines import Line2D\n",
    "handles, labels = ax.get_legend_handles_labels()\n",
    "handles.append(Line2D([0], [0], color='gray', alpha=0.3, linewidth=10)) # 如果不需要阴影说明可注释\n",
    "# labels.append(\"Shaded: SEM\")\n",
    "ax.legend(handles, labels, loc='upper right', frameon=True, framealpha=0.95, fontsize=16)\n",
    "\n",
    "# --- (b) ECP ---\n",
    "ax = axes[1]\n",
    "ax.plot(steps, base_ecp_mean, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_ecp_low, base_ecp_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_ecp_mean, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_ecp_low, bpac_ecp_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "# ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2, label='Warm-up End')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "# ax.set_title('(b) Expert Call Rate (ECP)')\n",
    "ax.set_ylabel('ECP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 100)\n",
    "# ax.legend(loc='upper right')\n",
    "\n",
    "# --- (c) TCP ---\n",
    "ax = axes[2]\n",
    "ax.plot(steps, base_tcp_mean, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_tcp_low, base_tcp_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_tcp_mean, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_tcp_low, bpac_tcp_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=100, color='black', linestyle='-.', linewidth=1.5, label='Full Expert')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "# ax.set_title('(c) Token Cost Ratio (TCP)')\n",
    "ax.set_ylabel('TP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('qwen3ins_bbh_magpie.pdf', bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8dcba9d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "vllm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
