{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2de27248",
   "metadata": {},
   "source": [
    "# 算法框架"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a7a5d21d",
   "metadata": {},
   "source": [
    "## ours"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19e88c96",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from dataclasses import dataclass\n",
    "from typing import List, Optional, Tuple, Dict\n",
    "\n",
    "@dataclass\n",
    "class BPACConfig:\n",
    "    \"\"\"\n",
    "    B-PAC 算法超参数配置\n",
    "    \"\"\"\n",
    "    alpha: float = 0.1          # 容错概率 (1-Confidence), 例如 0.1 代表 90% 置信度\n",
    "    epsilon: float = 0.1      # 容忍的风险上限 (Error Tolerance), 例如 0.05 代表允许 5% 的性能损失\n",
    "    rho: float = 0.1           # 最小探索概率 (Minimum Exploration Probability),0到1\n",
    "    beta: float = 1.0           # FTRL 正则化参数,0到无穷\n",
    "    c_clip: float = 0.9         # 投注截断常数，0到1\n",
    "    num_thresholds: int = 1001   # 阈值搜索空间的精细度\n",
    "    warm_up: int = 50          # 初始预热步数\n",
    "    rho_0: float = 0.05\n",
    "    rho_1: float = 0.6\n",
    "    change_point: int = 200\n",
    "\n",
    "def compute_step_loss(y_correct_t: int, y_hat_correct_t: int) -> float:\n",
    "    \"\"\"\n",
    "    单步 Loss 计算，对应你提供的逻辑：\n",
    "    Loss = 1 当且仅当 (专家对 AND 小模型错)\n",
    "    \"\"\"\n",
    "    # y_correct_t: 1 if expert is correct, 0 else\n",
    "    # y_hat_correct_t: 1 if instant model is correct, 0 else\n",
    "    \n",
    "    weak_wrong = 1 - y_hat_correct_t\n",
    "    # 只有 strong 正确 & weak 错误 时才记为 1\n",
    "    loss = float(y_correct_t * weak_wrong)\n",
    "    return loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f3d82a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "class BPAC:\n",
    "    \"\"\"Online threshold selector driven by a per-threshold wealth process.\n",
    "\n",
    "    A wealth value is kept for every candidate threshold on a uniform grid\n",
    "    over [0, 1]. After each update the deployed threshold is the largest\n",
    "    grid point such that it and all smaller grid points have wealth of at\n",
    "    least 1 / alpha; if there is none, the threshold falls back to 0.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, config: BPACConfig):\n",
    "        self.cfg = config\n",
    "        n_grid = self.cfg.num_thresholds\n",
    "        self.threshold_candidates = np.linspace(0, 1, n_grid)\n",
    "\n",
    "        # Start from the first grid point (u = 0).\n",
    "        self.current_u_idx = 0\n",
    "        self.current_u = self.threshold_candidates[self.current_u_idx]\n",
    "        # Initial wealth K_0 = 1 for every candidate.\n",
    "        self.wealth = np.ones(n_grid)\n",
    "\n",
    "        # Running sums of payoffs and squared payoffs (FTRL statistics).\n",
    "        self.sum_D = np.zeros(n_grid)\n",
    "        self.sum_D_sq = np.zeros(n_grid)\n",
    "\n",
    "    def get_action(self, uncertainty_score: float):\n",
    "        \"\"\"Decide whether to call the expert for one item.\n",
    "\n",
    "        Policy: pi_t = I(U >= u) + rho * I(U < u).\n",
    "\n",
    "        Returns:\n",
    "            (action, propensity): action is 1 (expert) or 0 (instant);\n",
    "            propensity is the probability with which the expert is chosen.\n",
    "        \"\"\"\n",
    "        if uncertainty_score >= self.current_u:\n",
    "            # At or above the threshold: the expert is always called.\n",
    "            return 1, 1.0\n",
    "\n",
    "        # Below the threshold: use the instant model, but explore with\n",
    "        # probability rho by calling the expert anyway.\n",
    "        explore_prob = self.cfg.rho\n",
    "        action = 1 if np.random.rand() < self.cfg.rho else 0\n",
    "        return action, explore_prob\n",
    "\n",
    "    def update(self, uncertainty_score: float, action: int, observed_loss: Optional[float]):\n",
    "        \"\"\"Update wealth and threshold from one round of bandit feedback.\n",
    "\n",
    "        Args:\n",
    "            uncertainty_score: Uncertainty of the item just processed.\n",
    "            action: 1 if the expert was called, else 0.\n",
    "            observed_loss: Loss seen by the algorithm, or None when the\n",
    "                expert was not called (treated as 0).\n",
    "        \"\"\"\n",
    "        cfg = self.cfg\n",
    "\n",
    "        # 1. Feedback: a missing loss contributes 0 to the payoff.\n",
    "        loss = 0.0 if observed_loss is None else observed_loss\n",
    "\n",
    "        # 2. I(U_t < u) for every candidate u, and the propensity pi_t under\n",
    "        #    the currently deployed threshold.\n",
    "        below_candidate = (uncertainty_score < self.threshold_candidates).astype(float)\n",
    "        pi_t = cfg.rho if uncertainty_score < self.current_u else 1.0\n",
    "\n",
    "        # 3. Payoff D_t(u) = epsilon - (1 - rho_0) * l_t * xi_t * I(U < u) / pi_t.\n",
    "        weighted_loss = (1-cfg.rho_0)*(loss * action * below_candidate) / pi_t\n",
    "        payoff = cfg.epsilon - weighted_loss\n",
    "\n",
    "        # 4. Bet size from the running sums, clipped to [0, c_clip / M_t].\n",
    "        denom = self.sum_D_sq + cfg.beta\n",
    "        denom[denom == 0] = 1e-9  # guard against division by zero\n",
    "        raw_bet = self.sum_D / denom\n",
    "        payoff_range = max(cfg.epsilon, ((1.0-cfg.rho_0)/cfg.rho)-cfg.epsilon)\n",
    "        bet = np.clip(raw_bet, 0, cfg.c_clip / payoff_range)\n",
    "\n",
    "        # 5. Wealth and statistics update (the bet uses pre-update sums).\n",
    "        self.wealth = self.wealth * (1.0 + bet * payoff)\n",
    "        self.sum_D += payoff\n",
    "        self.sum_D_sq += (payoff ** 2)\n",
    "\n",
    "        # 6. Threshold selection: longest prefix of the grid whose wealth is\n",
    "        #    at least 1 / alpha everywhere.\n",
    "        safe_prefix = np.logical_and.accumulate(self.wealth >= (1.0 / cfg.alpha))\n",
    "        safe_idx = np.where(safe_prefix)[0]\n",
    "\n",
    "        if len(safe_idx) > 0:\n",
    "            self.current_u_idx = safe_idx[-1]\n",
    "            self.current_u = self.threshold_candidates[self.current_u_idx]\n",
    "        else:\n",
    "            # Fall back to the safest choice (always call the expert).\n",
    "            self.current_u_idx = 0\n",
    "            self.current_u = 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f38617fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_simulation(data_sequence: List[Dict], config: BPACConfig):\n",
    "    \"\"\"Run B-PAC over a stream of items and log per-step metrics.\n",
    "\n",
    "    Args:\n",
    "        data_sequence: Items with keys 'uncertainty', 'instant_correct',\n",
    "            'expert_correct', 'instant_token' and 'expert_token'.\n",
    "        config: Hyper-parameters. The exploration probability is scheduled\n",
    "            per step (rho_1 before change_point, rho_0 from then on) on a\n",
    "            private shallow copy, so the caller's object is not modified.\n",
    "\n",
    "    Returns:\n",
    "        (logs, model): a DataFrame with one row per step at or after\n",
    "        warm_up, and the final BPAC instance.\n",
    "    \"\"\"\n",
    "    import copy  # local import: only needed to protect the caller's config\n",
    "\n",
    "    # The rho schedule below overwrites cfg.rho on every step; run it on a\n",
    "    # copy so the write does not leak into the config owned by the caller.\n",
    "    model = BPAC(copy.copy(config))\n",
    "    logs = []\n",
    "    warm_up = config.warm_up\n",
    "\n",
    "    # Running totals (only accumulated after warm-up).\n",
    "    total_actual_tokens = 0\n",
    "    total_baseline_tokens = 0\n",
    "    cumulative_loss = 0\n",
    "    expert_calls = 0\n",
    "\n",
    "    for t, item in enumerate(data_sequence):\n",
    "        # 1. Unpack the item.\n",
    "        u_t = item['uncertainty']\n",
    "        inst_corr = item['instant_correct']\n",
    "        exp_corr = item['expert_correct']\n",
    "        inst_tok = item['instant_token']\n",
    "        exp_tok = item['expert_token']\n",
    "\n",
    "        # 2. Exploration schedule, then the routing decision.\n",
    "        if t < model.cfg.change_point:\n",
    "            model.cfg.rho = model.cfg.rho_1\n",
    "        else:\n",
    "            model.cfg.rho = model.cfg.rho_0\n",
    "        action, _ = model.get_action(u_t)\n",
    "\n",
    "        # 3. Losses.\n",
    "        # True loss: oracle view, used for evaluation only.\n",
    "        true_loss = compute_step_loss(exp_corr, inst_corr)\n",
    "        # Observed loss: bandit feedback, visible only when the expert ran.\n",
    "        observed_loss = true_loss if action == 1 else None\n",
    "\n",
    "        # 4. Algorithm update (also during warm-up; only logging is skipped).\n",
    "        model.update(u_t, action, observed_loss)\n",
    "        if t < warm_up:\n",
    "            continue\n",
    "\n",
    "        # 5. Token accounting. Baseline = always call the expert.\n",
    "        # action 0 -> instant only; action 1 -> instant + expert (cascade).\n",
    "        if action == 1:\n",
    "            step_actual_tokens = inst_tok + exp_tok\n",
    "            expert_calls += 1\n",
    "        else:\n",
    "            step_actual_tokens = inst_tok\n",
    "        total_actual_tokens += step_actual_tokens\n",
    "        total_baseline_tokens += exp_tok\n",
    "\n",
    "        steps_logged = t - warm_up + 1\n",
    "        # Cumulative token ratio, guarded against a zero baseline.\n",
    "        current_token_ratio = total_actual_tokens / total_baseline_tokens if total_baseline_tokens > 0 else 1.0\n",
    "        current_expert_ratio = expert_calls / steps_logged\n",
    "        # Risk is only incurred when the instant model's answer is used.\n",
    "        if action == 0:\n",
    "            cumulative_loss += true_loss\n",
    "        current_avg_risk = cumulative_loss / steps_logged\n",
    "\n",
    "        # Wealth at the threshold selected by the update above.\n",
    "        wealth = model.wealth[model.current_u_idx]\n",
    "\n",
    "        # 6. Log the step.\n",
    "        logs.append({\n",
    "            \"step\": t,\n",
    "            \"uncertainty\": u_t,\n",
    "            \"threshold\": model.current_u,\n",
    "            \"action\": action,\n",
    "            \"true_loss\": true_loss,\n",
    "            \"observed_loss\": observed_loss if observed_loss is not None else np.nan,\n",
    "            \"avg_risk\": current_avg_risk,\n",
    "            \"token_ratio\": current_token_ratio,\n",
    "            \"expert_call_ratio\": current_expert_ratio,\n",
    "            \"wealth\": wealth\n",
    "        })\n",
    "\n",
    "    return pd.DataFrame(logs), model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ec013b1c",
   "metadata": {},
   "source": [
    "## Online naive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b216623",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from dataclasses import dataclass\n",
    "from typing import List, Optional, Tuple, Dict\n",
    "\n",
    "@dataclass\n",
    "class BPACConfignaive:\n",
    "    \"\"\"\n",
    "    B-PAC 算法超参数配置\n",
    "    \"\"\"\n",
    "    alpha: float = 0.1          # 容错概率 (1-Confidence), 例如 0.1 代表 90% 置信度\n",
    "    epsilon: float = 0.1      # 容忍的风险上限 (Error Tolerance), 例如 0.05 代表允许 5% 的性能损失\n",
    "    rho: float = 0.1           # 最小探索概率 (Minimum Exploration Probability),0到1\n",
    "    num_thresholds: int = 1001   # 阈值搜索空间的精细度\n",
    "    warm_up: int = 50          # 初始预热步数\n",
    "\n",
    "def compute_step_loss(y_correct_t: int, y_hat_correct_t: int) -> float:\n",
    "    \"\"\"\n",
    "    单步 Loss 计算，对应你提供的逻辑：\n",
    "    Loss = 1 当且仅当 (专家对 AND 小模型错)\n",
    "    \"\"\"\n",
    "    # y_correct_t: 1 if expert is correct, 0 else\n",
    "    # y_hat_correct_t: 1 if instant model is correct, 0 else\n",
    "    \n",
    "    weak_wrong = 1 - y_hat_correct_t\n",
    "    # 只有 strong 正确 & weak 错误 时才记为 1\n",
    "    loss = float(y_correct_t * weak_wrong)\n",
    "    return loss\n",
    "\n",
    "class Onaive:\n",
    "    \"\"\"Naive online baseline for threshold selection.\n",
    "\n",
    "    Keeps, for every candidate threshold u on a uniform grid over [0, 1],\n",
    "    the running average of xi * l * I(U < u) and deploys the largest u whose\n",
    "    average is at most epsilon. No importance weighting is applied.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, config: BPACConfignaive):\n",
    "        self.cfg = config\n",
    "        self.threshold_candidates = np.linspace(0, 1, self.cfg.num_thresholds)\n",
    "\n",
    "        # Start from the first grid point (u = 0).\n",
    "        self.current_u_idx = 0\n",
    "        self.current_u = self.threshold_candidates[self.current_u_idx]\n",
    "        # Number of update() calls so far. It starts at 0 because update()\n",
    "        # increments it before dividing; starting at 1 made the average\n",
    "        # divide by n + 1 after n rounds and under-estimate the risk.\n",
    "        self.t = 0\n",
    "        self.sum_risk_terms = np.zeros(self.cfg.num_thresholds)\n",
    "\n",
    "    def get_action(self, uncertainty_score: float):\n",
    "        \"\"\"Decide whether to call the expert for one item.\n",
    "\n",
    "        Policy: pi_t = I(U >= u) + rho * I(U < u).\n",
    "\n",
    "        Returns:\n",
    "            (action, propensity): action is 1 (expert) or 0 (instant);\n",
    "            propensity is the probability with which the expert is chosen.\n",
    "        \"\"\"\n",
    "        if uncertainty_score >= self.current_u:\n",
    "            # At or above the threshold: the expert is always called.\n",
    "            return 1, 1.0\n",
    "\n",
    "        # Below the threshold: use the instant model, but explore with\n",
    "        # probability rho by calling the expert anyway.\n",
    "        propensity = self.cfg.rho\n",
    "        action = 1 if np.random.rand() < self.cfg.rho else 0\n",
    "        return action, propensity\n",
    "\n",
    "    def update(self, uncertainty_score: float, action: int, observed_loss: Optional[float]):\n",
    "        \"\"\"Update the risk estimate and the deployed threshold.\n",
    "\n",
    "        R_hat(u) = (1 / t) * Sum(xi * l * I(U < u)), with t the number of\n",
    "        rounds observed so far (including this one).\n",
    "\n",
    "        Args:\n",
    "            uncertainty_score: Uncertainty of the item just processed.\n",
    "            action: 1 if the expert was called, else 0.\n",
    "            observed_loss: Loss seen by the algorithm, or None when the\n",
    "                expert was not called (treated as 0).\n",
    "        \"\"\"\n",
    "        self.t += 1\n",
    "\n",
    "        # Scalar part xi * l: non-zero only when the expert was called and a\n",
    "        # loss was observed.\n",
    "        l_t = observed_loss if observed_loss is not None else 0.0\n",
    "        scalar_term = action * l_t\n",
    "\n",
    "        # Vector part I(U_t < u): only candidates above the current\n",
    "        # uncertainty can accumulate risk on this round.\n",
    "        indicator_less = (uncertainty_score < self.threshold_candidates).astype(float)\n",
    "        self.sum_risk_terms += (scalar_term * indicator_less)\n",
    "\n",
    "        # Average risk per candidate.\n",
    "        estimated_risk = self.sum_risk_terms / self.t\n",
    "\n",
    "        # Threshold selection: max { u : R_hat(u) <= epsilon }.\n",
    "        valid_indices = np.where(estimated_risk <= self.cfg.epsilon)[0]\n",
    "\n",
    "        if len(valid_indices) > 0:\n",
    "            # Greedily take the largest admissible threshold.\n",
    "            self.current_u_idx = valid_indices[-1]\n",
    "            self.current_u = self.threshold_candidates[self.current_u_idx]\n",
    "        else:\n",
    "            # No candidate meets the budget: fall back to always-expert.\n",
    "            self.current_u_idx = 0\n",
    "            self.current_u = 0.0\n",
    "        \n",
    "\n",
    "def run_simulation_onaive(data_sequence: List[Dict], config: BPACConfignaive):\n",
    "    \"\"\"Run the online naive baseline over a stream and log per-step metrics.\n",
    "\n",
    "    Args:\n",
    "        data_sequence: Items with keys 'uncertainty', 'instant_correct',\n",
    "            'expert_correct', 'instant_token' and 'expert_token'.\n",
    "        config: Hyper-parameters of the baseline.\n",
    "\n",
    "    Returns:\n",
    "        (logs, model): a DataFrame with one row per step at or after\n",
    "        warm_up, and the final Onaive instance.\n",
    "    \"\"\"\n",
    "    model = Onaive(config)\n",
    "    warm_up = config.warm_up\n",
    "    records = []\n",
    "\n",
    "    # Running totals (only accumulated after warm-up).\n",
    "    actual_tokens_total = 0\n",
    "    baseline_tokens_total = 0\n",
    "    loss_total = 0\n",
    "    n_expert_calls = 0\n",
    "\n",
    "    for t, item in enumerate(data_sequence):\n",
    "        # Unpack the item.\n",
    "        u_t = item['uncertainty']\n",
    "        inst_corr = item['instant_correct']\n",
    "        exp_corr = item['expert_correct']\n",
    "        inst_tok = item['instant_token']\n",
    "        exp_tok = item['expert_token']\n",
    "\n",
    "        action, _propensity = model.get_action(u_t)\n",
    "\n",
    "        # True loss is the oracle view; the algorithm only observes it when\n",
    "        # the expert was actually called.\n",
    "        true_loss = compute_step_loss(exp_corr, inst_corr)\n",
    "        observed_loss = true_loss if action == 1 else None\n",
    "\n",
    "        # The model learns during warm-up too; only logging is skipped.\n",
    "        model.update(u_t, action, observed_loss)\n",
    "        if t < warm_up:\n",
    "            continue\n",
    "\n",
    "        # Token accounting. Baseline = always call the expert.\n",
    "        # action 0 -> instant only; action 1 -> instant + expert (cascade).\n",
    "        if action == 1:\n",
    "            n_expert_calls += 1\n",
    "            actual_tokens_total += inst_tok + exp_tok\n",
    "        else:\n",
    "            actual_tokens_total += inst_tok\n",
    "        baseline_tokens_total += exp_tok\n",
    "\n",
    "        # Cumulative token ratio, guarded against a zero baseline.\n",
    "        if baseline_tokens_total > 0:\n",
    "            token_ratio = actual_tokens_total / baseline_tokens_total\n",
    "        else:\n",
    "            token_ratio = 1.0\n",
    "\n",
    "        steps_logged = t - warm_up + 1\n",
    "        expert_ratio = n_expert_calls / steps_logged\n",
    "        # Risk is only incurred when the instant model's answer is used.\n",
    "        if action == 0:\n",
    "            loss_total += true_loss\n",
    "        avg_risk = loss_total / steps_logged\n",
    "\n",
    "        records.append({\n",
    "            \"step\": t,\n",
    "            \"uncertainty\": u_t,\n",
    "            \"threshold\": model.current_u,\n",
    "            \"action\": action,\n",
    "            \"true_loss\": true_loss,\n",
    "            \"observed_loss\": np.nan if observed_loss is None else observed_loss,\n",
    "            \"avg_risk\": avg_risk,\n",
    "            \"token_ratio\": token_ratio,\n",
    "            \"expert_call_ratio\": expert_ratio,\n",
    "            \"wealth\": 0\n",
    "        })\n",
    "\n",
    "    return pd.DataFrame(records), model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "be39c262",
   "metadata": {},
   "source": [
    "## IPS+HOFF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3efd1681",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from typing import List, Dict, Optional\n",
    "\n",
    "@dataclass\n",
    "class BPACConfigIPS:\n",
    "    \"\"\"\n",
    "    B-PAC 算法超参数配置\n",
    "    \"\"\"\n",
    "    alpha: float = 0.1          # 容错概率 (1-Confidence), 例如 0.1 代表 90% 置信度\n",
    "    epsilon: float = 0.1      # 容忍的风险上限 (Error Tolerance), 例如 0.05 代表允许 5% 的性能损失\n",
    "    rho: float = 0.1           # 最小探索概率 (Minimum Exploration Probability),0到1\n",
    "    num_thresholds: int = 1001   # 阈值搜索空间的精细度\n",
    "    warm_up: int = 50          # 初始预热步数\n",
    "\n",
    "class IPSHoeffding:\n",
    "    \"\"\"Threshold selector based on an IPS estimate plus a Hoeffding-style penalty.\n",
    "\n",
    "    For every candidate threshold on a uniform grid over [0, 1] it keeps the\n",
    "    running mean of a scaled inverse-propensity loss estimate and deploys the\n",
    "    largest candidate whose mean plus penalty is at most epsilon.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, config: BPACConfigIPS):\n",
    "        self.cfg = config\n",
    "        grid_size = self.cfg.num_thresholds\n",
    "        self.threshold_candidates = np.linspace(0, 1, grid_size)\n",
    "\n",
    "        # Start from the first grid point (u = 0).\n",
    "        self.current_u_idx = 0\n",
    "        self.current_u = self.threshold_candidates[self.current_u_idx]\n",
    "\n",
    "        # Number of updates seen and cumulative IPS estimate per candidate.\n",
    "        self.time_step = 0\n",
    "        self.sum_Z = np.zeros(grid_size)\n",
    "\n",
    "        # Range constant M_tilde = (1 - rho) / rho used by the penalty term.\n",
    "        self.M_tilde = (1.0 - self.cfg.rho) / self.cfg.rho\n",
    "\n",
    "        # Number of candidates; stored but not used by update() as written.\n",
    "        self.N_thresholds = grid_size\n",
    "\n",
    "    def get_action(self, uncertainty_score: float):\n",
    "        \"\"\"Decide whether to call the expert for one item.\n",
    "\n",
    "        Policy: pi_t = I(U >= u) + rho * I(U < u).\n",
    "\n",
    "        Returns:\n",
    "            (action, propensity): action is 1 (expert) or 0 (instant);\n",
    "            propensity is the probability with which the expert is chosen.\n",
    "        \"\"\"\n",
    "        if uncertainty_score >= self.current_u:\n",
    "            # At or above the threshold: the expert is always called.\n",
    "            return 1, 1.0\n",
    "\n",
    "        # Below the threshold: exploratory expert call with probability rho.\n",
    "        explore_prob = self.cfg.rho\n",
    "        action = 1 if np.random.rand() < self.cfg.rho else 0\n",
    "        return action, explore_prob\n",
    "\n",
    "    def update(self, uncertainty_score: float, action: int, observed_loss: Optional[float]):\n",
    "        \"\"\"Update the IPS statistics and re-select the threshold.\n",
    "\n",
    "        Args:\n",
    "            uncertainty_score: Uncertainty of the item just processed.\n",
    "            action: 1 if the expert was called, else 0.\n",
    "            observed_loss: Loss seen by the algorithm, or None when the\n",
    "                expert was not called (treated as 0).\n",
    "        \"\"\"\n",
    "        self.time_step += 1\n",
    "        t = self.time_step\n",
    "\n",
    "        # 1. Feedback: a missing loss contributes 0.\n",
    "        loss = 0.0 if observed_loss is None else observed_loss\n",
    "\n",
    "        # 2. I(U_t < u) per candidate, and the propensity that generated the\n",
    "        #    data (rho below the deployed threshold, 1 otherwise).\n",
    "        below_candidate = (uncertainty_score < self.threshold_candidates).astype(float)\n",
    "        pi_t_val = self.cfg.rho if uncertainty_score < self.current_u else 1.0\n",
    "\n",
    "        # 3. Scaled IPS term Z_t(u) = (1 - rho) * l_t * xi_t * I(U < u) / pi_t.\n",
    "        Z_t = (1.0 - self.cfg.rho) * (loss * action * below_candidate) / pi_t_val\n",
    "        self.sum_Z += Z_t\n",
    "\n",
    "        # 4. Upper confidence bound: running mean plus penalty.\n",
    "        #    alpha_t = 6 * alpha / (pi^2 * t^2); the 1e-9 keeps the log finite.\n",
    "        alpha_t = (6 * self.cfg.alpha) / (np.pi**2 * t**2)\n",
    "        penalty = self.M_tilde * np.sqrt(np.log(1 / alpha_t + 1e-9) / (2 * t))\n",
    "        ucb = self.sum_Z / t + penalty\n",
    "\n",
    "        # 5. Largest candidate whose bound stays within the risk budget.\n",
    "        admissible = np.where(ucb <= self.cfg.epsilon)[0]\n",
    "\n",
    "        if len(admissible) > 0:\n",
    "            self.current_u_idx = admissible[-1]\n",
    "            self.current_u = self.threshold_candidates[self.current_u_idx]\n",
    "        else:\n",
    "            # Fall back to the safest choice (u = 0, always call the expert).\n",
    "            self.current_u_idx = 0\n",
    "            self.current_u = 0.0\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from dataclasses import dataclass\n",
    "from typing import List, Optional, Tuple, Dict\n",
    "\n",
    "def compute_step_loss(y_correct_t: int, y_hat_correct_t: int) -> float:\n",
    "    \"\"\"\n",
    "    单步 Loss 计算，对应你提供的逻辑：\n",
    "    Loss = 1 当且仅当 (专家对 AND 小模型错)\n",
    "    \"\"\"\n",
    "    # y_correct_t: 1 if expert is correct, 0 else\n",
    "    # y_hat_correct_t: 1 if instant model is correct, 0 else\n",
    "    \n",
    "    weak_wrong = 1 - y_hat_correct_t\n",
    "    # 只有 strong 正确 & weak 错误 时才记为 1\n",
    "    loss = float(y_correct_t * weak_wrong)\n",
    "    return loss\n",
    "\n",
    "def run_simulation_ips(data_sequence: List[Dict], config: BPACConfigIPS):\n",
    "    \"\"\"Run the IPS + Hoeffding baseline over a stream and log per-step metrics.\n",
    "\n",
    "    Args:\n",
    "        data_sequence: Items with keys 'uncertainty', 'instant_correct',\n",
    "            'expert_correct', 'instant_token' and 'expert_token'.\n",
    "        config: Hyper-parameters of the baseline (a config object, not an\n",
    "            IPSHoeffding model; the model is built from it here).\n",
    "\n",
    "    Returns:\n",
    "        (logs, model): a DataFrame with one row per step at or after\n",
    "        warm_up, and the final IPSHoeffding instance.\n",
    "    \"\"\"\n",
    "    model = IPSHoeffding(config)\n",
    "    logs = []\n",
    "    warm_up = config.warm_up\n",
    "\n",
    "    # Running totals (only accumulated after warm-up).\n",
    "    total_actual_tokens = 0\n",
    "    total_baseline_tokens = 0\n",
    "    cumulative_loss = 0\n",
    "    expert_calls = 0\n",
    "\n",
    "    for t, item in enumerate(data_sequence):\n",
    "        # 1. Unpack the item.\n",
    "        u_t = item['uncertainty']\n",
    "        inst_corr = item['instant_correct']\n",
    "        exp_corr = item['expert_correct']\n",
    "        inst_tok = item['instant_token']\n",
    "        exp_tok = item['expert_token']\n",
    "\n",
    "        # 2. Routing decision.\n",
    "        action, _ = model.get_action(u_t)\n",
    "\n",
    "        # 3. Losses.\n",
    "        # True loss: oracle view, used for evaluation only.\n",
    "        true_loss = compute_step_loss(exp_corr, inst_corr)\n",
    "        # Observed loss: bandit feedback, visible only when the expert ran.\n",
    "        observed_loss = true_loss if action == 1 else None\n",
    "\n",
    "        # 4. Algorithm update (also during warm-up; only logging is skipped).\n",
    "        model.update(u_t, action, observed_loss)\n",
    "        if t < warm_up:\n",
    "            continue\n",
    "\n",
    "        # 5. Token accounting. Baseline = always call the expert.\n",
    "        # action 0 -> instant only; action 1 -> instant + expert (cascade).\n",
    "        if action == 1:\n",
    "            step_actual_tokens = inst_tok + exp_tok\n",
    "            expert_calls += 1\n",
    "        else:\n",
    "            step_actual_tokens = inst_tok\n",
    "        total_actual_tokens += step_actual_tokens\n",
    "        total_baseline_tokens += exp_tok\n",
    "\n",
    "        steps_logged = t - warm_up + 1\n",
    "        # Cumulative token ratio, guarded against a zero baseline.\n",
    "        current_token_ratio = total_actual_tokens / total_baseline_tokens if total_baseline_tokens > 0 else 1.0\n",
    "        current_expert_ratio = expert_calls / steps_logged\n",
    "        # Risk is only incurred when the instant model's answer is used.\n",
    "        if action == 0:\n",
    "            cumulative_loss += true_loss\n",
    "        current_avg_risk = cumulative_loss / steps_logged\n",
    "\n",
    "        # 6. Log the step (this baseline has no wealth process).\n",
    "        logs.append({\n",
    "            \"step\": t,\n",
    "            \"uncertainty\": u_t,\n",
    "            \"threshold\": model.current_u,\n",
    "            \"action\": action,\n",
    "            \"true_loss\": true_loss,\n",
    "            \"observed_loss\": observed_loss if observed_loss is not None else np.nan,\n",
    "            \"avg_risk\": current_avg_risk,\n",
    "            \"token_ratio\": current_token_ratio,\n",
    "            \"expert_call_ratio\": current_expert_ratio,\n",
    "            \"wealth\": 0\n",
    "        })\n",
    "\n",
    "    return pd.DataFrame(logs), model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9597c0bc",
   "metadata": {},
   "source": [
    "## base"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd6e6d32",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy.stats import norm\n",
    "\n",
    "def compute_pac_loss(y_correct, y_hat_correct, gt_anss=None):\n",
    "    \"\"\"\n",
    "    计算二元 Loss (Eq. 9 in paper)\n",
    "    gt_anss 在这里只是占位，因为 y_correct 和 y_hat_correct 已经是 0/1 了\n",
    "    \"\"\"\n",
    "    y_correct = np.array(y_correct, dtype=float)\n",
    "    y_hat_correct = np.array(y_hat_correct, dtype=float)\n",
    "    \n",
    "    # 只有当：专家对(1) 且 小模型错(0) 时，Loss = 1\n",
    "    weak_wrong = 1.0 - y_hat_correct\n",
    "    loss = (y_correct * weak_wrong)\n",
    "    return loss\n",
    "\n",
    "def pac_labeling_two_stage(\n",
    "    y_solved_calib, y_hat_solved_calib, U_calib, gt_anss_calib,\n",
    "    y_solved_test, y_hat_solved_test, U_test, gt_anss_test,\n",
    "    epsilon=0.1, alpha=0.05,\n",
    "    m=500, pi=0.5, grid_size=200, seed=None\n",
    "):\n",
    "    \"\"\"Two-stage PAC labeling: calibrate a threshold, then apply it.\n",
    "\n",
    "    Calibration stage: ``m`` calibration items are drawn with replacement,\n",
    "    each is audited with probability ``pi`` (audited losses are weighted by\n",
    "    1 / pi), and the bound mean + z * std / sqrt(m) on the weighted loss of\n",
    "    items with uncertainty <= u is evaluated at every calibration\n",
    "    uncertainty u. ``u_hat`` is the largest u whose bound is <= ``epsilon``;\n",
    "    when none qualifies, the smallest calibration uncertainty is used.\n",
    "\n",
    "    Test stage: items with uncertainty >= ``u_hat`` take the expert's value,\n",
    "    the others take the instant model's value.\n",
    "\n",
    "    Args:\n",
    "        y_solved_calib, y_hat_solved_calib, U_calib: Calibration set (expert\n",
    "            flags, instant-model flags, uncertainties).\n",
    "        gt_anss_calib: Forwarded to compute_pac_loss as an object array.\n",
    "        y_solved_test, y_hat_solved_test, U_test: Test set, same layout.\n",
    "        gt_anss_test: Accepted but not used here.\n",
    "        epsilon: Risk budget for the confidence bound.\n",
    "        alpha: Tail probability; z = norm.ppf(1 - alpha).\n",
    "        m: Number of calibration draws.\n",
    "        pi: Audit probability of each draw.\n",
    "        grid_size: Accepted but not used here.\n",
    "        seed: Seed for the random generator (None = non-deterministic).\n",
    "\n",
    "    Returns:\n",
    "        (u_hat, Y_tilde_solved_test, diagnostics).\n",
    "    \"\"\"\n",
    "    # default_rng(None) seeds from fresh entropy, same as passing no argument.\n",
    "    rng = np.random.default_rng(seed)\n",
    "\n",
    "    # -------- calibration stage --------\n",
    "    calib_expert = np.array(y_solved_calib, dtype=object)\n",
    "    calib_instant = np.array(y_hat_solved_calib, dtype=object)\n",
    "    U_c = np.asarray(U_calib, dtype=float)\n",
    "    calib_gt = np.array(gt_anss_calib, dtype=object)\n",
    "    calib_loss = compute_pac_loss(calib_expert, calib_instant, calib_gt)\n",
    "\n",
    "    # Draw order is part of the seeded behaviour: indices first, audits second.\n",
    "    drawn = rng.choice(len(calib_expert), size=m, replace=True)\n",
    "    U_s = U_c[drawn]\n",
    "    loss_s = calib_loss[drawn]\n",
    "\n",
    "    # Bernoulli(pi) audit with inverse-probability weights.\n",
    "    audited = (rng.random(m) < pi).astype(float)\n",
    "    weights = audited / pi\n",
    "\n",
    "    z = norm.ppf(1 - alpha)\n",
    "\n",
    "    def upper_conf_bound(u):\n",
    "        # Weighted loss restricted to drawn items with uncertainty <= u.\n",
    "        in_scope = (U_s <= u).astype(float)\n",
    "        X = loss_s * in_scope * weights\n",
    "        spread = X.std(ddof=1) if X.size > 1 else 0.0\n",
    "        return X.mean() + z * spread / np.sqrt(X.size)\n",
    "\n",
    "    # Candidate thresholds are the sorted calibration uncertainties.\n",
    "    u_grid = np.sort(U_c)\n",
    "    ucbs = np.array([upper_conf_bound(u) for u in u_grid])\n",
    "    passing = np.where(ucbs <= epsilon)[0]\n",
    "    pick = passing.max() if passing.size > 0 else 0\n",
    "    u_hat = float(u_grid[pick])\n",
    "\n",
    "    # -------- test stage --------\n",
    "    test_expert = np.array(y_solved_test, dtype=object)\n",
    "    test_instant = np.array(y_hat_solved_test, dtype=object)\n",
    "    U_t = np.asarray(U_test, dtype=float)\n",
    "\n",
    "    route_to_expert = U_t >= u_hat\n",
    "    Y_tilde_solved_test = np.where(route_to_expert, test_expert, test_instant)\n",
    "    call_expert_idx = np.where(route_to_expert)[0]\n",
    "\n",
    "    diagnostics = {\n",
    "        \"u_hat\": u_hat,\n",
    "        \"calib_u_grid\": u_grid,\n",
    "        \"calib_ucbs\": ucbs,\n",
    "        \"epsilon\": epsilon,\n",
    "        \"alpha\": alpha,\n",
    "        \"m\": m,\n",
    "        \"pi\": pi,\n",
    "        \"z\": z,\n",
    "        \"expert_call_nums\": route_to_expert.sum(),\n",
    "        \"call_expert_idx\": call_expert_idx\n",
    "    }\n",
    "\n",
    "    return u_hat, Y_tilde_solved_test, diagnostics\n",
    "\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "def split_calib_test(y, calib_ratio=None, calib_num=None, seed=None):\n",
    "    \"\"\"\n",
    "    Randomly split sample indices into a calibration set and a test set.\n",
    "\n",
    "    Exactly one of `calib_ratio` / `calib_num` must be given.\n",
    "\n",
    "    Args:\n",
    "        y: any sized collection; only `len(y)` is used.\n",
    "        calib_ratio: fraction of samples used for calibration, strictly in (0, 1).\n",
    "        calib_num: number of calibration samples, a positive integer\n",
    "            (NumPy integer scalars are accepted as well).\n",
    "        seed: seed for `np.random.default_rng`.\n",
    "\n",
    "    Returns:\n",
    "        (calib_idx, test_idx): two disjoint index arrays that together cover\n",
    "        `range(len(y))`.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: on invalid arguments, or when the calibration set would be\n",
    "            empty or would leave no test samples.\n",
    "    \"\"\"\n",
    "    # -------- argument validation --------\n",
    "    if calib_ratio is None and calib_num is None:\n",
    "        raise ValueError(\"必须指定 calib_ratio 或 calib_num 其中之一\")\n",
    "\n",
    "    if calib_ratio is not None and calib_num is not None:\n",
    "        raise ValueError(\"calib_ratio 和 calib_num 不能同时指定\")\n",
    "\n",
    "    if calib_ratio is not None and not (0 < calib_ratio < 1):\n",
    "        raise ValueError(\"calib_ratio 必须在 (0, 1) 之间\")\n",
    "\n",
    "    if calib_num is not None:\n",
    "        # np.integer covers sizes that come out of NumPy arithmetic (e.g. np.int64).\n",
    "        if not isinstance(calib_num, (int, np.integer)) or calib_num <= 0:\n",
    "            raise ValueError(\"calib_num 必须是正整数\")\n",
    "\n",
    "    rng = np.random.default_rng(seed)\n",
    "\n",
    "    n = len(y)\n",
    "\n",
    "    # -------- calibration size --------\n",
    "    if calib_ratio is not None:\n",
    "        calib_size = int(n * calib_ratio)\n",
    "    else:\n",
    "        calib_size = int(calib_num)\n",
    "\n",
    "    # A tiny ratio on a small data set truncates to zero; refuse the empty split\n",
    "    # instead of returning a calibration set nothing can be fitted on.\n",
    "    if calib_size < 1:\n",
    "        raise ValueError(\"calibration set 不能为空\")\n",
    "\n",
    "    if calib_size >= n:\n",
    "        raise ValueError(\"calibration set 的大小必须小于数据总量\")\n",
    "\n",
    "    # -------- shuffle and split --------\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "\n",
    "    calib_idx = indices[:calib_size]\n",
    "    test_idx = indices[calib_size:]\n",
    "\n",
    "    return calib_idx, test_idx\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy.stats import norm\n",
    "\n",
    "def run_baseline_continuous(\n",
    "    y_solved, y_hat_solved, U, y_token, y_hat_token,\n",
    "    calib_ratio=0.5, epsilon=0.05, alpha=0.05, seed=42,\n",
    "    pi=0.5, m=None\n",
    "):\n",
    "    \"\"\"\n",
    "    Simulate the two-stage PAC baseline as one continuous stream.\n",
    "\n",
    "    The samples are shuffled with `seed`. The first `calib_size` shuffled samples\n",
    "    form the calibration phase (the expert is always called and the step loss is\n",
    "    logged as 0); the remaining samples form the test phase, where the expert is\n",
    "    called only when the uncertainty is at least the fitted threshold `u_hat`.\n",
    "\n",
    "    Args:\n",
    "        y_solved: per-sample expert correctness (1 = correct).\n",
    "        y_hat_solved: per-sample instant-model correctness (1 = correct).\n",
    "        U: per-sample uncertainty scores.\n",
    "        y_token: per-sample expert token counts.\n",
    "        y_hat_token: per-sample instant-model token counts.\n",
    "        calib_ratio: fraction of the stream used for calibration.\n",
    "        epsilon: level the upper confidence bound must not exceed.\n",
    "        alpha: the bound uses the normal quantile `norm.ppf(1 - alpha)`.\n",
    "        seed: seed of the single generator that drives the shuffle, the\n",
    "            resampling and the audit draws.\n",
    "        pi: audit probability in (0, 1]; losses are weighted by `1 / pi`.\n",
    "        m: number of calibration draws with replacement; None means the\n",
    "            calibration-set size.\n",
    "\n",
    "    Returns:\n",
    "        (logs, u_hat): a DataFrame with one row per step (phase, action, losses,\n",
    "        running risk, token ratio, expert-call ratio) and the fitted threshold.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the calibration set would be empty or larger than the\n",
    "            data, if `m < 1`, or if `pi` is outside (0, 1].\n",
    "    \"\"\"\n",
    "    # -------------------------------------------------------------------------\n",
    "    # 1. Data preparation: shuffle, calibration samples first, test samples after\n",
    "    # -------------------------------------------------------------------------\n",
    "    n = len(y_solved)\n",
    "    # int(n * calib_ratio) can land one below the intended size when the ratio was\n",
    "    # derived as k / n (floating-point round trip such as 499.99999999999994);\n",
    "    # the small tolerance keeps the requested size.\n",
    "    calib_size = int(np.floor(n * calib_ratio + 1e-9))\n",
    "    if not 1 <= calib_size <= n:\n",
    "        raise ValueError('calib_ratio must give a calibration size between 1 and len(y_solved)')\n",
    "    if not (0 < pi <= 1):\n",
    "        raise ValueError('pi must be in (0, 1]')\n",
    "\n",
    "    rng = np.random.default_rng(seed)\n",
    "\n",
    "    # The shuffled order already places the calibration samples first.\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "\n",
    "    def reorder(arr):\n",
    "        return [arr[i] for i in indices]\n",
    "\n",
    "    y_s = reorder(y_solved)       # expert correctness\n",
    "    y_h = reorder(y_hat_solved)   # instant-model correctness\n",
    "    u_s_list = reorder(U)         # uncertainty (list)\n",
    "    tok_s = reorder(y_token)      # expert tokens\n",
    "    tok_h = reorder(y_hat_token)  # instant-model tokens\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    # 2. Fit the fixed threshold u_hat on the calibration samples\n",
    "    # -------------------------------------------------------------------------\n",
    "    y_solved_c = np.array(y_s[:calib_size], dtype=object)\n",
    "    y_hat_solved_c = np.array(y_h[:calib_size], dtype=object)\n",
    "    U_c = np.asarray(u_s_list[:calib_size], dtype=float)\n",
    "    # Placeholder ground-truth answers: compute_pac_loss takes a third argument.\n",
    "    gt_anss_c = np.array([None] * calib_size, dtype=object)\n",
    "\n",
    "    n_c = len(y_solved_c)\n",
    "\n",
    "    # Without an explicit m, draw as many points as the calibration set holds.\n",
    "    if m is None:\n",
    "        m = n_c\n",
    "    if m < 1:\n",
    "        raise ValueError('m must be at least 1')\n",
    "\n",
    "    loss_full_c = compute_pac_loss(y_solved_c, y_hat_solved_c, gt_anss_c)\n",
    "\n",
    "    # Resample with replacement.\n",
    "    sample_idx = rng.choice(n_c, size=m, replace=True)\n",
    "    U_s_sampled = U_c[sample_idx]\n",
    "    loss_s_sampled = loss_full_c[sample_idx]\n",
    "\n",
    "    # Bernoulli(pi) audit with inverse-probability weights.\n",
    "    phi = (rng.random(m) < pi).astype(float)\n",
    "    weights = phi / pi\n",
    "\n",
    "    z = norm.ppf(1 - alpha)\n",
    "\n",
    "    def upper_conf_bound(u):\n",
    "        # Weighted loss of the draws whose uncertainty is at most u.\n",
    "        mask = (U_s_sampled <= u).astype(float)\n",
    "        X = loss_s_sampled * mask * weights\n",
    "        mean = X.mean()\n",
    "        std = X.std(ddof=1) if X.size > 1 else 0.0\n",
    "        return mean + z * std / np.sqrt(X.size)\n",
    "\n",
    "    # The grid is the sorted calibration uncertainties.\n",
    "    u_grid = np.sort(U_c)\n",
    "    ucbs = np.array([upper_conf_bound(u) for u in u_grid])\n",
    "\n",
    "    # Largest u whose bound stays within epsilon.\n",
    "    ok_idx = np.where(ucbs <= epsilon)[0]\n",
    "    if ok_idx.size > 0:\n",
    "        u_hat = float(u_grid[ok_idx.max()])\n",
    "    else:\n",
    "        # Nothing passes: fall back to the smallest calibration uncertainty.\n",
    "        u_hat = float(u_grid[0])\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    # 3. Continuous stream simulation (build the logs)\n",
    "    # -------------------------------------------------------------------------\n",
    "    logs = []\n",
    "\n",
    "    total_actual_tokens = 0\n",
    "    total_baseline_tokens = 0\n",
    "    cumulative_loss = 0\n",
    "    expert_calls = 0\n",
    "\n",
    "    for t in range(n):\n",
    "        cur_u = u_s_list[t]\n",
    "        cur_y_exp = y_s[t]\n",
    "        cur_y_inst = y_h[t]\n",
    "        cur_tok_exp = tok_s[t]\n",
    "        cur_tok_inst = tok_h[t]\n",
    "\n",
    "        is_in_calibration = (t < calib_size)\n",
    "\n",
    "        if is_in_calibration:\n",
    "            # Calibration phase: the expert is always called, the threshold is a\n",
    "            # placeholder and the step loss is logged as 0.\n",
    "            action = 1\n",
    "            threshold = 0.0\n",
    "            true_loss = 0.0\n",
    "            observed_loss = 0.0\n",
    "        else:\n",
    "            # Test phase: route to the expert when the uncertainty reaches u_hat.\n",
    "            threshold = u_hat\n",
    "            action = 1 if cur_u >= u_hat else 0\n",
    "\n",
    "            if action == 1:\n",
    "                true_loss = 0.0\n",
    "                observed_loss = 0.0\n",
    "            else:\n",
    "                # Expert skipped: loss is 1 only when the expert is right (1) and\n",
    "                # the instant model is wrong (0); it is not observed.\n",
    "                true_loss = float(cur_y_exp == 1 and cur_y_inst == 0)\n",
    "                observed_loss = None\n",
    "\n",
    "        # Token accounting: the all-expert reference spends the expert tokens at\n",
    "        # every step; an expert call here costs instant + expert tokens.\n",
    "        step_baseline = cur_tok_exp\n",
    "        if action == 1:\n",
    "            step_actual = cur_tok_inst + cur_tok_exp\n",
    "            expert_calls += 1\n",
    "        else:\n",
    "            step_actual = cur_tok_inst\n",
    "\n",
    "        total_actual_tokens += step_actual\n",
    "        total_baseline_tokens += step_baseline\n",
    "        cumulative_loss += true_loss\n",
    "\n",
    "        # Running metrics up to and including step t.\n",
    "        current_token_ratio = total_actual_tokens / total_baseline_tokens if total_baseline_tokens > 0 else 1.0\n",
    "        current_expert_ratio = expert_calls / (t + 1)\n",
    "        current_avg_risk = cumulative_loss / (t + 1)\n",
    "\n",
    "        logs.append({\n",
    "            \"step\": t,\n",
    "            \"phase\": \"Calibration\" if is_in_calibration else \"Test\",\n",
    "            \"uncertainty\": cur_u,\n",
    "            \"threshold\": threshold,\n",
    "            \"action\": action,\n",
    "            \"true_loss\": true_loss,\n",
    "            \"observed_loss\": observed_loss,\n",
    "            \"avg_risk\": current_avg_risk,\n",
    "            \"token_ratio\": current_token_ratio,\n",
    "            \"expert_call_ratio\": current_expert_ratio,\n",
    "            \"wealth\": 1.0 # placeholder so the frame has the same columns for plotting\n",
    "        })\n",
    "\n",
    "    return pd.DataFrame(logs), u_hat"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "277acf1e",
   "metadata": {},
   "source": [
    "# bbh-logit score"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7f4b81ca",
   "metadata": {},
   "source": [
    "## Qwen3-ins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed18ff00",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# Expert results (qwen3-think file); keep only the rows whose `matched` value equals True.\n",
    "expert_data3 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-think.json\")\n",
    "expert_data = expert_data3[expert_data3[\"matched\"] == True]\n",
    "session_ids = list(expert_data[\"session_id\"])\n",
    "\n",
    "# Instant results (qwen3-ins file), restricted to the session ids kept above.\n",
    "instant_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-ins.json\")\n",
    "instant_data = instant_data1[instant_data1[\"session_id\"].isin(session_ids)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2fcb1e04",
   "metadata": {},
   "outputs": [],
   "source": [
    "instant_data1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe613126",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(list(set(instant_data[\"session_id\"]))),len(list(set(expert_data[\"session_id\"])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6612bb3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "instant_data.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c56030ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "instant_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5ea6cfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list = []\n",
    "selcet_ids = []\n",
    "\n",
    "# Index the instant rows by session_id once (first row per id, matching the\n",
    "# previous `.iloc[0]` lookup) instead of re-scanning the frame for every expert row.\n",
    "instant_by_id = instant_data.drop_duplicates(subset=\"session_id\").set_index(\"session_id\")\n",
    "\n",
    "for i, row in expert_data.iterrows():\n",
    "    session_id = row['session_id']\n",
    "    if session_id not in instant_by_id.index:\n",
    "        print(f\"Instant data missing at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    instant_row = instant_by_id.loc[session_id]\n",
    "    if instant_row[\"question\"] != row[\"question\"]:\n",
    "        print(f\"Question mismatch at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    token_probs = instant_row['token_probs']\n",
    "    if not token_probs:\n",
    "        # No first-token probability means no uncertainty score for this sample.\n",
    "        print(f\"Empty token_probs at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    # Record the id only once the pair is accepted, so selcet_ids stays aligned\n",
    "    # with data_list (rejected pairs used to be recorded as well).\n",
    "    selcet_ids.append(session_id)\n",
    "    data_list.append({\n",
    "        # Uncertainty = 1 - probability of the first generated token.\n",
    "        'uncertainty': 1 - token_probs[0],\n",
    "        # `matched` can hold the string \"No answer extracted\"; count that as wrong.\n",
    "        'instant_correct': int(instant_row['matched']) if instant_row['matched'] != \"No answer extracted\" else 0,\n",
    "        'expert_correct': int(expert_data.loc[i, 'matched']),\n",
    "        'instant_token': instant_row['gen_token_count'],\n",
    "        'expert_token': expert_data.loc[i, 'gen_token_count'],\n",
    "    })\n",
    "\n",
    "print(f\"Total samples prepared: {len(data_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00800907",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "# =========================\n",
    "# 1. Load and filter the data\n",
    "# =========================\n",
    "data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-ins-all.json\")\n",
    "data2 = data1[data1[\"session_id\"].isin(selcet_ids)]\n",
    "df = data2.copy()\n",
    "# First token probability per row; rows with empty token_probs become None.\n",
    "df[\"token_prob_scalar\"] = df[\"token_probs\"].apply(\n",
    "    lambda x: x[0] if x else None\n",
    ")\n",
    "\n",
    "\n",
    "# =========================\n",
    "# 2. Split by `matched`\n",
    "# =========================\n",
    "tp_true = df.loc[df[\"matched\"] == True, \"token_prob_scalar\"].dropna()\n",
    "tp_false = df.loc[df[\"matched\"] == False, \"token_prob_scalar\"].dropna()\n",
    "\n",
    "# =========================\n",
    "# 3. Histogram of the token probabilities\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.hist(tp_true, bins=30, alpha=0.6, label=\"matched=True\")\n",
    "plt.hist(tp_false, bins=30, alpha=0.6, label=\"matched=False\")\n",
    "plt.xlabel(\"token_prob_scalar\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.title(\"Token Probability Distribution by Matched\")\n",
    "plt.legend()\n",
    "plt.show()\n",
    "\n",
    "# =========================\n",
    "# 4. Boxplot\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.boxplot([tp_true, tp_false])\n",
    "# Label the boxes through xticks: boxplot's `labels=` keyword is deprecated since\n",
    "# Matplotlib 3.9, while xticks works on every version (boxes sit at 1..N).\n",
    "plt.xticks([1, 2], [\"Matched=True\", \"Matched=False\"])\n",
    "plt.ylabel(\"token_prob_scalar\")\n",
    "plt.title(\"Token Probability by Matched\")\n",
    "plt.show()\n",
    "\n",
    "\n",
    "# =====================================================================\n",
    "# Part two: instant / expert uncertainty analysis\n",
    "# =====================================================================\n",
    "\n",
    "# data_list: list[dict]; each per-sample field is pulled out exactly once.\n",
    "instant_tokens = [item[\"instant_token\"] for item in data_list]\n",
    "expert_tokens = [item[\"expert_token\"] for item in data_list]\n",
    "\n",
    "uncertainty_scores = [item[\"uncertainty\"] for item in data_list]\n",
    "correct_instant = [item[\"instant_correct\"] for item in data_list]\n",
    "correct_expert = [item[\"expert_correct\"] for item in data_list]\n",
    "\n",
    "# =========================\n",
    "# 5. Accuracy and mean token counts\n",
    "# =========================\n",
    "accuracy_instant = sum(correct_instant) / len(correct_instant)\n",
    "accuracy_expert = sum(correct_expert) / len(correct_expert)\n",
    "\n",
    "print(f\"Instant Model Accuracy: {accuracy_instant:.4f}\")\n",
    "print(f\"Expert Model Accuracy:  {accuracy_expert:.4f}\")\n",
    "\n",
    "mean_expert_tokens = np.mean(expert_tokens)\n",
    "mean_instant_tokens = np.mean(instant_tokens)\n",
    "print(f\"Mean Expert Tokens: {mean_expert_tokens}, Mean Instant Tokens: {mean_instant_tokens}\")\n",
    "\n",
    "\n",
    "# =========================\n",
    "# 6. Histogram of the uncertainty scores\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.hist(uncertainty_scores, bins=50)\n",
    "plt.xlabel(\"Uncertainty\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.title(\"Uncertainty Distribution\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de17ae64",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle a copy of the samples with a fixed seed so the simulation below gives\n",
    "# the same result after a kernel restart (the unseeded global RNG was used before).\n",
    "SHUFFLE_SEED = 0\n",
    "data_list1 = np.array(data_list, dtype=object)\n",
    "np.random.default_rng(SHUFFLE_SEED).shuffle(data_list1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "030fc8c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 运行配置\n",
    "cfg = BPACConfig(epsilon=0.1, alpha=0.1, rho=0.1,warm_up=0)\n",
    "\n",
    "# 执行模拟\n",
    "df_result, model = run_simulation(data_list1, cfg)\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "def plot_bpac_results(df_logs, config):\n",
    "    \"\"\"\n",
    "    Draw the four-panel B-PAC result figure from a simulation log.\n",
    "\n",
    "    Args:\n",
    "        df_logs: DataFrame with the columns `step`, `threshold`, `avg_risk`,\n",
    "            `token_ratio`, `expert_call_ratio` and `wealth`.\n",
    "        config: object exposing `alpha` and `epsilon` (used for the title, the\n",
    "            target-risk line and the `1 / alpha` wealth barrier).\n",
    "\n",
    "    Returns:\n",
    "        None. The figure is shown with `plt.show()`; an empty log only prints a\n",
    "        message.\n",
    "    \"\"\"\n",
    "    if df_logs.empty:\n",
    "        print(\"Log is empty. Check warmup settings or data.\")\n",
    "        return\n",
    "\n",
    "    fig, axes = plt.subplots(4, 1, figsize=(12, 16), sharex=True)\n",
    "\n",
    "    # ----------------------------------\n",
    "    # Subplot 1: Threshold Adaptation\n",
    "    # ----------------------------------\n",
    "    ax = axes[0]\n",
    "    ax.plot(df_logs['step'], df_logs['threshold'], label='Threshold ($u_t$)', color='#1f77b4', linewidth=2)\n",
    "    ax.set_ylabel('Uncertainty Score')\n",
    "    ax.set_title(f'Threshold Adaptation (Confidence {1-config.alpha:.0%})')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "    ax.set_ylim(-0.05, 1.05)\n",
    "\n",
    "    # ----------------------------------\n",
    "    # Subplot 2: Safety (Risk Control)\n",
    "    # ----------------------------------\n",
    "    ax = axes[1]\n",
    "    ax.plot(df_logs['step'], df_logs['avg_risk'], color='#d62728', label='Cumulative Avg Risk', linewidth=2)\n",
    "    # The backslash is doubled: a bare \\e in a non-raw string is an invalid escape\n",
    "    # sequence (a warning today, an error in future Python versions).\n",
    "    ax.axhline(y=config.epsilon, color='black', linestyle='--', linewidth=2, label=f'Target Risk ($\\\\epsilon={config.epsilon}$)')\n",
    "    ax.set_ylabel('Risk Rate')\n",
    "    ax.set_title('Safety Guarantee: Realized Risk vs. Target')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "    # ----------------------------------\n",
    "    # Subplot 3: Efficiency (Token & Expert Ratio)\n",
    "    # ----------------------------------\n",
    "    ax = axes[2]\n",
    "    ax.plot(df_logs['step'], df_logs['token_ratio'], color='#2ca02c', label='Token Ratio (vs. All-Expert)', linewidth=2)\n",
    "    ax.plot(df_logs['step'], df_logs['expert_call_ratio'], color='#ff7f0e', linestyle='-.', label='Expert Call Ratio', linewidth=2)\n",
    "    ax.set_ylabel('Ratio')\n",
    "    ax.set_title('Efficiency: Cost Reduction')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "    # ----------------------------------\n",
    "    # Subplot 4: Stability (Wealth Process)\n",
    "    # ----------------------------------\n",
    "    ax = axes[3]\n",
    "    ax.plot(df_logs['step'], df_logs['wealth'], color='#9467bd', label='Wealth of Selected Threshold ($K_t$)', linewidth=1.5)\n",
    "    # Horizontal barrier at 1 / alpha.\n",
    "    safe_wealth = 1.0 / config.alpha\n",
    "    ax.axhline(y=safe_wealth, color='purple', linestyle=':', label=f'Safety Barrier ($1/\\\\alpha={safe_wealth:.1f}$)')\n",
    "\n",
    "    ax.set_ylabel('Wealth Value')\n",
    "    ax.set_xlabel('Simulation Step (post-warmup)')\n",
    "    ax.set_title('Martingale Wealth Process')\n",
    "    ax.set_yscale('log') # log axis for the wealth values\n",
    "    ax.legend(loc='upper left')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "plot_bpac_results(df_result, cfg)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07167cac",
   "metadata": {},
   "source": [
    "### compare to pac"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "291811b8",
   "metadata": {},
   "source": [
    "#### BPAC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c7087a8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm \n",
    "\n",
    "\n",
    "# Assumes data_list has already been built by an earlier cell.\n",
    "y_solved = [item['expert_correct'] for item in data_list]\n",
    "y_hat_solved = [item['instant_correct'] for item in data_list]\n",
    "uncertainty_values = [item['uncertainty'] for item in data_list]\n",
    "y_token_list = [item['expert_token'] for item in data_list]\n",
    "y_hat_token_list = [item['instant_token'] for item in data_list]\n",
    "\n",
    "# Settings\n",
    "NUM_RUNS = 100         # number of repetitions\n",
    "TARGET_EPSILON = 0.08   # target risk\n",
    "TARGET_ALPHA = 0.1     # alpha passed to the baseline\n",
    "CALIB_NUM = 500        # calibration-set size\n",
    "TOTAL_NUM = len(y_solved)\n",
    "calib_ratio = CALIB_NUM / TOTAL_NUM\n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 1. Run the baseline simulation repeatedly\n",
    "# -------------------------------------------------------------------------\n",
    "\n",
    "# Full per-step sequences of every run.\n",
    "# Shape after stacking: (NUM_RUNS, N_samples)\n",
    "all_risks = []\n",
    "all_token_ratios = []\n",
    "all_expert_ratios = []\n",
    "all_u_hats = []\n",
    "\n",
    "print(f\"Starting {NUM_RUNS} runs simulation...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # A different seed per run (0, 1, ..., NUM_RUNS - 1), so the shuffle inside\n",
    "    # run_baseline_continuous differs from run to run.\n",
    "    df, u_hat = run_baseline_continuous(\n",
    "        y_solved, \n",
    "        y_hat_solved, \n",
    "        uncertainty_values, \n",
    "        y_token_list, \n",
    "        y_hat_token_list,\n",
    "        calib_ratio=calib_ratio, \n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=TARGET_ALPHA,\n",
    "        seed=seed\n",
    "    )\n",
    "    \n",
    "    # Keep the whole curve of this run (each column is a Series of length N).\n",
    "    all_risks.append(df['avg_risk'].values)\n",
    "    all_token_ratios.append(df['token_ratio'].values)\n",
    "    all_expert_ratios.append(df['expert_call_ratio'].values)\n",
    "    all_u_hats.append(u_hat)\n",
    "\n",
    "# Stack into NumPy arrays for mean / spread computations.\n",
    "# shape = (NUM_RUNS, N); arr_thresholds has one u_hat per run.\n",
    "arr_risks = np.array(all_risks)\n",
    "arr_token_ratios = np.array(all_token_ratios)\n",
    "arr_expert_ratios = np.array(all_expert_ratios)\n",
    "arr_thresholds = np.array(all_u_hats)\n",
    "\n",
    "print(\"\\nSimulation Finished!\")\n",
    "print(f\"Average u_hat across runs: {np.mean(all_u_hats):.4f} (std: {np.std(all_u_hats):.4f})\")\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Settings\n",
    "NUM_RUNS = 100        # number of repetitions\n",
    "WARM_UP_STEPS = 0   # BPAC warm-up steps (the counterpart of the baseline calibration size)\n",
    "TARGET_EPSILON = 0.08  # target risk\n",
    "CFG_ALPHA= 0.1       # BPACConfig.alpha\n",
    "CFG_RHO = 0      # BPACConfig.rho\n",
    "BPA_beta = 1\n",
    "CFG_RHO_0 = 0.05\n",
    "CFG_RHO_1 = 0.7\n",
    "CFG_CHANGE = 200\n",
    "C_CLIP = 0.9\n",
    "\n",
    "bpac_risks = []\n",
    "bpac_token_ratios = []\n",
    "bpac_expert_ratios = []\n",
    "bpac_wealths = []\n",
    "bpac_thresholds = []\n",
    "\n",
    "print(f\"Starting BPAC Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # -----------------------------------------------------------\n",
    "    # 1. Data alignment with the baseline\n",
    "    # -----------------------------------------------------------\n",
    "    # Same seed and same arange + shuffle as run_baseline_continuous, so both\n",
    "    # methods see the samples in the same order.\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # Shuffled index order\n",
    "    n = len(data_list)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # Reorder data_list by the shuffled indices. data_list is a list of dicts,\n",
    "    # which cannot be fancy-indexed with an index array, hence the list\n",
    "    # comprehension.\n",
    "    shuffled_data = [data_list[i] for i in indices]\n",
    "    \n",
    "    # -----------------------------------------------------------\n",
    "    # 2. Configure and run BPAC\n",
    "    # -----------------------------------------------------------\n",
    "    # BPACConfig for this run\n",
    "    cfg = BPACConfig(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "        beta=BPA_beta, \n",
    "        c_clip=C_CLIP,\n",
    "        rho_0=CFG_RHO_0,\n",
    "        rho_1=CFG_RHO_1,\n",
    "        change_point=CFG_CHANGE\n",
    "    )\n",
    "    \n",
    "    # Run the simulation on the shuffled data\n",
    "    df_result, _ = run_simulation(shuffled_data, cfg)\n",
    "    \n",
    "    bpac_risks.append(df_result['avg_risk'].values)\n",
    "    bpac_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpac_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpac_wealths.append(df_result['wealth'].values)\n",
    "    bpac_thresholds.append(df_result['threshold'].values)\n",
    "\n",
    "\n",
    "# Stack into NumPy arrays for mean computations\n",
    "bpac_risks_arr = np.array(bpac_risks)       # Shape: (NUM_RUNS, N)\n",
    "bpac_token_ratios_arr = np.array(bpac_token_ratios) # Shape: (NUM_RUNS, N)\n",
    "bpac_expert_ratios_arr = np.array(bpac_expert_ratios) # Shape: (NUM_RUNS, N)\n",
    "bpac_wealths_arr = np.array(bpac_wealths)   # Shape: (NUM_RUNS, N)\n",
    "bpac_thresholds_arr = np.array(bpac_thresholds) # Shape: (NUM_RUNS, N)\n",
    "print(\"BPAC Simulation Finished!\")\n",
    "bpac_risks_arr.shape, bpac_token_ratios_arr.shape, bpac_expert_ratios_arr.shape\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c848c795",
   "metadata": {},
   "source": [
    "##### 图1 threshold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c2e61a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "\n",
    "# ==========================================\n",
    "# 1. 统计与辅助函数\n",
    "# ==========================================\n",
    "SHADOW_TYPE = 'std'  \n",
    "CALIB_NUM = 500\n",
    "\n",
    "def get_error_bounds(data_array, type='std', scale=1):\n",
    "    \"\"\"\n",
    "    Return the scaled mean curve with its lower and upper shading bounds.\n",
    "\n",
    "    Args:\n",
    "        data_array: array of runs; axis 0 indexes the runs.\n",
    "        type: 'sem' uses `stats.sem`; any other value uses `np.std`.\n",
    "        scale: factor applied to the mean and to the spread.\n",
    "\n",
    "    Returns:\n",
    "        (mean, lower, upper). For a 1-D input or a single run the flattened,\n",
    "        scaled data is returned three times (no band).\n",
    "    \"\"\"\n",
    "    has_single_run = data_array.ndim == 1 or data_array.shape[0] <= 1\n",
    "    if has_single_run:\n",
    "        curve = data_array.flatten() * scale\n",
    "        return curve, curve, curve\n",
    "\n",
    "    # Only 'sem' selects the standard error; everything else is the plain std.\n",
    "    spread_fn = stats.sem if type == 'sem' else np.std\n",
    "\n",
    "    center = np.mean(data_array, axis=0) * scale\n",
    "    spread = spread_fn(data_array, axis=0) * scale\n",
    "\n",
    "    return center, center - spread, center + spread\n",
    "\n",
    "# ==========================================\n",
    "# 2. 数据预处理\n",
    "# ==========================================\n",
    "# 获取维度\n",
    "NUM_RUNS, N_STEPS = bpac_risks_arr.shape\n",
    "steps = np.arange(N_STEPS)\n",
    "\n",
    "# --- 2.1 构建 Baseline 阈值序列 ---\n",
    "# 逻辑：前 CALIB_NUM 步是 0，之后是 arr_thresholds[i]\n",
    "base_threshold_seqs = np.zeros((NUM_RUNS, N_STEPS))\n",
    "for i in range(NUM_RUNS):\n",
    "    base_threshold_seqs[i, :CALIB_NUM] = 0.0\n",
    "    # arr_thresholds 是标量数组 (100,)\n",
    "    base_threshold_seqs[i, CALIB_NUM:] = arr_thresholds[i]\n",
    "\n",
    "# --- 2.2 计算统计量 ---\n",
    "\n",
    "# Risk (ER) - 保持 0-0.2\n",
    "base_risk_m, base_risk_l, base_risk_h = get_error_bounds(arr_risks, SHADOW_TYPE)\n",
    "bpac_risk_m, bpac_risk_l, bpac_risk_h = get_error_bounds(bpac_risks_arr, SHADOW_TYPE)\n",
    "\n",
    "# ECP (Expert Call %) - 放大 100 倍\n",
    "base_ecp_m, base_ecp_l, base_ecp_h = get_error_bounds(arr_expert_ratios, SHADOW_TYPE, scale=100)\n",
    "bpac_ecp_m, bpac_ecp_l, bpac_ecp_h = get_error_bounds(bpac_expert_ratios_arr, SHADOW_TYPE, scale=100)\n",
    "\n",
    "# TCP (Token Cost %) - 放大 100 倍\n",
    "base_tcp_m, base_tcp_l, base_tcp_h = get_error_bounds(arr_token_ratios, SHADOW_TYPE, scale=100)\n",
    "bpac_tcp_m, bpac_tcp_l, bpac_tcp_h = get_error_bounds(bpac_token_ratios_arr, SHADOW_TYPE, scale=100)\n",
    "\n",
    "# Threshold ($u_t$) - 保持 0-1\n",
    "base_th_m, base_th_l, base_th_h = get_error_bounds(base_threshold_seqs, SHADOW_TYPE)\n",
    "bpac_th_m, bpac_th_l, bpac_th_h = get_error_bounds(bpac_thresholds_arr, SHADOW_TYPE)\n",
    "\n",
    "\n",
    "# ==========================================\n",
    "# 3. 绘图 (ICML 风格 - 4 Subplots)\n",
    "# ==========================================\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,\n",
    "    'axes.labelsize': 20,\n",
    "    'xtick.labelsize': 16,\n",
    "    'ytick.labelsize': 16,\n",
    "    'legend.fontsize': 16,\n",
    "    'lines.linewidth': 3.0,\n",
    "    'axes.grid': True, \n",
    "    'grid.alpha': 0.3, \n",
    "    'grid.linestyle': '--',\n",
    "    'figure.autolayout': True,\n",
    "    'savefig.dpi': 300\n",
    "})\n",
    "\n",
    "# 1行4列，宽一点\n",
    "fig, axes = plt.subplots(1, 4, figsize=(26, 5), sharex=True)\n",
    "\n",
    "# 颜色定义\n",
    "C_BASE = '#1f77b4'  # Blue\n",
    "C_OURS = '#d62728'  # Red\n",
    "C_TGT = '#2ca02c'   # Green (或者黑色 'black')\n",
    "C_WARM = 'gray'\n",
    "\n",
    "# --- (a) Risk (ER) ---\n",
    "ax = axes[0]\n",
    "# Baseline\n",
    "ax.plot(steps, base_risk_m, color=C_BASE, linestyle='--', label='PAC (Baseline)')\n",
    "ax.fill_between(steps, base_risk_l, base_risk_h, color=C_BASE, alpha=0.25)\n",
    "# BPAC\n",
    "ax.plot(steps, bpac_risk_m, color=C_OURS, linestyle='-', label='B-PAC (Ours)')\n",
    "ax.fill_between(steps, bpac_risk_l, bpac_risk_h, color=C_OURS, alpha=0.25)\n",
    "\n",
    "# Target Line\n",
    "ax.axhline(y=TARGET_EPSILON, color=C_TGT, linestyle='-', linewidth=2, label='Tolerance')\n",
    "if CALIB_NUM > 0:\n",
    "    ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel('ER', fontsize=22)\n",
    "ax.set_xlabel('Time Step')\n",
    "# 自动调整 ylim\n",
    "ax.set_ylim(0, max(0.15, TARGET_EPSILON * 2.0))\n",
    "ax.yaxis.set_major_locator(MultipleLocator(0.04))\n",
    "\n",
    "# 图例放在第一个图\n",
    "ax.legend(loc='upper right', frameon=True, framealpha=0.95, fontsize=14)\n",
    "\n",
    "\n",
    "# --- (b) ECP (%) ---\n",
    "ax = axes[1]\n",
    "# Baseline\n",
    "ax.plot(steps, base_ecp_m, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_ecp_l, base_ecp_h, color=C_BASE, alpha=0.25)\n",
    "# BPAC\n",
    "ax.plot(steps, bpac_ecp_m, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_ecp_l, bpac_ecp_h, color=C_OURS, alpha=0.25)\n",
    "\n",
    "if CALIB_NUM > 0:\n",
    "    ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel('ECP (%)', fontsize=20)\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 105)\n",
    "\n",
    "\n",
    "# --- (c) TCP (%) ---\n",
    "ax = axes[2]\n",
    "# Baseline\n",
    "ax.plot(steps, base_tcp_m, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_tcp_l, base_tcp_h, color=C_BASE, alpha=0.25)\n",
    "# BPAC\n",
    "ax.plot(steps, bpac_tcp_m, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_tcp_l, bpac_tcp_h, color=C_OURS, alpha=0.25)\n",
    "\n",
    "# Full Expert Reference\n",
    "ax.axhline(y=100, color='black', linestyle='-.', linewidth=1.5, label='Full Expert')\n",
    "if CALIB_NUM > 0:\n",
    "    ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel('TP (%)', fontsize=20) # 或者 'TP(%)'\n",
    "ax.set_xlabel('Time Step')\n",
    "\n",
    "\n",
    "# --- (d) Threshold (u_t) ---\n",
    "ax = axes[3]\n",
    "# Baseline (hat{u})\n",
    "ax.plot(steps, base_th_m, color=C_BASE, linestyle='--', label=r'Baseline $\\hat{u}$')\n",
    "ax.fill_between(steps, base_th_l, base_th_h, color=C_BASE, alpha=0.25)\n",
    "\n",
    "# BPAC (u_t)\n",
    "ax.plot(steps, bpac_th_m, color=C_OURS, linestyle='-', label=r'BPAC $u_t$')\n",
    "ax.fill_between(steps, bpac_th_l, bpac_th_h, color=C_OURS, alpha=0.25)\n",
    "\n",
    "if CALIB_NUM > 0:\n",
    "    ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel(r'Threshold ($\\hat{u}_t$)', fontsize=20)\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(-0.05, 1.05)\n",
    "# ax.legend(loc='lower right', frameon=True, framealpha=0.95, fontsize=14)\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('threshold_bbh.pdf', bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fb5da403",
   "metadata": {},
   "source": [
    "#### o-naive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64c1f8b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Experiment settings\n",
    "NUM_RUNS = 100        # number of repeated runs (one shuffle seed per run)\n",
    "WARM_UP_STEPS = 0   # BPAC warm-up steps (counterpart of the baseline's calibration-set size)\n",
    "TARGET_EPSILON = 0.08  # target risk\n",
    "CFG_ALPHA= 0.1      # BPAC parameter\n",
    "CFG_RHO = 0.05        # BPAC parameter\n",
    "\n",
    "\n",
    "bpaconaive_risks = []\n",
    "bpaconaive_token_ratios = []\n",
    "bpaconaive_expert_ratios = []\n",
    "bpaconaive_wealths = []\n",
    "bpaconaive_thresholds = []\n",
    "print(f\"Starting BPAC-Naive Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # -----------------------------------------------------------\n",
    "    # 1. Strict data alignment\n",
    "    # -----------------------------------------------------------\n",
    "    # Seed the generator with the run index; per the original note this is\n",
    "    # meant to reproduce the same shuffles as the baseline runs.\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # Build a shuffled index permutation\n",
    "    n = len(data_list)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # Reorder data_list according to the shuffled indices.\n",
    "    # Original note: data_list is a list of dicts, so it cannot be fancy-indexed\n",
    "    # with `indices` unless converted to numpy; a list comprehension is the safe option.\n",
    "    shuffled_data = [data_list[i] for i in indices]\n",
    "    \n",
    "    # -----------------------------------------------------------\n",
    "    # 2. Configure and run the simulation\n",
    "    # -----------------------------------------------------------\n",
    "    # Config for the naive variant (defined elsewhere in the notebook)\n",
    "    cfg = BPACConfignaive(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "    )\n",
    "    \n",
    "    # Run the simulation on the shuffled data; the second return value is discarded\n",
    "    df_result, _ = run_simulation_onaive(shuffled_data, cfg)\n",
    "    \n",
    "    bpaconaive_risks.append(df_result['avg_risk'].values)\n",
    "    bpaconaive_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpaconaive_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpaconaive_wealths.append(df_result['wealth'].values)\n",
    "    bpaconaive_thresholds.append(df_result['threshold'].values) \n",
    "\n",
    "# Stack the per-run trajectories into NumPy arrays so means can be taken across runs\n",
    "bpaconaive_risks_arr = np.array(bpaconaive_risks)       # Shape: (100, N)\n",
    "bpaconaive_token_ratios_arr = np.array(bpaconaive_token_ratios) # Shape: (100, N)\n",
    "bpaconaive_expert_ratios_arr = np.array(bpaconaive_expert_ratios) # Shape: (100, N)\n",
    "bpaconaive_wealths_arr = np.array(bpaconaive_wealths)   # Shape: (100, N)\n",
    "bpaconaive_thresholds_arr = np.array(bpaconaive_thresholds) # Shape: (100, N)\n",
    "print(\"BPAC-Naive Simulation Finished!\")\n",
    "bpaconaive_risks_arr.shape, bpaconaive_token_ratios_arr.shape, bpaconaive_expert_ratios_arr.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d0b24983",
   "metadata": {},
   "source": [
    "#### ips"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eaa14843",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Experiment settings\n",
    "NUM_RUNS = 100        # number of repeated runs (one shuffle seed per run)\n",
    "WARM_UP_STEPS = 0   # BPAC warm-up steps (counterpart of the baseline's calibration-set size)\n",
    "TARGET_EPSILON = 0.08  # target risk\n",
    "CFG_ALPHA= 0.1      # BPAC parameter\n",
    "CFG_RHO = 0.05        # BPAC parameter\n",
    "\n",
    "\n",
    "bpacips_risks = []\n",
    "bpacips_token_ratios = []\n",
    "bpacips_expert_ratios = []\n",
    "bpacips_wealths = []\n",
    "bpacips_thresholds = []\n",
    "print(f\"Starting BPAC-IPS Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # -----------------------------------------------------------\n",
    "    # 1. Strict data alignment\n",
    "    # -----------------------------------------------------------\n",
    "    # Seed the generator with the run index; per the original note this is\n",
    "    # meant to reproduce the same shuffles as the baseline runs.\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # Build a shuffled index permutation\n",
    "    n = len(data_list)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # Reorder data_list according to the shuffled indices.\n",
    "    # Original note: data_list is a list of dicts, so it cannot be fancy-indexed\n",
    "    # with `indices` unless converted to numpy; a list comprehension is the safe option.\n",
    "    shuffled_data = [data_list[i] for i in indices]\n",
    "    \n",
    "    # -----------------------------------------------------------\n",
    "    # 2. Configure and run the simulation\n",
    "    # -----------------------------------------------------------\n",
    "    # Config for the IPS variant (defined elsewhere in the notebook)\n",
    "    cfg = BPACConfigIPS(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "    )\n",
    "    \n",
    "    # Run the simulation on the shuffled data; the second return value is discarded\n",
    "    df_result, _ = run_simulation_ips(shuffled_data, cfg)\n",
    "    \n",
    "    bpacips_risks.append(df_result['avg_risk'].values)\n",
    "    bpacips_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpacips_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpacips_wealths.append(df_result['wealth'].values)\n",
    "    bpacips_thresholds.append(df_result['threshold'].values) \n",
    "\n",
    "# Stack the per-run trajectories into NumPy arrays so means can be taken across runs\n",
    "bpacips_risks_arr = np.array(bpacips_risks)       # Shape: (100, N)\n",
    "bpacips_token_ratios_arr = np.array(bpacips_token_ratios) # Shape: (100, N)\n",
    "bpacips_expert_ratios_arr = np.array(bpacips_expert_ratios) # Shape: (100, N)\n",
    "bpacips_wealths_arr = np.array(bpacips_wealths)   # Shape: (100, N)\n",
    "bpacips_thresholds_arr = np.array(bpacips_thresholds) # Shape: (100, N)\n",
    "print(\"BPAC-IPS Simulation Finished!\")\n",
    "bpacips_risks_arr.shape, bpacips_token_ratios_arr.shape, bpacips_expert_ratios_arr.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "01e2d62b",
   "metadata": {},
   "source": [
    "#### 画图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b403d753",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "def calc_stats(arr):\n",
    "    \"\"\"辅助函数：计算均值和标准差\"\"\"\n",
    "    return np.mean(arr, axis=0), np.std(arr, axis=0)\n",
    "\n",
    "# --- 计算 BPAC (Ours) 统计量 ---\n",
    "bpac_mean_risk, bpac_std_risk = calc_stats(bpac_risks_arr)\n",
    "bpac_mean_token, bpac_std_token = calc_stats(bpac_token_ratios_arr)\n",
    "bpac_mean_expert, bpac_std_expert = calc_stats(bpac_expert_ratios_arr)\n",
    "bpac_mean_threshold, bpac_std_threshold = calc_stats(bpac_thresholds_arr)\n",
    "bpac_mean_wealth, bpac_std_wealth = calc_stats(bpac_wealths_arr)\n",
    "\n",
    "# --- 计算 BPAC-Naive 统计量 ---\n",
    "naive_mean_risk, naive_std_risk = calc_stats(bpaconaive_risks_arr)\n",
    "naive_mean_token, naive_std_token = calc_stats(bpaconaive_token_ratios_arr)\n",
    "naive_mean_expert, naive_std_expert = calc_stats(bpaconaive_expert_ratios_arr)\n",
    "naive_mean_threshold, naive_std_threshold = calc_stats(bpaconaive_thresholds_arr)\n",
    "# Naive 通常没有 Wealth，这里忽略\n",
    "\n",
    "# --- 计算 BPAC-IPS 统计量 ---\n",
    "ips_mean_risk, ips_std_risk = calc_stats(bpacips_risks_arr)\n",
    "ips_mean_token, ips_std_token = calc_stats(bpacips_token_ratios_arr)\n",
    "ips_mean_expert, ips_std_expert = calc_stats(bpacips_expert_ratios_arr)\n",
    "ips_mean_threshold, ips_std_threshold = calc_stats(bpacips_thresholds_arr)\n",
    "ips_mean_wealth, ips_std_wealth = calc_stats(bpacips_wealths_arr)\n",
    "\n",
    "# ==========================================\n",
    "# 2. 绘图 (Plotting)\n",
    "# ==========================================\n",
    "# 获取时间步\n",
    "N_STEPS = bpac_risks_arr.shape[1]\n",
    "steps = np.arange(N_STEPS)\n",
    "\n",
    "# 使用 5 行子图\n",
    "fig, axes = plt.subplots(5, 1, figsize=(12, 22), sharex=True)\n",
    "(ax1, ax2, ax3, ax4, ax5) = axes\n",
    "\n",
    "# 通用绘图辅助函数\n",
    "def plot_with_std(ax, x, mean, std, label, color, linestyle='-'):\n",
    "    ax.plot(x, mean, label=label, color=color, linestyle=linestyle, linewidth=2)\n",
    "    # 填充误差带 (透明度设低一点以免遮挡)\n",
    "    ax.fill_between(x, mean - std, mean + std, color=color, alpha=0.1)\n",
    "\n",
    "# === Panel 1: Average Risk (risk control) ===\n",
    "plot_with_std(ax1, steps, naive_mean_risk, naive_std_risk, 'BPAC-Naive', 'blue', '--')\n",
    "plot_with_std(ax1, steps, ips_mean_risk, ips_std_risk, 'BPAC-IPS', 'green', '-.')\n",
    "plot_with_std(ax1, steps, bpac_mean_risk, bpac_std_risk, 'BPAC (Ours)', 'red')\n",
    "\n",
    "# Raw f-strings keep the mathtext backslash literal; in a plain string '\\e' is an\n",
    "# invalid escape sequence that newer Python versions warn about.\n",
    "ax1.axhline(y=TARGET_EPSILON, color='black', linestyle='-', linewidth=1.5, label=rf'Target $\\epsilon={TARGET_EPSILON}$')\n",
    "if WARM_UP_STEPS > 0:\n",
    "    ax1.axvline(x=WARM_UP_STEPS, color='gray', linestyle=':', label='Warm-up End')\n",
    "\n",
    "ax1.set_ylabel('Avg Risk')\n",
    "ax1.set_title(rf'1. Risk Control (Target $\\epsilon < {TARGET_EPSILON}$)')\n",
    "ax1.legend(loc='upper right')\n",
    "ax1.grid(True, alpha=0.3)\n",
    "# Size the y-axis from the last 100 steps of the BPAC and IPS means (and at least\n",
    "# 3x the target) so early-run fluctuations do not dominate the scale.\n",
    "max_risk_show = max(TARGET_EPSILON * 3.0, np.max(bpac_mean_risk[-100:]), np.max(ips_mean_risk[-100:]))\n",
    "ax1.set_ylim(0, max_risk_show)\n",
    "\n",
    "# === 子图 2: Token Ratio (成本节省) ===\n",
    "plot_with_std(ax2, steps, naive_mean_token, naive_std_token, 'BPAC-Naive', 'blue', '--')\n",
    "plot_with_std(ax2, steps, ips_mean_token, ips_std_token, 'BPAC-IPS', 'green', '-.')\n",
    "plot_with_std(ax2, steps, bpac_mean_token, bpac_std_token, 'BPAC (Ours)', 'red')\n",
    "\n",
    "ax2.axhline(y=1.0, color='black', linestyle=':')\n",
    "if WARM_UP_STEPS > 0:\n",
    "    ax2.axvline(x=WARM_UP_STEPS, color='gray', linestyle=':')\n",
    "\n",
    "ax2.set_ylabel('Token Ratio')\n",
    "ax2.set_title('2. Cost Efficiency (Lower is Better)')\n",
    "ax2.legend(loc='upper right')\n",
    "ax2.grid(True, alpha=0.3)\n",
    "\n",
    "# === 子图 3: Expert Call Ratio (专家调用率) ===\n",
    "plot_with_std(ax3, steps, naive_mean_expert, naive_std_expert, 'BPAC-Naive', 'blue', '--')\n",
    "plot_with_std(ax3, steps, ips_mean_expert, ips_std_expert, 'BPAC-IPS', 'green', '-.')\n",
    "plot_with_std(ax3, steps, bpac_mean_expert, bpac_std_expert, 'BPAC (Ours)', 'red')\n",
    "\n",
    "if WARM_UP_STEPS > 0:\n",
    "    ax3.axvline(x=WARM_UP_STEPS, color='gray', linestyle=':')\n",
    "\n",
    "ax3.set_ylabel('Expert Call Ratio')\n",
    "ax3.set_title('3. Expert Usage Behavior')\n",
    "ax3.grid(True, alpha=0.3)\n",
    "ax3.set_ylim(-0.05, 1.05)\n",
    "\n",
    "# === Panel 4: Threshold Evolution ===\n",
    "# The first label is a raw string so the mathtext '\\hat' keeps its backslash without\n",
    "# relying on an invalid escape sequence.\n",
    "plot_with_std(ax4, steps, naive_mean_threshold, naive_std_threshold, r'BPAC-Naive $\\hat{u}$', 'blue', '--')\n",
    "plot_with_std(ax4, steps, ips_mean_threshold, ips_std_threshold, 'BPAC-IPS $u_t$', 'green', '-.')\n",
    "plot_with_std(ax4, steps, bpac_mean_threshold, bpac_std_threshold, 'BPAC $u_t$', 'red')\n",
    "\n",
    "if WARM_UP_STEPS > 0:\n",
    "    ax4.axvline(x=WARM_UP_STEPS, color='gray', linestyle=':')\n",
    "\n",
    "ax4.set_ylabel('Threshold')\n",
    "ax4.set_title('4. Threshold Adaptation Strategy')\n",
    "ax4.legend(loc='center right')\n",
    "ax4.grid(True, alpha=0.3)\n",
    "ax4.set_ylim(-0.05, 1.05)\n",
    "\n",
    "# === 子图 5: Wealth Evolution (BPAC & IPS 对比) ===\n",
    "# Naive 没有 Wealth，只画 BPAC 和 IPS\n",
    "plot_with_std(ax5, steps, ips_mean_wealth, ips_std_wealth, 'BPAC-IPS Wealth', 'green', '-.')\n",
    "plot_with_std(ax5, steps, bpac_mean_wealth, bpac_std_wealth, 'BPAC Wealth', 'red')\n",
    "\n",
    "ax5.axhline(y=1.0, color='black', linestyle='--', label='Initial Wealth')\n",
    "if WARM_UP_STEPS > 0:\n",
    "    ax5.axvline(x=WARM_UP_STEPS, color='gray', linestyle=':')\n",
    "\n",
    "# 财富通常呈指数增长，建议使用对数坐标\n",
    "ax5.set_yscale('log')\n",
    "\n",
    "ax5.set_ylabel('Wealth (log scale)')\n",
    "ax5.set_xlabel('Time Step')\n",
    "ax5.set_title('5. Martingale Wealth Process Comparison')\n",
    "ax5.legend(loc='upper left')\n",
    "ax5.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9e1360b5",
   "metadata": {},
   "source": [
    "#### paper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f2820a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "\n",
    "# ==========================================\n",
    "# 1. 统计与辅助函数\n",
    "# ==========================================\n",
    "SHADOW_TYPE = 'std'  # 可选: 'std', 'sem', 'ci', 'percentile'\n",
    "CALIB_NUM =0\n",
    "def get_error_bounds(data_array, type='sem',scale=1):\n",
    "    \"\"\"计算均值和阴影上下界\"\"\"\n",
    "    mean_val = np.mean(data_array, axis=0)\n",
    "    sem_val = np.std(data_array, axis=0)\n",
    "    lower = mean_val - sem_val\n",
    "    upper = mean_val + sem_val\n",
    "        \n",
    "    return mean_val*scale, lower*scale, upper*scale\n",
    "\n",
    "def clip_bounds(low, high, min_v=0, max_v=None):\n",
    "    \"\"\"物理截断 (例如比率不能小于0)\"\"\"\n",
    "    low = np.maximum(low, min_v)\n",
    "    if max_v is not None:\n",
    "        high = np.minimum(high, max_v)\n",
    "    return low, high\n",
    "\n",
    "# ==========================================\n",
    "# 2. 计算绘图数据\n",
    "# ==========================================\n",
    "# 假设 steps 与数组长度一致\n",
    "steps = np.arange(bpac_risks_arr.shape[1])\n",
    "\n",
    "# --- Risk (ER) ---\n",
    "naive_risk_m, naive_risk_l, naive_risk_h = get_error_bounds(bpaconaive_risks_arr, SHADOW_TYPE)\n",
    "ips_risk_m, ips_risk_l, ips_risk_h = get_error_bounds(bpacips_risks_arr, SHADOW_TYPE)\n",
    "bpac_risk_m, bpac_risk_l, bpac_risk_h = get_error_bounds(bpac_risks_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- ECP (Expert Call %) ---\n",
    "naive_ecp_m, naive_ecp_l, naive_ecp_h = get_error_bounds(bpaconaive_expert_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "ips_ecp_m, ips_ecp_l, ips_ecp_h = get_error_bounds(bpacips_expert_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "bpac_ecp_m, bpac_ecp_l, bpac_ecp_h = get_error_bounds(bpac_expert_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "\n",
    "# --- TCP (Token Cost %) ---\n",
    "naive_tcp_m, naive_tcp_l, naive_tcp_h = get_error_bounds(bpaconaive_token_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "ips_tcp_m, ips_tcp_l, ips_tcp_h = get_error_bounds(bpacips_token_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "bpac_tcp_m, bpac_tcp_l, bpac_tcp_h = get_error_bounds(bpac_token_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "\n",
    "# # 截断不合理的数值\n",
    "# naive_risk_l, naive_risk_h = clip_bounds(naive_risk_l, naive_risk_h, 0, 1.0)\n",
    "# ips_risk_l, ips_risk_h = clip_bounds(ips_risk_l, ips_risk_h, 0, 1.0)\n",
    "# bpac_risk_l, bpac_risk_h = clip_bounds(bpac_risk_l, bpac_risk_h, 0, 1.0)\n",
    "\n",
    "# naive_ecp_l, naive_ecp_h = clip_bounds(naive_ecp_l, naive_ecp_h, 0, 1.0)\n",
    "# ips_ecp_l, ips_ecp_h = clip_bounds(ips_ecp_l, ips_ecp_h, 0, 1.0)\n",
    "# bpac_ecp_l, bpac_ecp_h = clip_bounds(bpac_ecp_l, bpac_ecp_h, 0, 1.0)\n",
    "\n",
    "# naive_tcp_l, naive_tcp_h = clip_bounds(naive_tcp_l, naive_tcp_h, 0, None)\n",
    "# ips_tcp_l, ips_tcp_h = clip_bounds(ips_tcp_l, ips_tcp_h, 0, None)\n",
    "# bpac_tcp_l, bpac_tcp_h = clip_bounds(bpac_tcp_l, bpac_tcp_h, 0, None)\n",
    "\n",
    "# ==========================================\n",
    "# 3. 绘图 (ICML 风格 - 3 Methods)\n",
    "# ==========================================\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,\n",
    "    'axes.labelsize': 20,\n",
    "    'xtick.labelsize': 16,\n",
    "    'ytick.labelsize': 16,\n",
    "    'legend.fontsize': 16,\n",
    "    'lines.linewidth': 3.0,\n",
    "    'axes.grid': True, \n",
    "    'grid.alpha': 0.3, \n",
    "    'grid.linestyle': '--',\n",
    "    'figure.autolayout': True,\n",
    "    'savefig.dpi': 300\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "# 颜色定义\n",
    "C_NAIVE = '#1f77b4'  # Blue\n",
    "C_IPS = '#2ca02c'    # Green\n",
    "C_OURS = '#d62728'   # Red\n",
    "C_TGT = 'black'      # Tolerance Line\n",
    "C_WARM = 'gray'      # Warmup Line\n",
    "\n",
    "# --- (a) Risk ---\n",
    "ax = axes[0]\n",
    "# 1. Naive\n",
    "ax.plot(steps, naive_risk_m, color=C_NAIVE, linestyle='--', label='O-Naive')\n",
    "ax.fill_between(steps, naive_risk_l, naive_risk_h, color=C_NAIVE, alpha=0.3)\n",
    "# 2. IPS\n",
    "ax.plot(steps, ips_risk_m, color=C_IPS, linestyle='-.', label='IPS+Hoeff')\n",
    "ax.fill_between(steps, ips_risk_l, ips_risk_h, color=C_IPS, alpha=0.3)\n",
    "# 3. Ours\n",
    "ax.plot(steps, bpac_risk_m, color=C_OURS, linestyle='-', label='B-PAC (Ours)')\n",
    "ax.fill_between(steps, bpac_risk_l, bpac_risk_h, color=C_OURS, alpha=0.3)\n",
    "\n",
    "ax.axhline(y=TARGET_EPSILON, color=C_TGT, linestyle='--', linewidth=2, label='Tolerance')\n",
    "# ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel('ER', fontsize=22)\n",
    "ax.set_xlabel('Time Step')\n",
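    "# Candidate dynamic ylim: 1.1 x max(1.5 x target, last-50-step max of the O-Naive and B-PAC means). Currently unused: the y-limit below is fixed to [0, 0.2].\n",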
    "max_risk_show = max(TARGET_EPSILON * 1.5, np.max(naive_risk_m[-50:]), np.max(bpac_risk_m[-50:])) * 1.1\n",
    "ax.set_ylim(0, 0.2)\n",
    "ax.yaxis.set_major_locator(MultipleLocator(0.04))\n",
    "\n",
    "# 图例\n",
    "ax.legend(loc='upper right', frameon=True, framealpha=0.95, fontsize=14)\n",
    "\n",
    "# --- (b) ECP ---\n",
    "ax = axes[1]\n",
    "ax.plot(steps, naive_ecp_m, color=C_NAIVE, linestyle='--')\n",
    "ax.fill_between(steps, naive_ecp_l, naive_ecp_h, color=C_NAIVE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, ips_ecp_m, color=C_IPS, linestyle='-.')\n",
    "ax.fill_between(steps, ips_ecp_l, ips_ecp_h, color=C_IPS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_ecp_m, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_ecp_l, bpac_ecp_h, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "# ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "ax.set_ylabel('ECP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 100)\n",
    "\n",
    "# --- (c) TCP ---\n",
    "ax = axes[2]\n",
    "ax.plot(steps, naive_tcp_m, color=C_NAIVE, linestyle='--')\n",
    "ax.fill_between(steps, naive_tcp_l, naive_tcp_h, color=C_NAIVE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, ips_tcp_m, color=C_IPS, linestyle='-.')\n",
    "ax.fill_between(steps, ips_tcp_l, ips_tcp_h, color=C_IPS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_tcp_m, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_tcp_l, bpac_tcp_h, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=100, color='black', linestyle='-.', linewidth=1.5, label='Full Expert')\n",
    "# ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel('TP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('qwen3ins_bbh.pdf', bbox_inches='tight') # 如需保存请取消注释\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aaead667",
   "metadata": {},
   "outputs": [],
   "source": [
    "bpac_risk_m, bpac_risk_l, bpac_risk_h "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "862ecbc4",
   "metadata": {},
   "source": [
    "#### 不同epsilon画图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed509eb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import pickle  # 用于保存结果到硬盘，防止跑太久丢失\n",
    "\n",
    "# ================= 配置区域 =================\n",
    "# 实验参数\n",
    "NUM_RUNS = 100        # 重复次数\n",
    "WARM_UP_STEPS = 0     \n",
    "EPSILON_LIST = [0.05, 0.06, 0.07, 0.08, 0.09, 0.10] # <--- 这里是你想要遍历的 epsilon 列表\n",
    "\n",
    "# 固定参数 (保持不变)\n",
    "CFG_ALPHA = 0.1       \n",
    "CFG_RHO = 0       \n",
    "BPA_beta = 1\n",
    "CFG_RHO_0 = 0.05\n",
    "CFG_RHO_1 = 0.7\n",
    "CFG_CHANGE = 200\n",
    "C_CLIP = 0.9\n",
    "\n",
    "# 结果存储容器：Key 是 epsilon 值，Value 是包含各指标数组的字典\n",
    "experiment_results = {} \n",
    "\n",
    "def run_single_epsilon_experiment(target_eps):\n",
    "    \"\"\"\n",
    "    Run NUM_RUNS seeded simulations with ``epsilon=target_eps``.\n",
    "\n",
    "    Every run shuffles ``data_list`` with ``np.random.default_rng(seed)`` for\n",
    "    ``seed`` in ``range(NUM_RUNS)``, so different epsilon values see the same\n",
    "    data orderings. All other BPACConfig fields come from the module-level\n",
    "    constants of this cell.\n",
    "\n",
    "    Returns a dict with keys 'risks', 'tokens', 'expert_calls', 'wealths' and\n",
    "    'thresholds'; each value is ``np.array`` of the per-run result columns.\n",
    "    \"\"\"\n",
    "    # Output key -> column of the per-run result frame, in collection order.\n",
    "    metric_columns = {\n",
    "        \"risks\": 'avg_risk',\n",
    "        \"tokens\": 'token_ratio',\n",
    "        \"expert_calls\": 'expert_call_ratio',\n",
    "        \"wealths\": 'wealth',\n",
    "        \"thresholds\": 'threshold',\n",
    "    }\n",
    "    collected = {key: [] for key in metric_columns}\n",
    "\n",
    "    # The progress-bar description shows which epsilon is currently running.\n",
    "    progress = tqdm(range(NUM_RUNS), desc=f\"Simulating Eps={target_eps}\", leave=False)\n",
    "    for seed in progress:\n",
    "        # 1. Seeded shuffle of the dataset order.\n",
    "        rng = np.random.default_rng(seed)\n",
    "        order = np.arange(len(data_list))\n",
    "        rng.shuffle(order)\n",
    "        shuffled_data = [data_list[i] for i in order]\n",
    "\n",
    "        # 2. Fresh config per run; only epsilon varies between experiments.\n",
    "        cfg = BPACConfig(\n",
    "            epsilon=target_eps,\n",
    "            alpha=CFG_ALPHA,\n",
    "            rho=CFG_RHO,\n",
    "            warm_up=WARM_UP_STEPS,\n",
    "            num_thresholds=1001,\n",
    "            beta=BPA_beta,\n",
    "            c_clip=C_CLIP,\n",
    "            rho_0=CFG_RHO_0,\n",
    "            rho_1=CFG_RHO_1,\n",
    "            change_point=CFG_CHANGE\n",
    "        )\n",
    "\n",
    "        # 3. Simulate; the second return value is not used here.\n",
    "        df_result, _ = run_simulation(shuffled_data, cfg)\n",
    "\n",
    "        # 4. Record each metric trajectory for this run.\n",
    "        for key, column in metric_columns.items():\n",
    "            collected[key].append(df_result[column].values)\n",
    "\n",
    "    # Stack the per-run trajectories into one array per metric.\n",
    "    return {key: np.array(runs) for key, runs in collected.items()}\n",
    "\n",
    "# ================= 主循环 =================\n",
    "print(f\"Starting Multi-Epsilon Simulation for: {EPSILON_LIST}\")\n",
    "\n",
    "for eps in EPSILON_LIST:\n",
    "    print(f\"\\n>>> Processing Target Epsilon: {eps}\")\n",
    "    result_bundle = run_single_epsilon_experiment(eps)\n",
    "    experiment_results[eps] = result_bundle\n",
    "\n",
    "print(\"\\nAll experiments finished!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8554ecc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "import matplotlib.cm as cm\n",
    "from matplotlib import colors as mcolors\n",
    "# ==========================================\n",
    "# 1. 统计与辅助函数 (保持不变)\n",
    "# ==========================================\n",
    "SHADOW_TYPE = 'std'  # 推荐使用 std 看得更清楚\n",
    "CALIB_NUM = 0\n",
    "\n",
    "def get_error_bounds(data_array, type='std', scale=1):\n",
    "    \"\"\"计算均值和阴影上下界\"\"\"\n",
    "    if data_array.ndim == 1 or data_array.shape[0] <= 1:\n",
    "        mean_val = data_array.flatten() * scale\n",
    "        return mean_val, mean_val, mean_val\n",
    "\n",
    "    mean_val = np.mean(data_array, axis=0) * scale\n",
    "    \n",
    "    if type == 'std':\n",
    "        dev_val = np.std(data_array, axis=0) * scale\n",
    "    elif type == 'sem':\n",
    "        dev_val = stats.sem(data_array, axis=0) * scale\n",
    "    else:\n",
    "        dev_val = np.std(data_array, axis=0) * scale\n",
    "        \n",
    "    lower = mean_val - dev_val\n",
    "    upper = mean_val + dev_val\n",
    "        \n",
    "    return mean_val, lower, upper\n",
    "\n",
    "# ==========================================\n",
    "# 2. 绘图设置 (ICML 风格)\n",
    "# ==========================================\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,\n",
    "    'axes.labelsize': 20,\n",
    "    'xtick.labelsize': 16,\n",
    "    'ytick.labelsize': 16,\n",
    "    'legend.fontsize': 16,\n",
    "    'lines.linewidth': 3.0,\n",
    "    'axes.grid': True, \n",
    "    'grid.alpha': 0.3, \n",
    "    'grid.linestyle': '--',\n",
    "    'figure.autolayout': True,\n",
    "    'savefig.dpi': 300\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "# 准备颜色生成器 (根据 EPSILON_LIST 的长度生成渐变色)\n",
    "# sorted_epsilons = sorted(experiment_results.keys()) # 确保按顺序画\n",
    "sorted_epsilons = sorted(EPSILON_LIST) # 使用你定义的列表\n",
    "num_eps = len(sorted_epsilons)\n",
    "\n",
    "\n",
    "# 这样生成的渐变色既保留了原来的风格，又保证了中间线条清晰可见\n",
    "colors_nodes = ['#1f77b4', '#2ca02c', '#d62728'] \n",
    "custom_cmap = mcolors.LinearSegmentedColormap.from_list(\"BlueGreenRed\", colors_nodes)\n",
    "\n",
    "# 生成颜色列表\n",
    "colors = [custom_cmap(x) for x in np.linspace(0, 1, num_eps)]\n",
    "# 获取时间步 (假设所有实验长度一致)\n",
    "# 从第一个结果中获取长度\n",
    "first_key = sorted_epsilons[0]\n",
    "steps = np.arange(experiment_results[first_key]['risks'].shape[1])\n",
    "\n",
    "# ==========================================\n",
    "# 3. 循环绘图\n",
    "# ==========================================\n",
    "\n",
    "for idx, eps in enumerate(sorted_epsilons):\n",
    "    bundle = experiment_results[eps]\n",
    "    color = colors[idx]\n",
    "    # Raw f-string: keeps the mathtext backslash literal ('\\e' is not a valid\n",
    "    # escape sequence in a plain string and triggers a warning on newer Python).\n",
    "    label_str = rf'$\\epsilon={eps}$'\n",
    "    \n",
    "    # --- 1. Risk (ER) ---\n",
    "    # 'risks' is the key written by run_single_epsilon_experiment.\n",
    "    risks_data = bundle['risks']\n",
    "    r_mean, r_low, r_high = get_error_bounds(risks_data, SHADOW_TYPE)\n",
    "    \n",
    "    ax = axes[0]\n",
    "    ax.plot(steps, r_mean, color=color, linestyle='-', label=label_str)\n",
    "    # Faint band so overlapping epsilon curves stay distinguishable.\n",
    "    ax.fill_between(steps, r_low, r_high, color=color, alpha=0.15)\n",
    "\n",
    "    # --- 2. ECP ---\n",
    "    # scale=100 converts the ratio to a percentage.\n",
    "    ecp_data = bundle['expert_calls']\n",
    "    e_mean, e_low, e_high = get_error_bounds(ecp_data, SHADOW_TYPE, scale=100)\n",
    "    \n",
    "    ax = axes[1]\n",
    "    ax.plot(steps, e_mean, color=color, linestyle='-')\n",
    "    ax.fill_between(steps, e_low, e_high, color=color, alpha=0.15)\n",
    "\n",
    "    # --- 3. TCP ---\n",
    "    # scale=100 converts the ratio to a percentage.\n",
    "    tcp_data = bundle['tokens']\n",
    "    t_mean, t_low, t_high = get_error_bounds(tcp_data, SHADOW_TYPE, scale=100)\n",
    "    \n",
    "    ax = axes[2]\n",
    "    ax.plot(steps, t_mean, color=color, linestyle='-')\n",
    "    ax.fill_between(steps, t_low, t_high, color=color, alpha=0.15)\n",
    "\n",
    "\n",
    "# ==========================================\n",
    "# 4. 细节修饰 (Axes Labels & Limits)\n",
    "# ==========================================\n",
    "\n",
    "# --- Ax0: Risk ---\n",
    "ax = axes[0]\n",
    "ax.set_ylabel('ER', fontsize=22)\n",
    "ax.set_xlabel('Time Step')\n",
    "# ax.set_ylim(0, max(sorted_epsilons)*2.0) # alternative: scale with the largest epsilon\n",
    "ax.set_ylim(0, 0.1) # fixed manually\n",
    "ax.legend(loc='upper right', frameon=True, framealpha=0.95, fontsize=12, ncol=2) # two-column legend\n",
    "# Raw string so the mathtext '\\epsilon' does not rely on an invalid escape sequence.\n",
    "ax.set_title(r\"Varying Tolerance ($\\epsilon$)\")\n",
    "\n",
    "# --- Ax1: ECP ---\n",
    "ax = axes[1]\n",
    "ax.set_ylabel('ECP (%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 100)\n",
    "# ax.legend() # 不需要重复图例\n",
    "\n",
    "# --- Ax2: TCP ---\n",
    "ax = axes[2]\n",
    "ax.set_ylabel('TP (%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.axhline(y=100, color='black', linestyle='-.', linewidth=1.5, label='Full Expert', zorder=1)\n",
    "\n",
    "# 保存\n",
    "plt.tight_layout()\n",
    "plt.savefig('epsilon_sensitivity_bbh.pdf', bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "504d62f2",
   "metadata": {},
   "source": [
    "#### 不同转折步数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3282448f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import pickle  # 用于保存结果到硬盘，防止跑太久丢失\n",
    "\n",
    "# ================= 配置区域 =================\n",
    "# 实验参数\n",
    "NUM_RUNS = 100        # 重复次数\n",
    "WARM_UP_STEPS = 0     \n",
    "CFG_CHANGES = [10,50,100,200,300,500] # <--- change_point values to sweep over\n",
    "\n",
    "# 固定参数 (保持不变)\n",
    "CFG_ALPHA = 0.1       \n",
    "CFG_RHO = 0       \n",
    "BPA_beta = 1\n",
    "CFG_RHO_0 = 0.05\n",
    "CFG_RHO_1 = 0.7\n",
    "CFG_EPSILON = 0.08\n",
    "# CFG_CHANGE = 200\n",
    "C_CLIP = 0.9\n",
    "\n",
    "# 结果存储容器：Key 是 epsilon 值，Value 是包含各指标数组的字典\n",
    "experiment_results = {} \n",
    "\n",
    "def run_single_epsilon_experiment(target_steps):\n",
    "    \"\"\"\n",
    "    针对单个 Epsilon 运行 NUM_RUNS 次实验\n",
    "    \"\"\"\n",
    "    local_risks = []\n",
    "    local_token_ratios = []\n",
    "    local_expert_ratios = []\n",
    "    local_wealths = []\n",
    "    local_thresholds = []\n",
    "    \n",
    "    # tqdm desc 参数可以让你知道当前跑的是哪个 epsilon\n",
    "    for seed in tqdm(range(NUM_RUNS), desc=f\"Simulating Steps={target_steps}\", leave=False):\n",
    "        # 1. 数据对齐 (保证不同 epsilon 使用的是完全相同的随机数据顺序)\n",
    "        rng = np.random.default_rng(seed)\n",
    "        n = len(data_list)\n",
    "        indices = np.arange(n)\n",
    "        rng.shuffle(indices)\n",
    "        shuffled_data = [data_list[i] for i in indices]\n",
    "        \n",
    "        # 2. 配置 BPAC (传入当前的 target_eps)\n",
    "        cfg = BPACConfig(\n",
    "            epsilon=CFG_EPSILON,  # <--- 动态变化的参数\n",
    "            alpha=CFG_ALPHA, \n",
    "            rho=CFG_RHO, \n",
    "            warm_up=WARM_UP_STEPS,\n",
    "            num_thresholds=1001, \n",
    "            beta=BPA_beta, \n",
    "            c_clip=C_CLIP,\n",
    "            rho_0=CFG_RHO_0,\n",
    "            rho_1=CFG_RHO_1,\n",
    "            change_point=target_steps\n",
    "        )\n",
    "        \n",
    "        # 3. 运行\n",
    "        df_result, _ = run_simulation(shuffled_data, cfg)\n",
    "        \n",
    "        # 4. 收集\n",
    "        local_risks.append(df_result['avg_risk'].values)\n",
    "        local_token_ratios.append(df_result['token_ratio'].values)\n",
    "        local_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "        local_wealths.append(df_result['wealth'].values)\n",
    "        local_thresholds.append(df_result['threshold'].values)\n",
    "\n",
    "    # 转 Numpy 并打包返回\n",
    "    return {\n",
    "        \"risks\": np.array(local_risks),         # Shape: (100, N)\n",
    "        \"tokens\": np.array(local_token_ratios), # Shape: (100, N)\n",
    "        \"expert_calls\": np.array(local_expert_ratios),\n",
    "        \"wealths\": np.array(local_wealths),\n",
    "        \"thresholds\": np.array(local_thresholds)\n",
    "    }\n",
    "\n",
    "# ================= 主循环 =================\n",
    "print(f\"Starting Multi-Epsilon Simulation for: {CFG_CHANGES}\")\n",
    "\n",
    "for eps in CFG_CHANGES:\n",
    "    print(f\"\\n>>> Processing Target Epsilon: {eps}\")\n",
    "    result_bundle = run_single_epsilon_experiment(eps)\n",
    "    experiment_results[eps] = result_bundle\n",
    "\n",
    "print(\"\\nAll experiments finished!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95e5f4a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "from matplotlib import colors as mcolors\n",
    "# ==========================================\n",
    "# 1. 统计与辅助函数 (保持不变)\n",
    "# ==========================================\n",
    "SHADOW_TYPE = 'std'  # 推荐使用 std 看得更清楚\n",
    "CALIB_NUM = 0\n",
    "\n",
    "def get_error_bounds(data_array, type='std', scale=1):\n",
    "    \"\"\"计算均值和阴影上下界\"\"\"\n",
    "    if data_array.ndim == 1 or data_array.shape[0] <= 1:\n",
    "        mean_val = data_array.flatten() * scale\n",
    "        return mean_val, mean_val, mean_val\n",
    "\n",
    "    mean_val = np.mean(data_array, axis=0) * scale\n",
    "    \n",
    "    if type == 'std':\n",
    "        dev_val = np.std(data_array, axis=0) * scale\n",
    "    elif type == 'sem':\n",
    "        dev_val = stats.sem(data_array, axis=0) * scale\n",
    "    else:\n",
    "        dev_val = np.std(data_array, axis=0) * scale\n",
    "        \n",
    "    lower = mean_val - dev_val\n",
    "    upper = mean_val + dev_val\n",
    "        \n",
    "    return mean_val, lower, upper\n",
    "\n",
    "# ==========================================\n",
    "# 2. 绘图设置 (ICML 风格)\n",
    "# ==========================================\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,\n",
    "    'axes.labelsize': 20,\n",
    "    'xtick.labelsize': 16,\n",
    "    'ytick.labelsize': 16,\n",
    "    'legend.fontsize': 16,\n",
    "    'lines.linewidth': 3.0,\n",
    "    'axes.grid': True, \n",
    "    'grid.alpha': 0.3, \n",
    "    'grid.linestyle': '--',\n",
    "    'figure.autolayout': True,\n",
    "    'savefig.dpi': 300\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "sorted_epsilons = sorted(CFG_CHANGES) # 使用你定义的列表\n",
    "num_eps = len(sorted_epsilons)\n",
    "\n",
    "\n",
    "# 这样生成的渐变色既保留了原来的风格，又保证了中间线条清晰可见\n",
    "colors_nodes = ['#1f77b4', '#2ca02c', '#d62728'] \n",
    "custom_cmap = mcolors.LinearSegmentedColormap.from_list(\"BlueGreenRed\", colors_nodes)\n",
    "\n",
    "# 生成颜色列表\n",
    "colors = [custom_cmap(x) for x in np.linspace(0, 1, num_eps)]\n",
    "# 获取时间步 (假设所有实验长度一致)\n",
    "# 从第一个结果中获取长度\n",
    "first_key = sorted_epsilons[0]\n",
    "steps = np.arange(experiment_results[first_key]['risks'].shape[1])\n",
    "\n",
    "# ==========================================\n",
    "# 3. 循环绘图\n",
    "# ==========================================\n",
    "\n",
    "for idx, eps in enumerate(sorted_epsilons):\n",
    "    bundle = experiment_results[eps]\n",
    "    color = colors[idx]\n",
    "    label_str = f'$T_{{warm}}={eps}$'\n",
    "    \n",
    "\n",
    "    # --- 1. Risk (ER) ---\n",
    "    # 提取数据\n",
    "    risks_data = bundle['risks'] # 假设 key 是 'risks' (根据你之前的代码可能是 avg_risk)\n",
    "    # 如果你的 bundle 存的是 df_result['avg_risk'].values，那 key 需要对应上\n",
    "    # 这里假设你存的时候比如: 'risks': np.array(list_of_arrays)\n",
    "    \n",
    "    # 如果你的 bundle 结构不一样，请在这里调整提取逻辑，例如:\n",
    "    # risks_data = np.array([res['avg_risk'].values for res in bundle]) \n",
    "    \n",
    "    r_mean, r_low, r_high = get_error_bounds(risks_data, SHADOW_TYPE)\n",
    "    \n",
    "    ax = axes[0]\n",
    "    ax.plot(steps, r_mean, color=color, linestyle='-', label=label_str)\n",
    "    ax.fill_between(steps, r_low, r_high, color=color, alpha=0.15) # 阴影淡一点，避免重叠太乱\n",
    "\n",
    "    # --- 2. ECP ---\n",
    "    # 注意：scale=100\n",
    "    ecp_data = bundle['expert_calls'] # 对应 run_single_epsilon_experiment 里的 key\n",
    "    e_mean, e_low, e_high = get_error_bounds(ecp_data, SHADOW_TYPE, scale=100)\n",
    "    \n",
    "    ax = axes[1]\n",
    "    ax.plot(steps, e_mean, color=color, linestyle='-')\n",
    "    ax.fill_between(steps, e_low, e_high, color=color, alpha=0.15)\n",
    "\n",
    "    # --- 3. TCP ---\n",
    "    # 注意：scale=100\n",
    "    tcp_data = bundle['tokens'] # 对应 run_single_epsilon_experiment 里的 key\n",
    "    t_mean, t_low, t_high = get_error_bounds(tcp_data, SHADOW_TYPE, scale=100)\n",
    "    \n",
    "    ax = axes[2]\n",
    "    ax.plot(steps, t_mean, color=color, linestyle='-')\n",
    "    ax.fill_between(steps, t_low, t_high, color=color, alpha=0.15)\n",
    "\n",
    "\n",
    "# ==========================================\n",
    "# 4. 细节修饰 (Axes Labels & Limits)\n",
    "# ==========================================\n",
    "\n",
    "# --- Ax0: Risk ---\n",
    "ax = axes[0]\n",
    "ax.set_ylabel('ER', fontsize=22)\n",
    "ax.set_xlabel('Time Step')\n",
    "# ax.set_ylim(0, max(sorted_epsilons)*2.0) # 动态调整，大概是最大 epsilon 的两倍\n",
    "ax.set_ylim(0, 0.1) # 或者手动固定\n",
    "ax.legend(loc='upper right', frameon=True, framealpha=0.95, fontsize=12, ncol=2) # 分两列显示图例\n",
    "ax.set_title(\"Varying Tolerance ($\\epsilon$)\")\n",
    "\n",
    "# --- Ax1: ECP ---\n",
    "ax = axes[1]\n",
    "ax.set_ylabel('ECP (%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 100)\n",
    "# ax.legend() # 不需要重复图例\n",
    "\n",
    "# --- Ax2: TCP ---\n",
    "ax = axes[2]\n",
    "ax.set_ylabel('TP (%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.axhline(y=100, color='black', linestyle='-.', linewidth=1.5, label='Full Expert', zorder=1)\n",
    "\n",
    "# 保存\n",
    "plt.tight_layout()\n",
    "plt.savefig('warm_step_bbh.pdf', bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf19f95d",
   "metadata": {},
   "source": [
    "## Qwen2.5-7b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9054c329",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "think= pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-think.json\")\n",
    "expert_data = think[think[\"matched\"] == True]\n",
    "session_ids = list(expert_data[\"session_id\"])\n",
    "\n",
    "instant_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen2.5-7b.json\")\n",
    "instant_data = instant_data1[instant_data1[\"session_id\"].isin(session_ids)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f35bde0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(list(set(instant_data[\"session_id\"]))),len(list(set(expert_data[\"session_id\"])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "915aa201",
   "metadata": {},
   "outputs": [],
   "source": [
    "instant_data.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6d55de5",
   "metadata": {},
   "outputs": [],
   "source": [
    "instant_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0dec68fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list =[]\n",
    "selcet_ids =[]\n",
    "for i,row in expert_data.iterrows():\n",
    "    session_id = row['session_id']\n",
    "    if session_id not in instant_data['session_id'].values:\n",
    "        print(f\"Instant data missing at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict = {}\n",
    "    instant_row = instant_data[instant_data['session_id'] == session_id].iloc[0]\n",
    "    selcet_ids.append(session_id)\n",
    "    if instant_row[\"question\"] != row[\"question\"]:\n",
    "        print(f\"Question mismatch at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict['uncertainty'] = 1- instant_row['token_probs'][0]\n",
    "    tmp_dict['instant_correct'] = int(instant_row['matched']) if instant_row['matched'] != \"No answer extracted\" else 0\n",
    "    tmp_dict['expert_correct'] = int(expert_data.loc[i, 'matched'])\n",
    "    tmp_dict['instant_token'] = instant_row['gen_token_count']\n",
    "    tmp_dict['expert_token'] = expert_data.loc[i, 'gen_token_count']\n",
    "    data_list.append(tmp_dict)\n",
    "\n",
    "print(f\"Total samples prepared: {len(data_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1d0fc29",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen2.5-7b.json\")\n",
    "data2 = data1[data1[\"session_id\"].isin(selcet_ids)]\n",
    "df = data2.copy()\n",
    "df[\"token_prob_scalar\"] = df[\"token_probs\"].apply(\n",
    "    lambda x: x[0] if x else None\n",
    ")\n",
    "# =========================\n",
    "# 2. 按 matched 分组\n",
    "# =========================\n",
    "tp_true = df.loc[df[\"matched\"] == True, \"token_prob_scalar\"].dropna()\n",
    "tp_false = df.loc[df[\"matched\"] == False, \"token_prob_scalar\"].dropna()\n",
    "\n",
    "# =========================\n",
    "# 3. 置信度分布直方图\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.hist(tp_true, bins=30, alpha=0.6, label=\"matched=True\")\n",
    "plt.hist(tp_false, bins=30, alpha=0.6, label=\"matched=False\")\n",
    "plt.xlabel(\"token_prob_scalar\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.title(\"Token Probability Distribution by Matched\")\n",
    "plt.legend()\n",
    "plt.show()\n",
    "\n",
    "# =========================\n",
    "# 4. Boxplot\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.boxplot([tp_true, tp_false], labels=[\"Matched=True\", \"Matched=False\"])\n",
    "plt.ylabel(\"token_prob_scalar\")\n",
    "plt.title(\"Token Probability by Matched\")\n",
    "plt.show()\n",
    "\n",
    "\n",
    "# =====================================================================\n",
    "# 第二部分：Instant / Expert 不确定性分析\n",
    "# =====================================================================\n",
    "\n",
    "# data_list: list[dict]\n",
    "instant_tokens = [item[\"instant_token\"] for item in data_list]\n",
    "expert_tokens = [item[\"expert_token\"] for item in data_list]\n",
    "\n",
    "uncertainty_scores = [item[\"uncertainty\"] for item in data_list]\n",
    "correct_instant = [item[\"instant_correct\"] for item in data_list]\n",
    "correct_expert = [item[\"expert_correct\"] for item in data_list]\n",
    "\n",
    "# =========================\n",
    "# 5. 准确率\n",
    "# =========================\n",
    "accuracy_instant = sum(correct_instant) / len(correct_instant)\n",
    "accuracy_expert = sum(correct_expert) / len(correct_expert)\n",
    "\n",
    "print(f\"Instant Model Accuracy: {accuracy_instant:.4f}\")\n",
    "print(f\"Expert Model Accuracy:  {accuracy_expert:.4f}\")\n",
    "\n",
    "expert_tokens = [item['expert_token'] for item in data_list]\n",
    "instant_tokens = [item['instant_token'] for item in data_list]\n",
    "mean_expert_tokens = np.mean(expert_tokens)\n",
    "mean_instant_tokens = np.mean(instant_tokens)\n",
    "print(f\"Mean Expert Tokens: {mean_expert_tokens}, Mean Instant Tokens: {mean_instant_tokens}\")\n",
    "\n",
    "\n",
    "# =========================\n",
    "# 6. Uncertainty 直方图\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.hist(uncertainty_scores, bins=50)\n",
    "plt.xlabel(\"Uncertainty\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.title(\"Uncertainty Distribution\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af566c29",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "data_list1 = np.array(data_list)\n",
    "\n",
    "np.random.shuffle(data_list1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec610764",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 运行配置\n",
    "cfg = BPACConfig(epsilon=0.1, alpha=0.1, rho=0.1,warm_up=0)\n",
    "\n",
    "# 执行模拟\n",
    "df_result, model = run_simulation(data_list1, cfg)\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "def plot_bpac_results(df_logs, config):\n",
    "    \"\"\"\n",
    "    绘制 B-PAC 实验结果面板\n",
    "    \"\"\"\n",
    "    if df_logs.empty:\n",
    "        print(\"Log is empty. Check warmup settings or data.\")\n",
    "        return\n",
    "\n",
    "    fig, axes = plt.subplots(4, 1, figsize=(12, 16), sharex=True)\n",
    "    \n",
    "    # ----------------------------------\n",
    "    # Subplot 1: Threshold Adaptation\n",
    "    # ----------------------------------\n",
    "    ax = axes[0]\n",
    "    ax.plot(df_logs['step'], df_logs['threshold'], label='Threshold ($u_t$)', color='#1f77b4', linewidth=2)\n",
    "    # 可以在背景里画出 uncertainty 的散点，展示数据分布（可选）\n",
    "    # ax.scatter(df_logs['step'], df_logs['uncertainty'], alpha=0.1, color='gray', s=1, label='Input Uncertainty')\n",
    "    ax.set_ylabel('Uncertainty Score')\n",
    "    ax.set_title(f'Threshold Adaptation (Confidence {1-config.alpha:.0%})')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "    ax.set_ylim(-0.05, 1.05)\n",
    "\n",
    "    # ----------------------------------\n",
    "    # Subplot 2: Safety (Risk Control)\n",
    "    # ----------------------------------\n",
    "    ax = axes[1]\n",
    "    ax.plot(df_logs['step'], df_logs['avg_risk'], color='#d62728', label='Cumulative Avg Risk', linewidth=2)\n",
    "    ax.axhline(y=config.epsilon, color='black', linestyle='--', linewidth=2, label=f'Target Risk ($\\epsilon={config.epsilon}$)')\n",
    "    ax.set_ylabel('Risk Rate')\n",
    "    ax.set_title('Safety Guarantee: Realized Risk vs. Target')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "    \n",
    "    # ----------------------------------\n",
    "    # Subplot 3: Efficiency (Token & Expert Ratio)\n",
    "    # ----------------------------------\n",
    "    ax = axes[2]\n",
    "    ax.plot(df_logs['step'], df_logs['token_ratio'], color='#2ca02c', label='Token Ratio (vs. All-Expert)', linewidth=2)\n",
    "    ax.plot(df_logs['step'], df_logs['expert_call_ratio'], color='#ff7f0e', linestyle='-.', label='Expert Call Ratio', linewidth=2)\n",
    "    ax.set_ylabel('Ratio')\n",
    "    ax.set_title('Efficiency: Cost Reduction')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "    \n",
    "    # ----------------------------------\n",
    "    # Subplot 4: Stability (Wealth Process)\n",
    "    # ----------------------------------\n",
    "    ax = axes[3]\n",
    "    ax.plot(df_logs['step'], df_logs['wealth'], color='#9467bd', label='Wealth of Selected Threshold ($K_t$)', linewidth=1.5)\n",
    "    # 画出安全线 1/alpha\n",
    "    safe_wealth = 1.0 / config.alpha\n",
    "    ax.axhline(y=safe_wealth, color='purple', linestyle=':', label=f'Safety Barrier ($1/\\\\alpha={safe_wealth:.1f}$)')\n",
    "    \n",
    "    ax.set_ylabel('Wealth Value')\n",
    "    ax.set_xlabel('Simulation Step (post-warmup)')\n",
    "    ax.set_title('Martingale Wealth Process')\n",
    "    ax.set_yscale('log') # 财富值通常是指数增长的，用对数坐标更好看\n",
    "    ax.legend(loc='upper left')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "plot_bpac_results(df_result, cfg)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "efe43d57",
   "metadata": {},
   "source": [
    "### compare to pac"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e0527322",
   "metadata": {},
   "source": [
    "#### 3baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddb29f35",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm \n",
    "\n",
    "# # 假设 data_list 已经准备好\n",
    "# y_solved = [item['expert_correct'] for item in data_list]\n",
    "# y_hat_solved = [item['instant_correct'] for item in data_list]\n",
    "# uncertainty_values = [item['uncertainty'] for item in data_list]\n",
    "# y_token_list = [item['expert_token'] for item in data_list]\n",
    "# y_hat_token_list = [item['instant_token'] for item in data_list]\n",
    "\n",
    "# # 参数设置\n",
    "# NUM_RUNS = 100         # 重复次数\n",
    "# TARGET_EPSILON = 0.05   # 目标 Risk\n",
    "# TARGET_ALPHA = 0.05     # 置信度\n",
    "# CALIB_NUM = 1000        # 校准集大小\n",
    "# TOTAL_NUM = len(y_solved)\n",
    "# calib_ratio = CALIB_NUM / TOTAL_NUM\n",
    "\n",
    "# # -------------------------------------------------------------------------\n",
    "# # 1. 循环运行 Simulation\n",
    "# # -------------------------------------------------------------------------\n",
    "\n",
    "# # 用于存储每次实验的完整序列\n",
    "# # 维度: (100, N_samples)\n",
    "# all_risks = []\n",
    "# all_token_ratios = []\n",
    "# all_expert_ratios = []\n",
    "# all_u_hats = []\n",
    "\n",
    "# print(f\"Starting {NUM_RUNS} runs simulation...\")\n",
    "\n",
    "# for seed in tqdm(range(NUM_RUNS)):\n",
    "#     # 每次使用不同的随机种子 (seed=0, 1, 2, ..., 99)\n",
    "#     # 这样 run_baseline_continuous 内部的 shuffle 结果每次都不一样\n",
    "#     df, u_hat = run_baseline_continuous(\n",
    "#         y_solved, \n",
    "#         y_hat_solved, \n",
    "#         uncertainty_values, \n",
    "#         y_token_list, \n",
    "#         y_hat_token_list,\n",
    "#         calib_ratio=calib_ratio, \n",
    "#         epsilon=TARGET_EPSILON, \n",
    "#         alpha=TARGET_ALPHA,\n",
    "#         seed=seed\n",
    "#     )\n",
    "    \n",
    "#     # 记录整条曲线 (df['avg_risk'] 是一个 Series)\n",
    "#     all_risks.append(df['avg_risk'].values)\n",
    "#     all_token_ratios.append(df['token_ratio'].values)\n",
    "#     all_expert_ratios.append(df['expert_call_ratio'].values)\n",
    "#     all_u_hats.append(u_hat)\n",
    "\n",
    "# # 转为 Numpy Array 方便计算均值方差\n",
    "# # shape = (100, N)\n",
    "# arr_risks = np.array(all_risks)\n",
    "# arr_token_ratios = np.array(all_token_ratios)\n",
    "# arr_expert_ratios = np.array(all_expert_ratios)\n",
    "# arr_thresholds = np.array(all_u_hats)\n",
    "\n",
    "# print(\"\\nSimulation Finished!\")\n",
    "# print(f\"Average u_hat across runs: {np.mean(all_u_hats):.4f} (std: {np.std(all_u_hats):.4f})\")\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100        # 重复 100 次\n",
    "WARM_UP_STEPS = 0   # BPAC 的 Warm-up 步数 (对应 Baseline 的校准集大小)\n",
    "TARGET_EPSILON = 0.08  # 目标 Risk\n",
    "CFG_ALPHA= 0.1       # BPAC 参数\n",
    "CFG_RHO = 0      # BPAC 参数\n",
    "BPA_beta = 1\n",
    "CFG_RHO_0 = 0.05\n",
    "CFG_RHO_1 = 0.7\n",
    "CFG_CHANGE = 200\n",
    "C_CLIP = 0.9\n",
    "\n",
    "bpac_risks = []\n",
    "bpac_token_ratios = []\n",
    "bpac_expert_ratios = []\n",
    "bpac_wealths = []\n",
    "bpac_thresholds = []\n",
    "\n",
    "print(f\"Starting BPAC Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # -----------------------------------------------------------\n",
    "    # 1. 严格的数据对齐 (Data Alignment)\n",
    "    # -----------------------------------------------------------\n",
    "    # 使用与 Baseline 相同的种子生成随机数生成器\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # 生成打乱的索引\n",
    "    n = len(data_list)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # 关键修正：根据 indices 重排 data_list\n",
    "    # 注意：data_list 是 list of dicts，不能直接用 indices 索引，除非转 numpy\n",
    "    # 这里用列表推导式最稳妥\n",
    "    shuffled_data = [data_list[i] for i in indices]\n",
    "    \n",
    "    # -----------------------------------------------------------\n",
    "    # 2. 配置并运行 BPAC\n",
    "    # -----------------------------------------------------------\n",
    "    # 你的 BPACConfig\n",
    "    cfg = BPACConfig(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "        beta=BPA_beta, \n",
    "        c_clip=C_CLIP,\n",
    "        rho_0=CFG_RHO_0,\n",
    "        rho_1=CFG_RHO_1,\n",
    "        change_point=CFG_CHANGE\n",
    "    )\n",
    "    \n",
    "    # 运行模拟 (传入打乱后的数据)\n",
    "    df_result, _ = run_simulation(shuffled_data, cfg)\n",
    "    \n",
    "    bpac_risks.append(df_result['avg_risk'].values)\n",
    "    bpac_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpac_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpac_wealths.append(df_result['wealth'].values)\n",
    "    bpac_thresholds.append(df_result['threshold'].values)\n",
    "\n",
    "\n",
    "# 转换为 Numpy 数组方便计算均值\n",
    "bpac_risks_arr = np.array(bpac_risks)       # Shape: (100, N)\n",
    "bpac_token_ratios_arr = np.array(bpac_token_ratios) # Shape: (100, N)\n",
    "bpac_expert_ratios_arr = np.array(bpac_expert_ratios) # Shape: (100, N)\n",
    "bpac_wealths_arr = np.array(bpac_wealths)   # Shape: (100, N)\n",
    "bpac_thresholds_arr = np.array(bpac_thresholds) # Shape: (100, N)\n",
    "print(\"BPAC Simulation Finished!\")\n",
    "bpac_risks_arr.shape, bpac_token_ratios_arr.shape, bpac_expert_ratios_arr.shape\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from abpaconaive import BPACConfignaive,run_simulation_onaive\n",
    "# 参数设置\n",
    "NUM_RUNS = 100        # 重复 100 次\n",
    "WARM_UP_STEPS = 0   # BPAC 的 Warm-up 步数 (对应 Baseline 的校准集大小)\n",
    "TARGET_EPSILON = 0.08  # 目标 Risk\n",
    "CFG_ALPHA= 0.1      # BPAC 参数\n",
    "CFG_RHO = 0.05        # BPAC 参数\n",
    "\n",
    "\n",
    "bpaconaive_risks = []\n",
    "bpaconaive_token_ratios = []\n",
    "bpaconaive_expert_ratios = []\n",
    "bpaconaive_wealths = []\n",
    "bpaconaive_thresholds = []\n",
    "print(f\"Starting BPAC-Naive Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # -----------------------------------------------------------\n",
    "    # 1. 严格的数据对齐 (Data Alignment)\n",
    "    # -----------------------------------------------------------\n",
    "    # 使用与 Baseline 相同的种子生成随机数生成器\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # 生成打乱的索引\n",
    "    n = len(data_list)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # 关键修正：根据 indices 重排 data_list\n",
    "    # 注意：data_list 是 list of dicts，不能直接用 indices 索引，除非转 numpy\n",
    "    # 这里用列表推导式最稳妥\n",
    "    shuffled_data = [data_list[i] for i in indices]\n",
    "    \n",
    "    # -----------------------------------------------------------\n",
    "    # 2. 配置并运行 BPAC\n",
    "    # -----------------------------------------------------------\n",
    "    # 你的 BPACConfig\n",
    "    cfg = BPACConfignaive(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "    )\n",
    "    \n",
    "    # 运行模拟 (传入打乱后的数据)\n",
    "    df_result, _ = run_simulation_onaive(shuffled_data, cfg)\n",
    "    \n",
    "    bpaconaive_risks.append(df_result['avg_risk'].values)\n",
    "    bpaconaive_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpaconaive_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpaconaive_wealths.append(df_result['wealth'].values)\n",
    "    bpaconaive_thresholds.append(df_result['threshold'].values) \n",
    "\n",
    "# 转换为 Numpy 数组方便计算均值\n",
    "bpaconaive_risks_arr = np.array(bpaconaive_risks)       # Shape: (100, N)\n",
    "bpaconaive_token_ratios_arr = np.array(bpaconaive_token_ratios) # Shape: (100, N)\n",
    "bpaconaive_expert_ratios_arr = np.array(bpaconaive_expert_ratios) # Shape: (100, N)\n",
    "bpaconaive_wealths_arr = np.array(bpaconaive_wealths)   # Shape: (100, N)\n",
    "bpaconaive_thresholds_arr = np.array(bpaconaive_thresholds) # Shape: (100, N)\n",
    "print(\"BPAC-Naive Simulation Finished!\")\n",
    "bpaconaive_risks_arr.shape, bpaconaive_token_ratios_arr.shape, bpaconaive_expert_ratios_arr.shape\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from abpacips import BPACConfigIPS,run_simulation_ips\n",
    "# 参数设置\n",
    "NUM_RUNS = 100        # 重复 100 次\n",
    "WARM_UP_STEPS = 0   # BPAC 的 Warm-up 步数 (对应 Baseline 的校准集大小)\n",
    "TARGET_EPSILON = 0.08  # 目标 Risk\n",
    "CFG_ALPHA= 0.1      # BPAC 参数\n",
    "CFG_RHO = 0.05        # BPAC 参数\n",
    "\n",
    "\n",
    "bpacips_risks = []\n",
    "bpacips_token_ratios = []\n",
    "bpacips_expert_ratios = []\n",
    "bpacips_wealths = []\n",
    "bpacips_thresholds = []\n",
    "print(f\"Starting BPAC-IPS Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # -----------------------------------------------------------\n",
    "    # 1. 严格的数据对齐 (Data Alignment)\n",
    "    # -----------------------------------------------------------\n",
    "    # 使用与 Baseline 相同的种子生成随机数生成器\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # 生成打乱的索引\n",
    "    n = len(data_list)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # 关键修正：根据 indices 重排 data_list\n",
    "    # 注意：data_list 是 list of dicts，不能直接用 indices 索引，除非转 numpy\n",
    "    # 这里用列表推导式最稳妥\n",
    "    shuffled_data = [data_list[i] for i in indices]\n",
    "    \n",
    "    # -----------------------------------------------------------\n",
    "    # 2. 配置并运行 BPAC\n",
    "    # -----------------------------------------------------------\n",
    "    # 你的 BPACConfig\n",
    "    cfg = BPACConfigIPS(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "    )\n",
    "    \n",
    "    # 运行模拟 (传入打乱后的数据)\n",
    "    df_result, _ = run_simulation_ips(shuffled_data, cfg)\n",
    "    \n",
    "    bpacips_risks.append(df_result['avg_risk'].values)\n",
    "    bpacips_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpacips_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpacips_wealths.append(df_result['wealth'].values)\n",
    "    bpacips_thresholds.append(df_result['threshold'].values) \n",
    "\n",
    "# 转换为 Numpy 数组方便计算均值\n",
    "bpacips_risks_arr = np.array(bpacips_risks)       # Shape: (100, N)\n",
    "bpacips_token_ratios_arr = np.array(bpacips_token_ratios) # Shape: (100, N)\n",
    "bpacips_expert_ratios_arr = np.array(bpacips_expert_ratios) # Shape: (100, N)\n",
    "bpacips_wealths_arr = np.array(bpacips_wealths)   # Shape: (100, N)\n",
    "bpacips_thresholds_arr = np.array(bpacips_thresholds) # Shape: (100, N)\n",
    "print(\"BPAC-IPS Simulation Finished!\")\n",
    "bpacips_risks_arr.shape, bpacips_token_ratios_arr.shape, bpacips_expert_ratios_arr.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aca332b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "\n",
    "# ==========================================\n",
    "# 1. 统计与辅助函数\n",
    "# ==========================================\n",
    "SHADOW_TYPE = 'std'  # 可选: 'std', 'sem', 'ci', 'percentile'\n",
    "CALIB_NUM =0\n",
    "def get_error_bounds(data_array, type='sem',scale=1):\n",
    "    \"\"\"计算均值和阴影上下界\"\"\"\n",
    "    mean_val = np.mean(data_array, axis=0)\n",
    "    sem_val = np.std(data_array, axis=0)\n",
    "    lower = mean_val - sem_val\n",
    "    upper = mean_val + sem_val\n",
    "        \n",
    "    return mean_val*scale, lower*scale, upper*scale\n",
    "\n",
    "def clip_bounds(low, high, min_v=0, max_v=None):\n",
    "    \"\"\"物理截断 (例如比率不能小于0)\"\"\"\n",
    "    low = np.maximum(low, min_v)\n",
    "    if max_v is not None:\n",
    "        high = np.minimum(high, max_v)\n",
    "    return low, high\n",
    "\n",
    "# ==========================================\n",
    "# 2. 计算绘图数据\n",
    "# ==========================================\n",
    "# 假设 steps 与数组长度一致\n",
    "steps = np.arange(bpac_risks_arr.shape[1])\n",
    "\n",
    "# --- Risk (ER) ---\n",
    "naive_risk_m, naive_risk_l, naive_risk_h = get_error_bounds(bpaconaive_risks_arr, SHADOW_TYPE)\n",
    "ips_risk_m, ips_risk_l, ips_risk_h = get_error_bounds(bpacips_risks_arr, SHADOW_TYPE)\n",
    "bpac_risk_m, bpac_risk_l, bpac_risk_h = get_error_bounds(bpac_risks_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- ECP (Expert Call %) ---\n",
    "naive_ecp_m, naive_ecp_l, naive_ecp_h = get_error_bounds(bpaconaive_expert_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "ips_ecp_m, ips_ecp_l, ips_ecp_h = get_error_bounds(bpacips_expert_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "bpac_ecp_m, bpac_ecp_l, bpac_ecp_h = get_error_bounds(bpac_expert_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "\n",
    "# --- TCP (Token Cost %) ---\n",
    "naive_tcp_m, naive_tcp_l, naive_tcp_h = get_error_bounds(bpaconaive_token_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "ips_tcp_m, ips_tcp_l, ips_tcp_h = get_error_bounds(bpacips_token_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "bpac_tcp_m, bpac_tcp_l, bpac_tcp_h = get_error_bounds(bpac_token_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "\n",
    "# # 截断不合理的数值\n",
    "# naive_risk_l, naive_risk_h = clip_bounds(naive_risk_l, naive_risk_h, 0, 1.0)\n",
    "# ips_risk_l, ips_risk_h = clip_bounds(ips_risk_l, ips_risk_h, 0, 1.0)\n",
    "# bpac_risk_l, bpac_risk_h = clip_bounds(bpac_risk_l, bpac_risk_h, 0, 1.0)\n",
    "\n",
    "# naive_ecp_l, naive_ecp_h = clip_bounds(naive_ecp_l, naive_ecp_h, 0, 1.0)\n",
    "# ips_ecp_l, ips_ecp_h = clip_bounds(ips_ecp_l, ips_ecp_h, 0, 1.0)\n",
    "# bpac_ecp_l, bpac_ecp_h = clip_bounds(bpac_ecp_l, bpac_ecp_h, 0, 1.0)\n",
    "\n",
    "# naive_tcp_l, naive_tcp_h = clip_bounds(naive_tcp_l, naive_tcp_h, 0, None)\n",
    "# ips_tcp_l, ips_tcp_h = clip_bounds(ips_tcp_l, ips_tcp_h, 0, None)\n",
    "# bpac_tcp_l, bpac_tcp_h = clip_bounds(bpac_tcp_l, bpac_tcp_h, 0, None)\n",
    "\n",
    "# ==========================================\n",
    "# 3. 绘图 (ICML 风格 - 3 Methods)\n",
    "# ==========================================\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,\n",
    "    'axes.labelsize': 20,\n",
    "    'xtick.labelsize': 16,\n",
    "    'ytick.labelsize': 16,\n",
    "    'legend.fontsize': 16,\n",
    "    'lines.linewidth': 3.0,\n",
    "    'axes.grid': True, \n",
    "    'grid.alpha': 0.3, \n",
    "    'grid.linestyle': '--',\n",
    "    'figure.autolayout': True,\n",
    "    'savefig.dpi': 300\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "# 颜色定义\n",
    "C_NAIVE = '#1f77b4'  # Blue\n",
    "C_IPS = '#2ca02c'    # Green\n",
    "C_OURS = '#d62728'   # Red\n",
    "C_TGT = 'black'      # Tolerance Line\n",
    "C_WARM = 'gray'      # Warmup Line\n",
    "\n",
    "# --- (a) Risk ---\n",
    "ax = axes[0]\n",
    "# 1. Naive\n",
    "ax.plot(steps, naive_risk_m, color=C_NAIVE, linestyle='--', label='O-Naive')\n",
    "ax.fill_between(steps, naive_risk_l, naive_risk_h, color=C_NAIVE, alpha=0.3)\n",
    "# 2. IPS\n",
    "ax.plot(steps, ips_risk_m, color=C_IPS, linestyle='-.', label='IPS+Hoeff')\n",
    "ax.fill_between(steps, ips_risk_l, ips_risk_h, color=C_IPS, alpha=0.3)\n",
    "# 3. Ours\n",
    "ax.plot(steps, bpac_risk_m, color=C_OURS, linestyle='-', label='B-PAC (Ours)')\n",
    "ax.fill_between(steps, bpac_risk_l, bpac_risk_h, color=C_OURS, alpha=0.3)\n",
    "\n",
    "ax.axhline(y=TARGET_EPSILON, color=C_TGT, linestyle='--', linewidth=2, label='Tolerance')\n",
    "# ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel('ER', fontsize=22)\n",
    "ax.set_xlabel('Time Step')\n",
    "# 自动调整 ylim: 取三个方法最大值的 1.1 倍，或者至少展示到 Target 的 2 倍\n",
    "max_risk_show = max(TARGET_EPSILON * 1.5, np.max(naive_risk_m[-50:]), np.max(bpac_risk_m[-50:])) * 1.1\n",
    "ax.set_ylim(0, 0.4)\n",
    "ax.yaxis.set_major_locator(MultipleLocator(0.04))\n",
    "\n",
    "# 图例\n",
    "ax.legend(loc='best', frameon=True, framealpha=0.95, fontsize=14)\n",
    "\n",
    "# --- (b) ECP ---\n",
    "ax = axes[1]\n",
    "ax.plot(steps, naive_ecp_m, color=C_NAIVE, linestyle='--')\n",
    "ax.fill_between(steps, naive_ecp_l, naive_ecp_h, color=C_NAIVE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, ips_ecp_m, color=C_IPS, linestyle='-.')\n",
    "ax.fill_between(steps, ips_ecp_l, ips_ecp_h, color=C_IPS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_ecp_m, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_ecp_l, bpac_ecp_h, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "# ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "ax.set_ylabel('ECP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 100)\n",
    "\n",
    "# --- (c) TCP ---\n",
    "ax = axes[2]\n",
    "ax.plot(steps, naive_tcp_m, color=C_NAIVE, linestyle='--')\n",
    "ax.fill_between(steps, naive_tcp_l, naive_tcp_h, color=C_NAIVE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, ips_tcp_m, color=C_IPS, linestyle='-.')\n",
    "ax.fill_between(steps, ips_tcp_l, ips_tcp_h, color=C_IPS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_tcp_m, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_tcp_l, bpac_tcp_h, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=100, color='black', linestyle='-.', linewidth=1.5, label='Full Expert')\n",
    "# ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel('TP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('qwen2.5ins_bbh.pdf', bbox_inches='tight') # 如需保存请取消注释\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52211edb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm \n",
    "\n",
    "# 假设 data_list 已经准备好\n",
    "y_solved = [item['expert_correct'] for item in data_list]\n",
    "y_hat_solved = [item['instant_correct'] for item in data_list]\n",
    "uncertainty_values = [item['uncertainty'] for item in data_list]\n",
    "y_token_list = [item['expert_token'] for item in data_list]\n",
    "y_hat_token_list = [item['instant_token'] for item in data_list]\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100         # 重复次数\n",
    "TARGET_EPSILON = 0.05   # 目标 Risk\n",
    "TARGET_ALPHA = 0.05     # 置信度\n",
    "CALIB_NUM = 500        # 校准集大小\n",
    "TOTAL_NUM = len(y_solved)\n",
    "calib_ratio = CALIB_NUM / TOTAL_NUM\n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 1. 循环运行 Simulation\n",
    "# -------------------------------------------------------------------------\n",
    "\n",
    "# 用于存储每次实验的完整序列\n",
    "# 维度: (100, N_samples)\n",
    "all_risks = []\n",
    "all_token_ratios = []\n",
    "all_expert_ratios = []\n",
    "all_u_hats = []\n",
    "\n",
    "print(f\"Starting {NUM_RUNS} runs simulation...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # 每次使用不同的随机种子 (seed=0, 1, 2, ..., 99)\n",
    "    # 这样 run_baseline_continuous 内部的 shuffle 结果每次都不一样\n",
    "    df, u_hat = run_baseline_continuous(\n",
    "        y_solved, \n",
    "        y_hat_solved, \n",
    "        uncertainty_values, \n",
    "        y_token_list, \n",
    "        y_hat_token_list,\n",
    "        calib_ratio=calib_ratio, \n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=TARGET_ALPHA,\n",
    "        seed=seed\n",
    "    )\n",
    "    \n",
    "    # 记录整条曲线 (df['avg_risk'] 是一个 Series)\n",
    "    all_risks.append(df['avg_risk'].values)\n",
    "    all_token_ratios.append(df['token_ratio'].values)\n",
    "    all_expert_ratios.append(df['expert_call_ratio'].values)\n",
    "    all_u_hats.append(u_hat)\n",
    "\n",
    "# 转为 Numpy Array 方便计算均值方差\n",
    "# shape = (100, N)\n",
    "arr_risks = np.array(all_risks)\n",
    "arr_token_ratios = np.array(all_token_ratios)\n",
    "arr_expert_ratios = np.array(all_expert_ratios)\n",
    "arr_thresholds = np.array(all_u_hats)\n",
    "\n",
    "print(\"\\nSimulation Finished!\")\n",
    "print(f\"Average u_hat across runs: {np.mean(all_u_hats):.4f} (std: {np.std(all_u_hats):.4f})\")\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100        # 重复 100 次\n",
    "WARM_UP_STEPS = 0   # BPAC 的 Warm-up 步数 (对应 Baseline 的校准集大小)\n",
    "TARGET_EPSILON = 0.05  # 目标 Risk\n",
    "CFG_ALPHA= 0.05       # BPAC 参数\n",
    "CFG_RHO = 0.1         # BPAC 参数\n",
    "BPA_beta = 1\n",
    "\n",
    "bpac_risks = []\n",
    "bpac_token_ratios = []\n",
    "bpac_expert_ratios = []\n",
    "bpac_wealths = []\n",
    "bpac_thresholds = []\n",
    "\n",
    "print(f\"Starting BPAC Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # -----------------------------------------------------------\n",
    "    # 1. 严格的数据对齐 (Data Alignment)\n",
    "    # -----------------------------------------------------------\n",
    "    # 使用与 Baseline 相同的种子生成随机数生成器\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # 生成打乱的索引\n",
    "    n = len(data_list)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # 关键修正：根据 indices 重排 data_list\n",
    "    # 注意：data_list 是 list of dicts，不能直接用 indices 索引，除非转 numpy\n",
    "    # 这里用列表推导式最稳妥\n",
    "    shuffled_data = [data_list[i] for i in indices]\n",
    "    \n",
    "    # -----------------------------------------------------------\n",
    "    # 2. 配置并运行 BPAC\n",
    "    # -----------------------------------------------------------\n",
    "    # 你的 BPACConfig\n",
    "    cfg = BPACConfig(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "        beta=BPA_beta, \n",
    "        c_clip=0.9\n",
    "    )\n",
    "    \n",
    "    # 运行模拟 (传入打乱后的数据)\n",
    "    df_result, _ = run_simulation(shuffled_data, cfg)\n",
    "    \n",
    "    bpac_risks.append(df_result['avg_risk'].values)\n",
    "    bpac_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpac_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpac_wealths.append(df_result['wealth'].values)\n",
    "    bpac_thresholds.append(df_result['threshold'].values)\n",
    "\n",
    "\n",
    "# 转换为 Numpy 数组方便计算均值\n",
    "bpac_risks_arr = np.array(bpac_risks)       # Shape: (100, N)\n",
    "bpac_token_ratios_arr = np.array(bpac_token_ratios) # Shape: (100, N)\n",
    "bpac_expert_ratios_arr = np.array(bpac_expert_ratios) # Shape: (100, N)\n",
    "bpac_wealths_arr = np.array(bpac_wealths)   # Shape: (100, N)\n",
    "bpac_thresholds_arr = np.array(bpac_thresholds) # Shape: (100, N)\n",
    "print(\"BPAC Simulation Finished!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c2d1a02",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Baseline 数据\n",
    "base_mean_risk = np.mean(arr_risks, axis=0)\n",
    "base_std_risk = np.std(arr_risks, axis=0)\n",
    "\n",
    "base_mean_token = np.mean(arr_token_ratios, axis=0)\n",
    "base_std_token = np.std(arr_token_ratios, axis=0)\n",
    "\n",
    "base_mean_expert = np.mean(arr_expert_ratios, axis=0)\n",
    "base_std_expert = np.std(arr_expert_ratios, axis=0)\n",
    "\n",
    "# 获取总时间步数\n",
    "N_STEPS = bpac_risks_arr.shape[1] \n",
    "\n",
    "# A. 重构 Baseline Threshold 序列 (因为你只存了最终的一个 u_hat)\n",
    "# 逻辑：在校准期 (前 CALIB_NUM 步) 阈值设为 0，之后设为 u_hat\n",
    "# arr_thresholds 是 (100,) 的标量数组\n",
    "base_threshold_seqs = np.zeros((NUM_RUNS, N_STEPS))\n",
    "for i in range(NUM_RUNS):\n",
    "    base_threshold_seqs[i, :CALIB_NUM] = 0.0  # 校准期阈值通常视为0 (强制Action=1)\n",
    "    base_threshold_seqs[i, CALIB_NUM:] = arr_thresholds[i] # 测试期应用 u_hat\n",
    "\n",
    "base_mean_threshold_seq = np.mean(base_threshold_seqs, axis=0)\n",
    "base_std_threshold_seq = np.std(base_threshold_seqs, axis=0)\n",
    "\n",
    "\n",
    "# BPAC 数据\n",
    "bpac_mean_risk = np.mean(bpac_risks_arr, axis=0)\n",
    "bpac_std_risk = np.std(bpac_risks_arr, axis=0)\n",
    "\n",
    "bpac_mean_token = np.mean(bpac_token_ratios_arr, axis=0)\n",
    "bpac_std_token = np.std(bpac_token_ratios_arr, axis=0)\n",
    "\n",
    "bpac_mean_expert = np.mean(bpac_expert_ratios_arr, axis=0)\n",
    "bpac_std_expert = np.std(bpac_expert_ratios_arr, axis=0)\n",
    "\n",
    "# C. 确保其他 BPAC 统计量已准备好 (以防万一)\n",
    "bpac_mean_threshold = np.mean(bpac_thresholds_arr, axis=0)\n",
    "bpac_std_threshold = np.std(bpac_thresholds_arr, axis=0)\n",
    "\n",
    "# B. 计算 BPAC Wealth 统计 (你存了 bpac_wealths_arr 但还没算均值)\n",
    "bpac_mean_wealth = np.mean(bpac_wealths_arr, axis=0)\n",
    "bpac_std_wealth = np.std(bpac_wealths_arr, axis=0)\n",
    "\n",
    "\n",
    "fig, axes = plt.subplots(5, 1, figsize=(12, 24), sharex=True)\n",
    "(ax1, ax2, ax3, ax4, ax5) = axes\n",
    "\n",
    "# X轴\n",
    "steps = np.arange(N_STEPS)\n",
    "\n",
    "# === 子图 1: Average Risk (风险控制) ===\n",
    "ax1.plot(steps, base_mean_risk, label='Baseline', color='blue', linestyle='--')\n",
    "ax1.fill_between(steps, base_mean_risk - base_std_risk, base_mean_risk + base_std_risk, color='blue', alpha=0.1)\n",
    "\n",
    "ax1.plot(steps, bpac_mean_risk, label='BPAC (Ours)', color='red', linewidth=2)\n",
    "ax1.fill_between(steps, bpac_mean_risk - bpac_std_risk, bpac_mean_risk + bpac_std_risk, color='red', alpha=0.1)\n",
    "\n",
    "ax1.axhline(y=TARGET_EPSILON, color='green', linestyle='-', linewidth=2, label=f'Target $\\epsilon={TARGET_EPSILON}$')\n",
    "ax1.axvline(x=CALIB_NUM, color='gray', linestyle=':', label='Calibration End')\n",
    "\n",
    "ax1.set_ylabel('Avg Risk')\n",
    "ax1.set_title(f'1. Risk Control (Target $\\epsilon < {TARGET_EPSILON}$)')\n",
    "ax1.legend(loc='upper right')\n",
    "ax1.grid(True, alpha=0.3)\n",
    "ax1.set_ylim(0, TARGET_EPSILON * 3.0) # 聚焦在 Epsilon 附近\n",
    "\n",
    "# === 子图 2: Token Ratio (成本节省) ===\n",
    "ax2.plot(steps, base_mean_token, label='Baseline', color='blue', linestyle='--')\n",
    "ax2.fill_between(steps, base_mean_token - base_std_token, base_mean_token + base_std_token, color='blue', alpha=0.1)\n",
    "\n",
    "ax2.plot(steps, bpac_mean_token, label='BPAC', color='red', linewidth=2)\n",
    "ax2.fill_between(steps, bpac_mean_token - bpac_std_token, bpac_mean_token + bpac_std_token, color='red', alpha=0.1)\n",
    "\n",
    "ax2.axhline(y=1.0, color='black', linestyle=':')\n",
    "ax2.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax2.set_ylabel('Token Ratio')\n",
    "ax2.set_title('2. Cost Efficiency (Lower is Better)')\n",
    "ax2.grid(True, alpha=0.3)\n",
    "\n",
    "# === 子图 3: Expert Call Ratio (专家调用率) ===\n",
    "ax3.plot(steps, base_mean_expert, label='Baseline', color='blue', linestyle='--')\n",
    "ax3.fill_between(steps, base_mean_expert - base_std_expert, base_mean_expert + base_std_expert, color='blue', alpha=0.1)\n",
    "\n",
    "ax3.plot(steps, bpac_mean_expert, label='BPAC', color='red', linewidth=2)\n",
    "ax3.fill_between(steps, bpac_mean_expert - bpac_std_expert, bpac_mean_expert + bpac_std_expert, color='red', alpha=0.1)\n",
    "\n",
    "ax3.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax3.set_ylabel('Expert Call Ratio')\n",
    "ax3.set_title('3. Expert Usage Behavior')\n",
    "ax3.grid(True, alpha=0.3)\n",
    "ax3.set_ylim(0, 1.1)\n",
    "\n",
    "# === 子图 4: Threshold Evolution (阈值变化) ===\n",
    "# Baseline: 前期是0，后期是固定值\n",
    "ax4.plot(steps, base_mean_threshold_seq, label='Baseline $\\hat{u}$', color='blue', linestyle='--')\n",
    "ax4.fill_between(steps, \n",
    "                 base_mean_threshold_seq - base_std_threshold_seq, \n",
    "                 base_mean_threshold_seq + base_std_threshold_seq, \n",
    "                 color='blue', alpha=0.1)\n",
    "\n",
    "# BPAC: 动态变化\n",
    "ax4.plot(steps, bpac_mean_threshold, label='BPAC $u_t$', color='red', linewidth=1.5)\n",
    "ax4.fill_between(steps, \n",
    "                 bpac_mean_threshold - bpac_std_threshold, \n",
    "                 bpac_mean_threshold + bpac_std_threshold, \n",
    "                 color='red', alpha=0.1)\n",
    "\n",
    "ax4.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax4.set_ylabel('Uncertainty Threshold')\n",
    "ax4.set_title('4. Threshold Adaptation (Dynamic vs Fixed)')\n",
    "ax4.legend(loc='upper right')\n",
    "ax4.grid(True, alpha=0.3)\n",
    "ax4.set_ylim(0, 1.0) \n",
    "\n",
    "# === 子图 5: Wealth Evolution (财富积累) ===\n",
    "# 只有 BPAC 有 Wealth\n",
    "ax5.plot(steps, bpac_mean_wealth, label='BPAC Wealth', color='purple', linewidth=1.5)\n",
    "ax5.fill_between(steps, \n",
    "                 bpac_mean_wealth - bpac_std_wealth, \n",
    "                 bpac_mean_wealth + bpac_std_wealth, \n",
    "                 color='purple', alpha=0.1)\n",
    "\n",
    "ax5.axhline(y=1.0, color='black', linestyle='--', label='Initial Wealth (1.0)')\n",
    "ax5.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "# 如果财富增长非常快，建议开启对数坐标\n",
    "# ax5.set_yscale('log') \n",
    "\n",
    "ax5.set_ylabel('Wealth')\n",
    "ax5.set_xlabel('Time Step')\n",
    "ax5.set_title('5. BPAC Wealth Accumulation')\n",
    "ax5.legend(loc='upper left')\n",
    "ax5.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3c8b9ce9",
   "metadata": {},
   "source": [
    "#### paper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0bd2b7bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "\n",
    "# ==========================================\n",
    "# 1. 设置阴影类型 (ICML 关键设置)\n",
    "# ==========================================\n",
    "# 选项: \n",
    "# 'std': 标准差 (Mean ± Std) -> 阴影最大，容易超线\n",
    "# 'sem': 标准误 (Mean ± Std/sqrt(N)) -> 阴影更窄，聚焦于均值的准确性 (推荐!)\n",
    "# 'ci':  95% 置信区间 (Bootstrap/t-distribution) -> 严谨的统计学区间\n",
    "# 'percentile': 分位数 (10% - 90%) -> 展示数据分布的真实范围\n",
    "\n",
    "SHADOW_TYPE = 'sem' \n",
    "NUM_RUNS = 100  # 你的实验次数\n",
    "\n",
    "def get_error_bounds(data_array, type='sem'):\n",
    "    \"\"\"根据选择的类型计算上下界\"\"\"\n",
    "    mean_val = np.mean(data_array, axis=0)\n",
    "    \n",
    "    if type == 'std':\n",
    "        std_val = np.std(data_array, axis=0)\n",
    "        lower = mean_val - std_val\n",
    "        upper = mean_val + std_val\n",
    "        \n",
    "    elif type == 'sem':\n",
    "        # 标准误 = Std / sqrt(N)\n",
    "        sem_val = stats.sem(data_array, axis=0)\n",
    "        lower = mean_val - sem_val\n",
    "        upper = mean_val + sem_val\n",
    "        \n",
    "    elif type == 'ci':\n",
    "        # 95% t-distribution 置信区间\n",
    "        sem_val = stats.sem(data_array, axis=0)\n",
    "        # ppf(0.975) 对应双侧 95%\n",
    "        ci_scale = stats.t.ppf(0.975, df=NUM_RUNS-1) \n",
    "        lower = mean_val - sem_val * ci_scale\n",
    "        upper = mean_val + sem_val * ci_scale\n",
    "        \n",
    "    elif type == 'percentile':\n",
    "        # 10% - 90% 分位数 (剔除最极端的异常值)\n",
    "        lower = np.percentile(data_array, 10, axis=0)\n",
    "        upper = np.percentile(data_array, 90, axis=0)\n",
    "        \n",
    "    else:\n",
    "        raise ValueError(\"Unknown shadow type\")\n",
    "        \n",
    "    return mean_val, lower, upper\n",
    "\n",
    "# ==========================================\n",
    "# 2. 计算绘图数据\n",
    "# ==========================================\n",
    "steps = np.arange(bpac_risks_arr.shape[1])\n",
    "\n",
    "# --- 计算 Risk ---\n",
    "base_mean, base_low, base_high = get_error_bounds(arr_risks, SHADOW_TYPE)\n",
    "bpac_mean, bpac_low, bpac_high = get_error_bounds(bpac_risks_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 计算 ECP ---\n",
    "base_ecp_mean, base_ecp_low, base_ecp_high = get_error_bounds(arr_expert_ratios, SHADOW_TYPE)\n",
    "bpac_ecp_mean, bpac_ecp_low, bpac_ecp_high = get_error_bounds(bpac_expert_ratios_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 计算 TCP ---\n",
    "base_tcp_mean, base_tcp_low, base_tcp_high = get_error_bounds(arr_token_ratios, SHADOW_TYPE)\n",
    "bpac_tcp_mean, bpac_tcp_low, bpac_tcp_high = get_error_bounds(bpac_token_ratios_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 物理截断 (Clipping) ---\n",
    "# 无论用什么统计方法，Risk 和 Ratio 都不可能小于 0\n",
    "def clip_bounds(low, high, min_v=0, max_v=None):\n",
    "    low = np.maximum(low, min_v)\n",
    "    if max_v is not None:\n",
    "        high = np.minimum(high, max_v)\n",
    "    return low, high\n",
    "\n",
    "base_low, base_high = clip_bounds(base_low, base_high, 0, 1.0) # Risk 通常<1，也可不设上限\n",
    "bpac_low, bpac_high = clip_bounds(bpac_low, bpac_high, 0, 1.0)\n",
    "\n",
    "base_ecp_low, base_ecp_high = clip_bounds(base_ecp_low, base_ecp_high, 0, 1.0)\n",
    "bpac_ecp_low, bpac_ecp_high = clip_bounds(bpac_ecp_low, bpac_ecp_high, 0, 1.0)\n",
    "\n",
    "base_tcp_low, base_tcp_high = clip_bounds(base_tcp_low, base_tcp_high, 0, None)\n",
    "bpac_tcp_low, bpac_tcp_high = clip_bounds(bpac_tcp_low, bpac_tcp_high, 0, None)\n",
    "\n",
    "\n",
    "# ==========================================\n",
    "# 3. 绘图 (ICML Style)\n",
    "# ==========================================\n",
    "# 设置字体和线宽\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,\n",
    "    'axes.labelsize': 16, 'axes.titlesize': 16,\n",
    "    'xtick.labelsize': 14, 'ytick.labelsize': 14,\n",
    "    'legend.fontsize': 13, 'lines.linewidth': 2.5,\n",
    "    'axes.grid': True, 'grid.alpha': 0.3, 'grid.linestyle': '--'\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "# 颜色\n",
    "C_BASE = '#1f77b4'  # Blue\n",
    "C_OURS = '#d62728'  # Red\n",
    "C_TGT = '#2ca02c'   # Green\n",
    "C_WARM = 'gray'\n",
    "\n",
    "# --- (a) Risk ---\n",
    "ax = axes[0]\n",
    "ax.plot(steps, base_mean, color=C_BASE, linestyle='--', label='PAC (Baseline)')\n",
    "ax.fill_between(steps, base_low, base_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_mean, color=C_OURS, linestyle='-', label='B-PAC (Ours)')\n",
    "ax.fill_between(steps, bpac_low, bpac_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=TARGET_EPSILON, color=C_TGT, linestyle='-', linewidth=2, label=r'Target $\\epsilon$')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "# ax.set_title('(a) Cumulative Risk')\n",
    "ax.set_ylabel('ER')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, TARGET_EPSILON * 2.0) # 视野聚焦\n",
    "# 图例中说明阴影含义\n",
    "if SHADOW_TYPE == 'sem':\n",
    "    shadow_label = 'Shaded: SEM'\n",
    "elif SHADOW_TYPE == 'std':\n",
    "    shadow_label = 'Shaded: Std. Dev.'\n",
    "elif SHADOW_TYPE == 'ci':\n",
    "    shadow_label = 'Shaded: 95% CI'\n",
    "else:\n",
    "    shadow_label = 'Shaded: 10-90% Pctl'\n",
    "    \n",
    "# 创建一个空的 handle 来显示阴影说明\n",
    "from matplotlib.lines import Line2D\n",
    "handles, labels = ax.get_legend_handles_labels()\n",
    "handles.append(Line2D([0], [0], color='gray', alpha=0.3, linewidth=10))\n",
    "# labels.append(shadow_label)\n",
    "ax.legend(handles, labels, loc='upper right', frameon=True, framealpha=0.9)\n",
    "\n",
    "\n",
    "# --- (b) ECP ---\n",
    "ax = axes[1]\n",
    "ax.plot(steps, base_ecp_mean, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_ecp_low, base_ecp_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_ecp_mean, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_ecp_low, bpac_ecp_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "# ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2, label='Warm-up End')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "# ax.set_title('(b) Expert Call Rate (ECP)')\n",
    "ax.set_ylabel('ECP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 1.05)\n",
    "ax.legend(loc='upper right')\n",
    "\n",
    "# --- (c) TCP ---\n",
    "ax = axes[2]\n",
    "ax.plot(steps, base_tcp_mean, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_tcp_low, base_tcp_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_tcp_mean, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_tcp_low, bpac_tcp_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=1.0, color='black', linestyle='-.', linewidth=1.5, label='Full Expert')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "# ax.set_title('(c) Token Cost Ratio (TCP)')\n",
    "ax.set_ylabel('TCP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('icml_final_plot.pdf', bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f8bfc999",
   "metadata": {},
   "source": [
    "## gemma3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0742ada",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "expert_data3 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-think.json\")\n",
    "expert_data = expert_data3[expert_data3[\"matched\"] == True]\n",
    "session_ids = list(expert_data[\"session_id\"])\n",
    "\n",
    "instant_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/gemma3-ins.json\")\n",
    "instant_data = instant_data1[instant_data1[\"session_id\"].isin(session_ids)]\n",
    "len(list(set(instant_data[\"session_id\"]))),len(list(set(expert_data[\"session_id\"])))\n",
    "instant_data.head(1)\n",
    "instant_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e10da32c",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "data_list =[]\n",
    "selcet_ids =[]\n",
    "for i,row in expert_data.iterrows():\n",
    "    session_id = row['session_id']\n",
    "    if session_id not in instant_data['session_id'].values:\n",
    "        print(f\"Instant data missing at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict = {}\n",
    "    instant_row = instant_data[instant_data['session_id'] == session_id].iloc[0]\n",
    "    selcet_ids.append(session_id)\n",
    "    if instant_row[\"question\"] != row[\"question\"]:\n",
    "        print(f\"Question mismatch at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict['uncertainty'] = 1- instant_row['token_probs'][0]\n",
    "    tmp_dict['instant_correct'] = int(instant_row['matched']) if instant_row['matched'] != \"No answer extracted\" else 0\n",
    "    tmp_dict['expert_correct'] = int(expert_data.loc[i, 'matched'])\n",
    "    tmp_dict['instant_token'] = instant_row['gen_token_count']\n",
    "    tmp_dict['expert_token'] = expert_data.loc[i, 'gen_token_count']\n",
    "    data_list.append(tmp_dict)\n",
    "\n",
    "print(f\"Total samples prepared: {len(data_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "958d7160",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "# =========================\n",
    "# 1. 读取与筛选数据\n",
    "# =========================\n",
    "data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/gemma3-ins.json\")\n",
    "data2 = data1[data1[\"session_id\"].isin(selcet_ids)]\n",
    "df = data2.copy()\n",
    "df[\"token_prob_scalar\"] = df[\"token_probs\"].apply(\n",
    "    lambda x: x[0] if x else None\n",
    ")\n",
    "\n",
    "\n",
    "# =========================\n",
    "# 2. 按 matched 分组\n",
    "# =========================\n",
    "tp_true = df.loc[df[\"matched\"] == True, \"token_prob_scalar\"].dropna()\n",
    "tp_false = df.loc[df[\"matched\"] == False, \"token_prob_scalar\"].dropna()\n",
    "\n",
    "# =========================\n",
    "# 3. 置信度分布直方图\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.hist(tp_true, bins=30, alpha=0.6, label=\"matched=True\")\n",
    "plt.hist(tp_false, bins=30, alpha=0.6, label=\"matched=False\")\n",
    "plt.xlabel(\"token_prob_scalar\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.title(\"Token Probability Distribution by Matched\")\n",
    "plt.legend()\n",
    "plt.show()\n",
    "\n",
    "# =========================\n",
    "# 4. Boxplot\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.boxplot([tp_true, tp_false], labels=[\"Matched=True\", \"Matched=False\"])\n",
    "plt.ylabel(\"token_prob_scalar\")\n",
    "plt.title(\"Token Probability by Matched\")\n",
    "plt.show()\n",
    "\n",
    "\n",
    "# =====================================================================\n",
    "# 第二部分：Instant / Expert 不确定性分析\n",
    "# =====================================================================\n",
    "\n",
    "# data_list: list[dict]\n",
    "instant_tokens = [item[\"instant_token\"] for item in data_list]\n",
    "expert_tokens = [item[\"expert_token\"] for item in data_list]\n",
    "\n",
    "uncertainty_scores = [item[\"uncertainty\"] for item in data_list]\n",
    "correct_instant = [item[\"instant_correct\"] for item in data_list]\n",
    "correct_expert = [item[\"expert_correct\"] for item in data_list]\n",
    "\n",
    "# =========================\n",
    "# 5. 准确率\n",
    "# =========================\n",
    "accuracy_instant = sum(correct_instant) / len(correct_instant)\n",
    "accuracy_expert = sum(correct_expert) / len(correct_expert)\n",
    "\n",
    "print(f\"Instant Model Accuracy: {accuracy_instant:.4f}\")\n",
    "print(f\"Expert Model Accuracy:  {accuracy_expert:.4f}\")\n",
    "\n",
    "expert_tokens = [item['expert_token'] for item in data_list]\n",
    "instant_tokens = [item['instant_token'] for item in data_list]\n",
    "mean_expert_tokens = np.mean(expert_tokens)\n",
    "mean_instant_tokens = np.mean(instant_tokens)\n",
    "print(f\"Mean Expert Tokens: {mean_expert_tokens}, Mean Instant Tokens: {mean_instant_tokens}\")\n",
    "\n",
    "\n",
    "# =========================\n",
    "# 6. Uncertainty 直方图\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.hist(uncertainty_scores, bins=50)\n",
    "plt.xlabel(\"Uncertainty\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.title(\"Uncertainty Distribution\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b50014b8",
   "metadata": {},
   "source": [
    "### compare to pac"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9772044",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm \n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 0. 准备全量数据 (只做一次)\n",
    "# -------------------------------------------------------------------------\n",
    "# 假设 data_list 已经准备好\n",
    "y_solved = [item['expert_correct'] for item in data_list]\n",
    "y_hat_solved = [item['instant_correct'] for item in data_list]\n",
    "uncertainty_values = [item['uncertainty'] for item in data_list]\n",
    "y_token_list = [item['expert_token'] for item in data_list]\n",
    "y_hat_token_list = [item['instant_token'] for item in data_list]\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100         # 重复次数\n",
    "TARGET_EPSILON = 0.05   # 目标 Risk\n",
    "TARGET_ALPHA = 0.05     # 置信度\n",
    "CALIB_NUM = 500        # 校准集大小\n",
    "TOTAL_NUM = len(y_solved)\n",
    "calib_ratio = CALIB_NUM / TOTAL_NUM\n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 1. 循环运行 Simulation\n",
    "# -------------------------------------------------------------------------\n",
    "\n",
    "# 用于存储每次实验的完整序列\n",
    "# 维度: (100, N_samples)\n",
    "all_risks = []\n",
    "all_token_ratios = []\n",
    "all_expert_ratios = []\n",
    "all_u_hats = []\n",
    "\n",
    "print(f\"Starting {NUM_RUNS} runs simulation...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # 每次使用不同的随机种子 (seed=0, 1, 2, ..., 99)\n",
    "    # 这样 run_baseline_continuous 内部的 shuffle 结果每次都不一样\n",
    "    df, u_hat = run_baseline_continuous(\n",
    "        y_solved, \n",
    "        y_hat_solved, \n",
    "        uncertainty_values, \n",
    "        y_token_list, \n",
    "        y_hat_token_list,\n",
    "        calib_ratio=calib_ratio, \n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=TARGET_ALPHA,\n",
    "        seed=seed\n",
    "    )\n",
    "    \n",
    "    # 记录整条曲线 (df['avg_risk'] 是一个 Series)\n",
    "    all_risks.append(df['avg_risk'].values)\n",
    "    all_token_ratios.append(df['token_ratio'].values)\n",
    "    all_expert_ratios.append(df['expert_call_ratio'].values)\n",
    "    all_u_hats.append(u_hat)\n",
    "\n",
    "# 转为 Numpy Array 方便计算均值方差\n",
    "# shape = (100, N)\n",
    "arr_risks = np.array(all_risks)\n",
    "arr_token_ratios = np.array(all_token_ratios)\n",
    "arr_expert_ratios = np.array(all_expert_ratios)\n",
    "arr_thresholds = np.array(all_u_hats)\n",
    "\n",
    "print(\"\\nSimulation Finished!\")\n",
    "print(f\"Average u_hat across runs: {np.mean(all_u_hats):.4f} (std: {np.std(all_u_hats):.4f})\")\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100        # 重复 100 次\n",
    "WARM_UP_STEPS = 0   # BPAC 的 Warm-up 步数 (对应 Baseline 的校准集大小)\n",
    "TARGET_EPSILON = 0.05  # 目标 Risk\n",
    "CFG_ALPHA= 0.05       # BPAC 参数\n",
    "CFG_RHO = 0.1         # BPAC 参数\n",
    "BPA_beta = 1\n",
    "\n",
    "bpac_risks = []\n",
    "bpac_token_ratios = []\n",
    "bpac_expert_ratios = []\n",
    "bpac_wealths = []\n",
    "bpac_thresholds = []\n",
    "\n",
    "print(f\"Starting BPAC Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # -----------------------------------------------------------\n",
    "    # 1. 严格的数据对齐 (Data Alignment)\n",
    "    # -----------------------------------------------------------\n",
    "    # 使用与 Baseline 相同的种子生成随机数生成器\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # 生成打乱的索引\n",
    "    n = len(data_list)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # 关键修正：根据 indices 重排 data_list\n",
    "    # 注意：data_list 是 list of dicts，不能直接用 indices 索引，除非转 numpy\n",
    "    # 这里用列表推导式最稳妥\n",
    "    shuffled_data = [data_list[i] for i in indices]\n",
    "    \n",
    "    # -----------------------------------------------------------\n",
    "    # 2. 配置并运行 BPAC\n",
    "    # -----------------------------------------------------------\n",
    "    # 你的 BPACConfig\n",
    "    cfg = BPACConfig(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "        beta=BPA_beta, \n",
    "        c_clip=0.9\n",
    "    )\n",
    "    \n",
    "    # 运行模拟 (传入打乱后的数据)\n",
    "    df_result, _ = run_simulation(shuffled_data, cfg)\n",
    "    \n",
    "    bpac_risks.append(df_result['avg_risk'].values)\n",
    "    bpac_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpac_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpac_wealths.append(df_result['wealth'].values)\n",
    "    bpac_thresholds.append(df_result['threshold'].values)\n",
    "\n",
    "\n",
    "# 转换为 Numpy 数组方便计算均值\n",
    "bpac_risks_arr = np.array(bpac_risks)       # Shape: (100, N)\n",
    "bpac_token_ratios_arr = np.array(bpac_token_ratios) # Shape: (100, N)\n",
    "bpac_expert_ratios_arr = np.array(bpac_expert_ratios) # Shape: (100, N)\n",
    "bpac_wealths_arr = np.array(bpac_wealths)   # Shape: (100, N)\n",
    "bpac_thresholds_arr = np.array(bpac_thresholds) # Shape: (100, N)\n",
    "print(\"BPAC Simulation Finished!\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40f5065b",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Baseline 数据\n",
    "base_mean_risk = np.mean(arr_risks, axis=0)\n",
    "base_std_risk = np.std(arr_risks, axis=0)\n",
    "\n",
    "base_mean_token = np.mean(arr_token_ratios, axis=0)\n",
    "base_std_token = np.std(arr_token_ratios, axis=0)\n",
    "\n",
    "base_mean_expert = np.mean(arr_expert_ratios, axis=0)\n",
    "base_std_expert = np.std(arr_expert_ratios, axis=0)\n",
    "\n",
    "# 获取总时间步数\n",
    "N_STEPS = bpac_risks_arr.shape[1] \n",
    "\n",
    "# A. 重构 Baseline Threshold 序列 (因为你只存了最终的一个 u_hat)\n",
    "# 逻辑：在校准期 (前 CALIB_NUM 步) 阈值设为 0，之后设为 u_hat\n",
    "# arr_thresholds 是 (100,) 的标量数组\n",
    "base_threshold_seqs = np.zeros((NUM_RUNS, N_STEPS))\n",
    "for i in range(NUM_RUNS):\n",
    "    base_threshold_seqs[i, :CALIB_NUM] = 0.0  # 校准期阈值通常视为0 (强制Action=1)\n",
    "    base_threshold_seqs[i, CALIB_NUM:] = arr_thresholds[i] # 测试期应用 u_hat\n",
    "\n",
    "base_mean_threshold_seq = np.mean(base_threshold_seqs, axis=0)\n",
    "base_std_threshold_seq = np.std(base_threshold_seqs, axis=0)\n",
    "\n",
    "\n",
    "# BPAC 数据\n",
    "bpac_mean_risk = np.mean(bpac_risks_arr, axis=0)\n",
    "bpac_std_risk = np.std(bpac_risks_arr, axis=0)\n",
    "\n",
    "bpac_mean_token = np.mean(bpac_token_ratios_arr, axis=0)\n",
    "bpac_std_token = np.std(bpac_token_ratios_arr, axis=0)\n",
    "\n",
    "bpac_mean_expert = np.mean(bpac_expert_ratios_arr, axis=0)\n",
    "bpac_std_expert = np.std(bpac_expert_ratios_arr, axis=0)\n",
    "\n",
    "# C. 确保其他 BPAC 统计量已准备好 (以防万一)\n",
    "bpac_mean_threshold = np.mean(bpac_thresholds_arr, axis=0)\n",
    "bpac_std_threshold = np.std(bpac_thresholds_arr, axis=0)\n",
    "\n",
    "# B. 计算 BPAC Wealth 统计 (你存了 bpac_wealths_arr 但还没算均值)\n",
    "bpac_mean_wealth = np.mean(bpac_wealths_arr, axis=0)\n",
    "bpac_std_wealth = np.std(bpac_wealths_arr, axis=0)\n",
    "\n",
    "\n",
    "fig, axes = plt.subplots(5, 1, figsize=(12, 24), sharex=True)\n",
    "(ax1, ax2, ax3, ax4, ax5) = axes\n",
    "\n",
    "# X轴\n",
    "steps = np.arange(N_STEPS)\n",
    "\n",
    "# === 子图 1: Average Risk (风险控制) ===\n",
    "ax1.plot(steps, base_mean_risk, label='Baseline', color='blue', linestyle='--')\n",
    "ax1.fill_between(steps, base_mean_risk - base_std_risk, base_mean_risk + base_std_risk, color='blue', alpha=0.1)\n",
    "\n",
    "ax1.plot(steps, bpac_mean_risk, label='BPAC (Ours)', color='red', linewidth=2)\n",
    "ax1.fill_between(steps, bpac_mean_risk - bpac_std_risk, bpac_mean_risk + bpac_std_risk, color='red', alpha=0.1)\n",
    "\n",
    "ax1.axhline(y=TARGET_EPSILON, color='green', linestyle='-', linewidth=2, label=f'Target $\\epsilon={TARGET_EPSILON}$')\n",
    "ax1.axvline(x=CALIB_NUM, color='gray', linestyle=':', label='Calibration End')\n",
    "\n",
    "ax1.set_ylabel('Avg Risk')\n",
    "ax1.set_title(f'1. Risk Control (Target $\\epsilon < {TARGET_EPSILON}$)')\n",
    "ax1.legend(loc='upper right')\n",
    "ax1.grid(True, alpha=0.3)\n",
    "ax1.set_ylim(0, TARGET_EPSILON * 3.0) # 聚焦在 Epsilon 附近\n",
    "\n",
    "# === 子图 2: Token Ratio (成本节省) ===\n",
    "ax2.plot(steps, base_mean_token, label='Baseline', color='blue', linestyle='--')\n",
    "ax2.fill_between(steps, base_mean_token - base_std_token, base_mean_token + base_std_token, color='blue', alpha=0.1)\n",
    "\n",
    "ax2.plot(steps, bpac_mean_token, label='BPAC', color='red', linewidth=2)\n",
    "ax2.fill_between(steps, bpac_mean_token - bpac_std_token, bpac_mean_token + bpac_std_token, color='red', alpha=0.1)\n",
    "\n",
    "ax2.axhline(y=1.0, color='black', linestyle=':')\n",
    "ax2.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax2.set_ylabel('Token Ratio')\n",
    "ax2.set_title('2. Cost Efficiency (Lower is Better)')\n",
    "ax2.grid(True, alpha=0.3)\n",
    "\n",
    "# === 子图 3: Expert Call Ratio (专家调用率) ===\n",
    "ax3.plot(steps, base_mean_expert, label='Baseline', color='blue', linestyle='--')\n",
    "ax3.fill_between(steps, base_mean_expert - base_std_expert, base_mean_expert + base_std_expert, color='blue', alpha=0.1)\n",
    "\n",
    "ax3.plot(steps, bpac_mean_expert, label='BPAC', color='red', linewidth=2)\n",
    "ax3.fill_between(steps, bpac_mean_expert - bpac_std_expert, bpac_mean_expert + bpac_std_expert, color='red', alpha=0.1)\n",
    "\n",
    "ax3.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax3.set_ylabel('Expert Call Ratio')\n",
    "ax3.set_title('3. Expert Usage Behavior')\n",
    "ax3.grid(True, alpha=0.3)\n",
    "ax3.set_ylim(0, 1.1)\n",
    "\n",
    "# === 子图 4: Threshold Evolution (阈值变化) ===\n",
    "# Baseline: 前期是0，后期是固定值\n",
    "ax4.plot(steps, base_mean_threshold_seq, label='Baseline $\\hat{u}$', color='blue', linestyle='--')\n",
    "ax4.fill_between(steps, \n",
    "                 base_mean_threshold_seq - base_std_threshold_seq, \n",
    "                 base_mean_threshold_seq + base_std_threshold_seq, \n",
    "                 color='blue', alpha=0.1)\n",
    "\n",
    "# BPAC: 动态变化\n",
    "ax4.plot(steps, bpac_mean_threshold, label='BPAC $u_t$', color='red', linewidth=1.5)\n",
    "ax4.fill_between(steps, \n",
    "                 bpac_mean_threshold - bpac_std_threshold, \n",
    "                 bpac_mean_threshold + bpac_std_threshold, \n",
    "                 color='red', alpha=0.1)\n",
    "\n",
    "ax4.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax4.set_ylabel('Uncertainty Threshold')\n",
    "ax4.set_title('4. Threshold Adaptation (Dynamic vs Fixed)')\n",
    "ax4.legend(loc='upper right')\n",
    "ax4.grid(True, alpha=0.3)\n",
    "ax4.set_ylim(0, 1.0) \n",
    "\n",
    "# === 子图 5: Wealth Evolution (财富积累) ===\n",
    "# 只有 BPAC 有 Wealth\n",
    "ax5.plot(steps, bpac_mean_wealth, label='BPAC Wealth', color='purple', linewidth=1.5)\n",
    "ax5.fill_between(steps, \n",
    "                 bpac_mean_wealth - bpac_std_wealth, \n",
    "                 bpac_mean_wealth + bpac_std_wealth, \n",
    "                 color='purple', alpha=0.1)\n",
    "\n",
    "ax5.axhline(y=1.0, color='black', linestyle='--', label='Initial Wealth (1.0)')\n",
    "ax5.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "# 如果财富增长非常快，建议开启对数坐标\n",
    "# ax5.set_yscale('log') \n",
    "\n",
    "ax5.set_ylabel('Wealth')\n",
    "ax5.set_xlabel('Time Step')\n",
    "ax5.set_title('5. BPAC Wealth Accumulation')\n",
    "ax5.legend(loc='upper left')\n",
    "ax5.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de1326a7",
   "metadata": {},
   "source": [
    "#### paper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e38b9aa6",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "447f732e",
   "metadata": {},
   "source": [
    "# math-verbalized score"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "af330f6c",
   "metadata": {},
   "source": [
    "## qwen3-ins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0951809",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# expert_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-think.json\")\n",
    "expert_data3 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-think.json\")\n",
    "\n",
    "expert_data = expert_data3[expert_data3[\"matched\"] == True]\n",
    "session_ids = list(expert_data[\"session_id\"])\n",
    "\n",
    "instant_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-ins.json\")\n",
    "instant_data = instant_data1[instant_data1[\"session_id\"].isin(session_ids)]\n",
    "len(list(set(instant_data[\"session_id\"]))), len(list(set(expert_data[\"session_id\"])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84f37a38",
   "metadata": {},
   "outputs": [],
   "source": [
    "instant_data.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8329ae93",
   "metadata": {},
   "outputs": [],
   "source": [
    "instant_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d20941f",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list =[]\n",
    "selcet_ids =[]\n",
    "for i,row in expert_data.iterrows():\n",
    "    session_id = row['session_id']\n",
    "    if session_id not in instant_data['session_id'].values:\n",
    "        print(f\"Instant data missing at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict = {}\n",
    "    instant_row = instant_data[instant_data['session_id'] == session_id].iloc[0]\n",
    "    selcet_ids.append(session_id)\n",
    "    if instant_row[\"question\"] != row[\"question\"]:\n",
    "        print(f\"Question mismatch at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict['uncertainty'] = 1- instant_row['verbalized_prob']\n",
    "    tmp_dict['instant_correct'] = int(instant_row['matched']) if instant_row['matched'] != \"No answer extracted\" else 0\n",
    "    tmp_dict['expert_correct'] = int(expert_data.loc[i, 'matched'])\n",
    "    tmp_dict['instant_token'] = instant_row['gen_token_count']\n",
    "    tmp_dict['expert_token'] = expert_data.loc[i, 'gen_token_count']\n",
    "    data_list.append(tmp_dict)\n",
    "\n",
    "print(f\"Total samples prepared: {len(data_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "612ee4b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "# =========================\n",
    "# 1. 读取与筛选数据\n",
    "# =========================\n",
    "data1 = pd.read_json(\n",
    "    \"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-ins.json\"\n",
    ")\n",
    "\n",
    "data2 = data1[data1[\"session_id\"].isin(selcet_ids)]\n",
    "df = data2.copy()\n",
    "\n",
    "# 统一使用一个 scalar 置信度\n",
    "df[\"token_prob_scalar\"] = df[\"verbalized_prob\"]\n",
    "\n",
    "# =========================\n",
    "# 2. 按 matched 分组\n",
    "# =========================\n",
    "tp_true = df.loc[df[\"matched\"] == True, \"token_prob_scalar\"].dropna()\n",
    "tp_false = df.loc[df[\"matched\"] == False, \"token_prob_scalar\"].dropna()\n",
    "\n",
    "# =========================\n",
    "# 3. 置信度分布直方图\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.hist(tp_true, bins=30, alpha=0.6, label=\"matched=True\")\n",
    "plt.hist(tp_false, bins=30, alpha=0.6, label=\"matched=False\")\n",
    "plt.xlabel(\"token_prob_scalar\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.title(\"Token Probability Distribution by Matched\")\n",
    "plt.legend()\n",
    "plt.show()\n",
    "\n",
    "# =========================\n",
    "# 4. Boxplot\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.boxplot([tp_true, tp_false], labels=[\"Matched=True\", \"Matched=False\"])\n",
    "plt.ylabel(\"token_prob_scalar\")\n",
    "plt.title(\"Token Probability by Matched\")\n",
    "plt.show()\n",
    "\n",
    "\n",
    "# =====================================================================\n",
    "# 第二部分：Instant / Expert 不确定性分析\n",
    "# =====================================================================\n",
    "\n",
    "# data_list: list[dict]\n",
    "instant_tokens = [item[\"instant_token\"] for item in data_list]\n",
    "expert_tokens = [item[\"expert_token\"] for item in data_list]\n",
    "\n",
    "uncertainty_scores = [item[\"uncertainty\"] for item in data_list]\n",
    "correct_instant = [item[\"instant_correct\"] for item in data_list]\n",
    "correct_expert = [item[\"expert_correct\"] for item in data_list]\n",
    "\n",
    "# =========================\n",
    "# 5. 准确率\n",
    "# =========================\n",
    "accuracy_instant = sum(correct_instant) / len(correct_instant)\n",
    "accuracy_expert = sum(correct_expert) / len(correct_expert)\n",
    "\n",
    "print(f\"Instant Model Accuracy: {accuracy_instant:.4f}\")\n",
    "print(f\"Expert Model Accuracy:  {accuracy_expert:.4f}\")\n",
    "\n",
    "expert_tokens = [item['expert_token'] for item in data_list]\n",
    "instant_tokens = [item['instant_token'] for item in data_list]\n",
    "mean_expert_tokens = np.mean(expert_tokens)\n",
    "mean_instant_tokens = np.mean(instant_tokens)\n",
    "print(f\"Mean Expert Tokens: {mean_expert_tokens}, Mean Instant Tokens: {mean_instant_tokens}\")\n",
    "\n",
    "\n",
    "# =========================\n",
    "# 6. Uncertainty 直方图\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.hist(uncertainty_scores, bins=50)\n",
    "plt.xlabel(\"Uncertainty\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.title(\"Uncertainty Distribution\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "565c98bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "data_list1 = np.array(data_list)\n",
    "\n",
    "# np.random.shuffle(data_list1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a008b71b",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# 运行配置\n",
    "cfg = BPACConfig(epsilon=0.1, alpha=0.1, rho=0.1,warm_up=0)\n",
    "\n",
    "# 执行模拟\n",
    "df_result, model = run_simulation(data_list1, cfg)\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "def plot_bpac_results(df_logs, config):\n",
    "    \"\"\"\n",
    "    绘制 B-PAC 实验结果面板\n",
    "    \"\"\"\n",
    "    if df_logs.empty:\n",
    "        print(\"Log is empty. Check warmup settings or data.\")\n",
    "        return\n",
    "\n",
    "    fig, axes = plt.subplots(4, 1, figsize=(12, 16), sharex=True)\n",
    "    \n",
    "    # ----------------------------------\n",
    "    # Subplot 1: Threshold Adaptation\n",
    "    # ----------------------------------\n",
    "    ax = axes[0]\n",
    "    ax.plot(df_logs['step'], df_logs['threshold'], label='Threshold ($u_t$)', color='#1f77b4', linewidth=2)\n",
    "    # 可以在背景里画出 uncertainty 的散点，展示数据分布（可选）\n",
    "    # ax.scatter(df_logs['step'], df_logs['uncertainty'], alpha=0.1, color='gray', s=1, label='Input Uncertainty')\n",
    "    ax.set_ylabel('Uncertainty Score')\n",
    "    ax.set_title(f'Threshold Adaptation (Confidence {1-config.alpha:.0%})')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "    ax.set_ylim(-0.05, 1.05)\n",
    "\n",
    "    # ----------------------------------\n",
    "    # Subplot 2: Safety (Risk Control)\n",
    "    # ----------------------------------\n",
    "    ax = axes[1]\n",
    "    ax.plot(df_logs['step'], df_logs['avg_risk'], color='#d62728', label='Cumulative Avg Risk', linewidth=2)\n",
    "    ax.axhline(y=config.epsilon, color='black', linestyle='--', linewidth=2, label=f'Target Risk ($\\epsilon={config.epsilon}$)')\n",
    "    ax.set_ylabel('Risk Rate')\n",
    "    ax.set_title('Safety Guarantee: Realized Risk vs. Target')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "    \n",
    "    # ----------------------------------\n",
    "    # Subplot 3: Efficiency (Token & Expert Ratio)\n",
    "    # ----------------------------------\n",
    "    ax = axes[2]\n",
    "    ax.plot(df_logs['step'], df_logs['token_ratio'], color='#2ca02c', label='Token Ratio (vs. All-Expert)', linewidth=2)\n",
    "    ax.plot(df_logs['step'], df_logs['expert_call_ratio'], color='#ff7f0e', linestyle='-.', label='Expert Call Ratio', linewidth=2)\n",
    "    ax.set_ylabel('Ratio')\n",
    "    ax.set_title('Efficiency: Cost Reduction')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "    \n",
    "    # ----------------------------------\n",
    "    # Subplot 4: Stability (Wealth Process)\n",
    "    # ----------------------------------\n",
    "    ax = axes[3]\n",
    "    ax.plot(df_logs['step'], df_logs['wealth'], color='#9467bd', label='Wealth of Selected Threshold ($K_t$)', linewidth=1.5)\n",
    "    # 画出安全线 1/alpha\n",
    "    safe_wealth = 1.0 / config.alpha\n",
    "    ax.axhline(y=safe_wealth, color='purple', linestyle=':', label=f'Safety Barrier ($1/\\\\alpha={safe_wealth:.1f}$)')\n",
    "    \n",
    "    ax.set_ylabel('Wealth Value')\n",
    "    ax.set_xlabel('Simulation Step (post-warmup)')\n",
    "    ax.set_title('Martingale Wealth Process')\n",
    "    ax.set_yscale('log') # 财富值通常是指数增长的，用对数坐标更好看\n",
    "    ax.legend(loc='upper left')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "plot_bpac_results(df_result, cfg)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cae19129",
   "metadata": {},
   "source": [
    "### compare to pac"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba20362f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm \n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 0. 准备全量数据 (只做一次)\n",
    "# -------------------------------------------------------------------------\n",
    "# 假设 data_list 已经准备好\n",
    "y_solved = [item['expert_correct'] for item in data_list]\n",
    "y_hat_solved = [item['instant_correct'] for item in data_list]\n",
    "uncertainty_values = [item['uncertainty'] for item in data_list]\n",
    "y_token_list = [item['expert_token'] for item in data_list]\n",
    "y_hat_token_list = [item['instant_token'] for item in data_list]\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100         # 重复次数\n",
    "TARGET_EPSILON = 0.05   # 目标 Risk\n",
    "TARGET_ALPHA = 0.05     # 置信度\n",
    "CALIB_NUM = 500        # 校准集大小\n",
    "TOTAL_NUM = len(y_solved)\n",
    "calib_ratio = CALIB_NUM / TOTAL_NUM\n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 1. 循环运行 Simulation\n",
    "# -------------------------------------------------------------------------\n",
    "\n",
    "# 用于存储每次实验的完整序列\n",
    "# 维度: (100, N_samples)\n",
    "all_risks = []\n",
    "all_token_ratios = []\n",
    "all_expert_ratios = []\n",
    "all_u_hats = []\n",
    "\n",
    "print(f\"Starting {NUM_RUNS} runs simulation...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # 每次使用不同的随机种子 (seed=0, 1, 2, ..., 99)\n",
    "    # 这样 run_baseline_continuous 内部的 shuffle 结果每次都不一样\n",
    "    df, u_hat = run_baseline_continuous(\n",
    "        y_solved, \n",
    "        y_hat_solved, \n",
    "        uncertainty_values, \n",
    "        y_token_list, \n",
    "        y_hat_token_list,\n",
    "        calib_ratio=calib_ratio, \n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=TARGET_ALPHA,\n",
    "        seed=seed\n",
    "    )\n",
    "    \n",
    "    # 记录整条曲线 (df['avg_risk'] 是一个 Series)\n",
    "    all_risks.append(df['avg_risk'].values)\n",
    "    all_token_ratios.append(df['token_ratio'].values)\n",
    "    all_expert_ratios.append(df['expert_call_ratio'].values)\n",
    "    all_u_hats.append(u_hat)\n",
    "\n",
    "# 转为 Numpy Array 方便计算均值方差\n",
    "# shape = (100, N)\n",
    "arr_risks = np.array(all_risks)\n",
    "arr_token_ratios = np.array(all_token_ratios)\n",
    "arr_expert_ratios = np.array(all_expert_ratios)\n",
    "arr_thresholds = np.array(all_u_hats)\n",
    "\n",
    "print(\"\\nSimulation Finished!\")\n",
    "print(f\"Average u_hat across runs: {np.mean(all_u_hats):.4f} (std: {np.std(all_u_hats):.4f})\")\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100        # 重复 100 次\n",
    "WARM_UP_STEPS = 0   # BPAC 的 Warm-up 步数 (对应 Baseline 的校准集大小)\n",
    "TARGET_EPSILON = 0.05  # 目标 Risk\n",
    "CFG_ALPHA= 0.05       # BPAC 参数\n",
    "CFG_RHO = 0.1         # BPAC 参数\n",
    "BPA_beta = 1\n",
    "\n",
    "bpac_risks = []\n",
    "bpac_token_ratios = []\n",
    "bpac_expert_ratios = []\n",
    "bpac_wealths = []\n",
    "bpac_thresholds = []\n",
    "\n",
    "print(f\"Starting BPAC Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # -----------------------------------------------------------\n",
    "    # 1. 严格的数据对齐 (Data Alignment)\n",
    "    # -----------------------------------------------------------\n",
    "    # 使用与 Baseline 相同的种子生成随机数生成器\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # 生成打乱的索引\n",
    "    n = len(data_list)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # 关键修正：根据 indices 重排 data_list\n",
    "    # 注意：data_list 是 list of dicts，不能直接用 indices 索引，除非转 numpy\n",
    "    # 这里用列表推导式最稳妥\n",
    "    shuffled_data = [data_list[i] for i in indices]\n",
    "    \n",
    "    # -----------------------------------------------------------\n",
    "    # 2. 配置并运行 BPAC\n",
    "    # -----------------------------------------------------------\n",
    "    # 你的 BPACConfig\n",
    "    cfg = BPACConfig(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "        beta=BPA_beta, \n",
    "        c_clip=0.9\n",
    "    )\n",
    "    \n",
    "    # 运行模拟 (传入打乱后的数据)\n",
    "    df_result, _ = run_simulation(shuffled_data, cfg)\n",
    "    \n",
    "    bpac_risks.append(df_result['avg_risk'].values)\n",
    "    bpac_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpac_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpac_wealths.append(df_result['wealth'].values)\n",
    "    bpac_thresholds.append(df_result['threshold'].values)\n",
    "\n",
    "\n",
    "# 转换为 Numpy 数组方便计算均值\n",
    "bpac_risks_arr = np.array(bpac_risks)       # Shape: (100, N)\n",
    "bpac_token_ratios_arr = np.array(bpac_token_ratios) # Shape: (100, N)\n",
    "bpac_expert_ratios_arr = np.array(bpac_expert_ratios) # Shape: (100, N)\n",
    "bpac_wealths_arr = np.array(bpac_wealths)   # Shape: (100, N)\n",
    "bpac_thresholds_arr = np.array(bpac_thresholds) # Shape: (100, N)\n",
    "print(\"BPAC Simulation Finished!\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b989a31b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Baseline 数据\n",
    "base_mean_risk = np.mean(arr_risks, axis=0)\n",
    "base_std_risk = np.std(arr_risks, axis=0)\n",
    "\n",
    "base_mean_token = np.mean(arr_token_ratios, axis=0)\n",
    "base_std_token = np.std(arr_token_ratios, axis=0)\n",
    "\n",
    "base_mean_expert = np.mean(arr_expert_ratios, axis=0)\n",
    "base_std_expert = np.std(arr_expert_ratios, axis=0)\n",
    "\n",
    "# 获取总时间步数\n",
    "N_STEPS = bpac_risks_arr.shape[1] \n",
    "\n",
    "# A. 重构 Baseline Threshold 序列 (因为你只存了最终的一个 u_hat)\n",
    "# 逻辑：在校准期 (前 CALIB_NUM 步) 阈值设为 0，之后设为 u_hat\n",
    "# arr_thresholds 是 (100,) 的标量数组\n",
    "base_threshold_seqs = np.zeros((NUM_RUNS, N_STEPS))\n",
    "for i in range(NUM_RUNS):\n",
    "    base_threshold_seqs[i, :CALIB_NUM] = 0.0  # 校准期阈值通常视为0 (强制Action=1)\n",
    "    base_threshold_seqs[i, CALIB_NUM:] = arr_thresholds[i] # 测试期应用 u_hat\n",
    "\n",
    "base_mean_threshold_seq = np.mean(base_threshold_seqs, axis=0)\n",
    "base_std_threshold_seq = np.std(base_threshold_seqs, axis=0)\n",
    "\n",
    "\n",
    "# BPAC 数据\n",
    "bpac_mean_risk = np.mean(bpac_risks_arr, axis=0)\n",
    "bpac_std_risk = np.std(bpac_risks_arr, axis=0)\n",
    "\n",
    "bpac_mean_token = np.mean(bpac_token_ratios_arr, axis=0)\n",
    "bpac_std_token = np.std(bpac_token_ratios_arr, axis=0)\n",
    "\n",
    "bpac_mean_expert = np.mean(bpac_expert_ratios_arr, axis=0)\n",
    "bpac_std_expert = np.std(bpac_expert_ratios_arr, axis=0)\n",
    "\n",
    "# C. 确保其他 BPAC 统计量已准备好 (以防万一)\n",
    "bpac_mean_threshold = np.mean(bpac_thresholds_arr, axis=0)\n",
    "bpac_std_threshold = np.std(bpac_thresholds_arr, axis=0)\n",
    "\n",
    "# B. 计算 BPAC Wealth 统计 (你存了 bpac_wealths_arr 但还没算均值)\n",
    "bpac_mean_wealth = np.mean(bpac_wealths_arr, axis=0)\n",
    "bpac_std_wealth = np.std(bpac_wealths_arr, axis=0)\n",
    "\n",
    "\n",
    "fig, axes = plt.subplots(5, 1, figsize=(12, 24), sharex=True)\n",
    "(ax1, ax2, ax3, ax4, ax5) = axes\n",
    "\n",
    "# X轴\n",
    "steps = np.arange(N_STEPS)\n",
    "\n",
    "# === 子图 1: Average Risk (风险控制) ===\n",
    "ax1.plot(steps, base_mean_risk, label='Baseline', color='blue', linestyle='--')\n",
    "ax1.fill_between(steps, base_mean_risk - base_std_risk, base_mean_risk + base_std_risk, color='blue', alpha=0.1)\n",
    "\n",
    "ax1.plot(steps, bpac_mean_risk, label='BPAC (Ours)', color='red', linewidth=2)\n",
    "ax1.fill_between(steps, bpac_mean_risk - bpac_std_risk, bpac_mean_risk + bpac_std_risk, color='red', alpha=0.1)\n",
    "\n",
    "ax1.axhline(y=TARGET_EPSILON, color='green', linestyle='-', linewidth=2, label=f'Target $\\epsilon={TARGET_EPSILON}$')\n",
    "ax1.axvline(x=CALIB_NUM, color='gray', linestyle=':', label='Calibration End')\n",
    "\n",
    "ax1.set_ylabel('Avg Risk')\n",
    "ax1.set_title(f'1. Risk Control (Target $\\epsilon < {TARGET_EPSILON}$)')\n",
    "ax1.legend(loc='upper right')\n",
    "ax1.grid(True, alpha=0.3)\n",
    "ax1.set_ylim(0, TARGET_EPSILON * 3.0) # 聚焦在 Epsilon 附近\n",
    "\n",
    "# === 子图 2: Token Ratio (成本节省) ===\n",
    "ax2.plot(steps, base_mean_token, label='Baseline', color='blue', linestyle='--')\n",
    "ax2.fill_between(steps, base_mean_token - base_std_token, base_mean_token + base_std_token, color='blue', alpha=0.1)\n",
    "\n",
    "ax2.plot(steps, bpac_mean_token, label='BPAC', color='red', linewidth=2)\n",
    "ax2.fill_between(steps, bpac_mean_token - bpac_std_token, bpac_mean_token + bpac_std_token, color='red', alpha=0.1)\n",
    "\n",
    "ax2.axhline(y=1.0, color='black', linestyle=':')\n",
    "ax2.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax2.set_ylabel('Token Ratio')\n",
    "ax2.set_title('2. Cost Efficiency (Lower is Better)')\n",
    "ax2.grid(True, alpha=0.3)\n",
    "\n",
    "# === 子图 3: Expert Call Ratio (专家调用率) ===\n",
    "ax3.plot(steps, base_mean_expert, label='Baseline', color='blue', linestyle='--')\n",
    "ax3.fill_between(steps, base_mean_expert - base_std_expert, base_mean_expert + base_std_expert, color='blue', alpha=0.1)\n",
    "\n",
    "ax3.plot(steps, bpac_mean_expert, label='BPAC', color='red', linewidth=2)\n",
    "ax3.fill_between(steps, bpac_mean_expert - bpac_std_expert, bpac_mean_expert + bpac_std_expert, color='red', alpha=0.1)\n",
    "\n",
    "ax3.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax3.set_ylabel('Expert Call Ratio')\n",
    "ax3.set_title('3. Expert Usage Behavior')\n",
    "ax3.grid(True, alpha=0.3)\n",
    "ax3.set_ylim(0, 1.1)\n",
    "\n",
    "# === 子图 4: Threshold Evolution (阈值变化) ===\n",
    "# Baseline: 0 during calibration, then the fixed calibrated threshold.\n",
    "# Raw string so the mathtext '\\hat' is not parsed as an (invalid) Python escape.\n",
    "ax4.plot(steps, base_mean_threshold_seq, label=r'Baseline $\\hat{u}$', color='blue', linestyle='--')\n",
    "ax4.fill_between(steps, \n",
    "                 base_mean_threshold_seq - base_std_threshold_seq, \n",
    "                 base_mean_threshold_seq + base_std_threshold_seq, \n",
    "                 color='blue', alpha=0.1)\n",
    "\n",
    "# BPAC: 动态变化\n",
    "ax4.plot(steps, bpac_mean_threshold, label='BPAC $u_t$', color='red', linewidth=1.5)\n",
    "ax4.fill_between(steps, \n",
    "                 bpac_mean_threshold - bpac_std_threshold, \n",
    "                 bpac_mean_threshold + bpac_std_threshold, \n",
    "                 color='red', alpha=0.1)\n",
    "\n",
    "ax4.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax4.set_ylabel('Uncertainty Threshold')\n",
    "ax4.set_title('4. Threshold Adaptation (Dynamic vs Fixed)')\n",
    "ax4.legend(loc='upper right')\n",
    "ax4.grid(True, alpha=0.3)\n",
    "ax4.set_ylim(0, 1.0) \n",
    "\n",
    "# === 子图 5: Wealth Evolution (财富积累) ===\n",
    "# 只有 BPAC 有 Wealth\n",
    "ax5.plot(steps, bpac_mean_wealth, label='BPAC Wealth', color='purple', linewidth=1.5)\n",
    "ax5.fill_between(steps, \n",
    "                 bpac_mean_wealth - bpac_std_wealth, \n",
    "                 bpac_mean_wealth + bpac_std_wealth, \n",
    "                 color='purple', alpha=0.1)\n",
    "\n",
    "ax5.axhline(y=1.0, color='black', linestyle='--', label='Initial Wealth (1.0)')\n",
    "ax5.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "# 如果财富增长非常快，建议开启对数坐标\n",
    "# ax5.set_yscale('log') \n",
    "\n",
    "ax5.set_ylabel('Wealth')\n",
    "ax5.set_xlabel('Time Step')\n",
    "ax5.set_title('5. BPAC Wealth Accumulation')\n",
    "ax5.legend(loc='upper left')\n",
    "ax5.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "faa2aa9d",
   "metadata": {},
   "source": [
    "#### paper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2068215",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "\n",
    "# ==========================================\n",
    "# 1. 设置阴影类型 (ICML 关键设置)\n",
    "# ==========================================\n",
    "# 选项: \n",
    "# 'std': 标准差 (Mean ± Std) -> 阴影最大，容易超线\n",
    "# 'sem': 标准误 (Mean ± Std/sqrt(N)) -> 阴影更窄，聚焦于均值的准确性 (推荐!)\n",
    "# 'ci':  95% 置信区间 (Bootstrap/t-distribution) -> 严谨的统计学区间\n",
    "# 'percentile': 分位数 (10% - 90%) -> 展示数据分布的真实范围\n",
    "\n",
    "SHADOW_TYPE = 'sem' \n",
    "NUM_RUNS = 100  # 你的实验次数\n",
    "\n",
    "def get_error_bounds(data_array, type='sem'):\n",
    "    \"\"\"Compute the per-step mean and a lower/upper band across repeated runs.\n",
    "\n",
    "    All statistics are reduced over axis 0, i.e. each row of ``data_array`` is\n",
    "    treated as one run and each column as one time step.\n",
    "\n",
    "    Args:\n",
    "        data_array: array-like whose first axis indexes the runs.\n",
    "        type: band to compute:\n",
    "            'std'        -> mean +/- standard deviation\n",
    "            'sem'        -> mean +/- standard error of the mean\n",
    "            'ci'         -> two-sided 95% Student-t confidence interval\n",
    "            'percentile' -> 10th to 90th percentile\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(mean_val, lower, upper)``.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if ``type`` is not one of the options above.\n",
    "    \"\"\"\n",
    "    data_array = np.asarray(data_array)\n",
    "    mean_val = np.mean(data_array, axis=0)\n",
    "\n",
    "    if type == 'std':\n",
    "        std_val = np.std(data_array, axis=0)\n",
    "        lower = mean_val - std_val\n",
    "        upper = mean_val + std_val\n",
    "\n",
    "    elif type == 'sem':\n",
    "        # Standard error = Std / sqrt(N)\n",
    "        sem_val = stats.sem(data_array, axis=0)\n",
    "        lower = mean_val - sem_val\n",
    "        upper = mean_val + sem_val\n",
    "\n",
    "    elif type == 'ci':\n",
    "        # 95% Student-t confidence interval.\n",
    "        sem_val = stats.sem(data_array, axis=0)\n",
    "        # Degrees of freedom come from the data actually passed in rather than the\n",
    "        # global NUM_RUNS, so the interval stays correct if the two ever differ.\n",
    "        n_runs = data_array.shape[0]\n",
    "        # ppf(0.975) is the two-sided 95% critical value.\n",
    "        ci_scale = stats.t.ppf(0.975, df=n_runs - 1)\n",
    "        lower = mean_val - sem_val * ci_scale\n",
    "        upper = mean_val + sem_val * ci_scale\n",
    "\n",
    "    elif type == 'percentile':\n",
    "        # 10th-90th percentile band (drops the most extreme runs).\n",
    "        lower = np.percentile(data_array, 10, axis=0)\n",
    "        upper = np.percentile(data_array, 90, axis=0)\n",
    "\n",
    "    else:\n",
    "        raise ValueError(\"Unknown shadow type\")\n",
    "\n",
    "    return mean_val, lower, upper\n",
    "\n",
    "# ==========================================\n",
    "# 2. 计算绘图数据\n",
    "# ==========================================\n",
    "steps = np.arange(bpac_risks_arr.shape[1])\n",
    "\n",
    "# --- 计算 Risk ---\n",
    "base_mean, base_low, base_high = get_error_bounds(arr_risks, SHADOW_TYPE)\n",
    "bpac_mean, bpac_low, bpac_high = get_error_bounds(bpac_risks_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 计算 ECP ---\n",
    "base_ecp_mean, base_ecp_low, base_ecp_high = get_error_bounds(arr_expert_ratios, SHADOW_TYPE)\n",
    "bpac_ecp_mean, bpac_ecp_low, bpac_ecp_high = get_error_bounds(bpac_expert_ratios_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 计算 TCP ---\n",
    "base_tcp_mean, base_tcp_low, base_tcp_high = get_error_bounds(arr_token_ratios, SHADOW_TYPE)\n",
    "bpac_tcp_mean, bpac_tcp_low, bpac_tcp_high = get_error_bounds(bpac_token_ratios_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 物理截断 (Clipping) ---\n",
    "# 无论用什么统计方法，Risk 和 Ratio 都不可能小于 0\n",
    "def clip_bounds(low, high, min_v=0, max_v=None):\n",
    "    \"\"\"Clamp a band: floor ``low`` at ``min_v`` and, if ``max_v`` is given, cap ``high`` at it.\n",
    "\n",
    "    Returns the pair ``(low, high)`` after clamping; ``high`` is passed through\n",
    "    untouched when ``max_v`` is None.\n",
    "    \"\"\"\n",
    "    floored_low = np.maximum(low, min_v)\n",
    "    capped_high = high if max_v is None else np.minimum(high, max_v)\n",
    "    return floored_low, capped_high\n",
    "\n",
    "base_low, base_high = clip_bounds(base_low, base_high, 0, 1.0) # Risk 通常<1，也可不设上限\n",
    "bpac_low, bpac_high = clip_bounds(bpac_low, bpac_high, 0, 1.0)\n",
    "\n",
    "base_ecp_low, base_ecp_high = clip_bounds(base_ecp_low, base_ecp_high, 0, 1.0)\n",
    "bpac_ecp_low, bpac_ecp_high = clip_bounds(bpac_ecp_low, bpac_ecp_high, 0, 1.0)\n",
    "\n",
    "base_tcp_low, base_tcp_high = clip_bounds(base_tcp_low, base_tcp_high, 0, None)\n",
    "bpac_tcp_low, bpac_tcp_high = clip_bounds(bpac_tcp_low, bpac_tcp_high, 0, None)\n",
    "\n",
    "\n",
    "# ==========================================\n",
    "# 3. 绘图 (ICML Style)\n",
    "# ==========================================\n",
    "# 设置字体和线宽\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,\n",
    "    'axes.labelsize': 16, 'axes.titlesize': 16,\n",
    "    'xtick.labelsize': 14, 'ytick.labelsize': 14,\n",
    "    'legend.fontsize': 13, 'lines.linewidth': 2.5,\n",
    "    'axes.grid': True, 'grid.alpha': 0.3, 'grid.linestyle': '--'\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "# 颜色\n",
    "C_BASE = '#1f77b4'  # Blue\n",
    "C_OURS = '#d62728'  # Red\n",
    "C_TGT = '#2ca02c'   # Green\n",
    "C_WARM = 'gray'\n",
    "\n",
    "# --- (a) Risk ---\n",
    "ax = axes[0]\n",
    "ax.plot(steps, base_mean, color=C_BASE, linestyle='--', label='PAC (Baseline)')\n",
    "ax.fill_between(steps, base_low, base_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_mean, color=C_OURS, linestyle='-', label='B-PAC (Ours)')\n",
    "ax.fill_between(steps, bpac_low, bpac_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=TARGET_EPSILON, color=C_TGT, linestyle='-', linewidth=2, label=r'Target $\\epsilon$')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "# ax.set_title('(a) Cumulative Risk')\n",
    "ax.set_ylabel('ER')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, TARGET_EPSILON * 2.0) # 视野聚焦\n",
    "# Text describing what the shaded band means for the selected SHADOW_TYPE.\n",
    "if SHADOW_TYPE == 'sem':\n",
    "    shadow_label = 'Shaded: SEM'\n",
    "elif SHADOW_TYPE == 'std':\n",
    "    shadow_label = 'Shaded: Std. Dev.'\n",
    "elif SHADOW_TYPE == 'ci':\n",
    "    shadow_label = 'Shaded: 95% CI'\n",
    "else:\n",
    "    shadow_label = 'Shaded: 10-90% Pctl'\n",
    "\n",
    "# Optional proxy legend entry explaining the shading. The handle and its label\n",
    "# must be appended together: a handle without a label leaves the two lists with\n",
    "# different lengths and the extra handle is dropped by the legend.\n",
    "# Disabled by default, which matches the figure as currently rendered.\n",
    "from matplotlib.lines import Line2D\n",
    "show_shadow_entry = False\n",
    "handles, labels = ax.get_legend_handles_labels()\n",
    "if show_shadow_entry:\n",
    "    handles.append(Line2D([0], [0], color='gray', alpha=0.3, linewidth=10))\n",
    "    labels.append(shadow_label)\n",
    "ax.legend(handles, labels, loc='upper right', frameon=True, framealpha=0.9)\n",
    "\n",
    "\n",
    "# --- (b) ECP ---\n",
    "ax = axes[1]\n",
    "ax.plot(steps, base_ecp_mean, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_ecp_low, base_ecp_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_ecp_mean, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_ecp_low, bpac_ecp_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "# ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2, label='Warm-up End')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "# ax.set_title('(b) Expert Call Rate (ECP)')\n",
    "# NOTE(review): the label says percent but the y-range is 0-1.05 - confirm the intended unit.\n",
    "ax.set_ylabel('ECP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 1.05)\n",
    "# No legend here: nothing drawn on this panel carries a label, so ax.legend()\n",
    "# would only emit a 'No artists with labels found' warning.\n",
    "\n",
    "# --- (c) TCP ---\n",
    "ax = axes[2]\n",
    "ax.plot(steps, base_tcp_mean, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_tcp_low, base_tcp_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_tcp_mean, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_tcp_low, bpac_tcp_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=1.0, color='black', linestyle='-.', linewidth=1.5, label='Full Expert')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "# ax.set_title('(c) Token Cost Ratio (TCP)')\n",
    "ax.set_ylabel('TCP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('icml_final_plot.pdf', bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16637bce",
   "metadata": {},
   "source": [
    "## qwen2.5-ins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91926e23",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "expert_data3 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-think.json\")\n",
    "\n",
    "expert_data = expert_data3[expert_data3[\"matched\"] == True]\n",
    "session_ids = list(expert_data[\"session_id\"])\n",
    "\n",
    "instant_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen2.5-7b.json\")\n",
    "instant_data = instant_data1[instant_data1[\"session_id\"].isin(session_ids)]\n",
    "len(list(set(instant_data[\"session_id\"]))), len(list(set(expert_data[\"session_id\"])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bc9a42d",
   "metadata": {},
   "outputs": [],
   "source": [
    "instant_data.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b14a46d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list = []\n",
    "selcet_ids = []\n",
    "# Session ids present in the instant-model results; built once so the loop does\n",
    "# not rescan the whole column for every expert row.\n",
    "instant_session_ids = set(instant_data['session_id'])\n",
    "\n",
    "# Pair every expert record with the instant-model record of the same session.\n",
    "for i, row in expert_data.iterrows():\n",
    "    session_id = row['session_id']\n",
    "    if session_id not in instant_session_ids:\n",
    "        print(f\"Instant data missing at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    # First instant-model record for this session.\n",
    "    instant_row = instant_data[instant_data['session_id'] == session_id].iloc[0]\n",
    "    if instant_row[\"question\"] != row[\"question\"]:\n",
    "        print(f\"Question mismatch at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    # Record the id only after the pair passed every check, so selcet_ids lists\n",
    "    # exactly the sessions that contribute a sample to data_list.\n",
    "    selcet_ids.append(session_id)\n",
    "    tmp_dict = {}\n",
    "    # Uncertainty is the complement of the instant model's verbalized confidence.\n",
    "    tmp_dict['uncertainty'] = 1- instant_row['verbalized_prob']\n",
    "    # A response with no extractable answer counts as incorrect.\n",
    "    tmp_dict['instant_correct'] = int(instant_row['matched']) if instant_row['matched'] != \"No answer extracted\" else 0\n",
    "    tmp_dict['expert_correct'] = int(expert_data.loc[i, 'matched'])\n",
    "    tmp_dict['instant_token'] = instant_row['gen_token_count']\n",
    "    tmp_dict['expert_token'] = expert_data.loc[i, 'gen_token_count']\n",
    "    data_list.append(tmp_dict)\n",
    "\n",
    "print(f\"Total samples prepared: {len(data_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70ca0d37",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "# =========================\n",
    "# 1. 读取与筛选数据\n",
    "# =========================\n",
    "data1 = pd.read_json(\n",
    "    \"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen2.5-7b.json\"\n",
    ")\n",
    "\n",
    "data2 = data1[data1[\"session_id\"].isin(selcet_ids)]\n",
    "df = data2.copy()\n",
    "\n",
    "# 统一使用一个 scalar 置信度\n",
    "df[\"token_prob_scalar\"] = df[\"verbalized_prob\"]\n",
    "\n",
    "# =========================\n",
    "# 2. 按 matched 分组\n",
    "# =========================\n",
    "tp_true = df.loc[df[\"matched\"] == True, \"token_prob_scalar\"].dropna()\n",
    "tp_false = df.loc[df[\"matched\"] == False, \"token_prob_scalar\"].dropna()\n",
    "\n",
    "# =========================\n",
    "# 3. 置信度分布直方图\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.hist(tp_true, bins=30, alpha=0.6, label=\"matched=True\")\n",
    "plt.hist(tp_false, bins=30, alpha=0.6, label=\"matched=False\")\n",
    "plt.xlabel(\"token_prob_scalar\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.title(\"Token Probability Distribution by Matched\")\n",
    "plt.legend()\n",
    "plt.show()\n",
    "\n",
    "# =========================\n",
    "# 4. Boxplot\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.boxplot([tp_true, tp_false], labels=[\"Matched=True\", \"Matched=False\"])\n",
    "plt.ylabel(\"token_prob_scalar\")\n",
    "plt.title(\"Token Probability by Matched\")\n",
    "plt.show()\n",
    "\n",
    "\n",
    "# =====================================================================\n",
    "# 第二部分：Instant / Expert 不确定性分析\n",
    "# =====================================================================\n",
    "\n",
    "# data_list: list[dict]\n",
    "instant_tokens = [item[\"instant_token\"] for item in data_list]\n",
    "expert_tokens = [item[\"expert_token\"] for item in data_list]\n",
    "\n",
    "uncertainty_scores = [item[\"uncertainty\"] for item in data_list]\n",
    "correct_instant = [item[\"instant_correct\"] for item in data_list]\n",
    "correct_expert = [item[\"expert_correct\"] for item in data_list]\n",
    "\n",
    "# =========================\n",
    "# 5. 准确率\n",
    "# =========================\n",
    "accuracy_instant = sum(correct_instant) / len(correct_instant)\n",
    "accuracy_expert = sum(correct_expert) / len(correct_expert)\n",
    "\n",
    "print(f\"Instant Model Accuracy: {accuracy_instant:.4f}\")\n",
    "print(f\"Expert Model Accuracy:  {accuracy_expert:.4f}\")\n",
    "\n",
    "expert_tokens = [item['expert_token'] for item in data_list]\n",
    "instant_tokens = [item['instant_token'] for item in data_list]\n",
    "mean_expert_tokens = np.mean(expert_tokens)\n",
    "mean_instant_tokens = np.mean(instant_tokens)\n",
    "print(f\"Mean Expert Tokens: {mean_expert_tokens}, Mean Instant Tokens: {mean_instant_tokens}\")\n",
    "\n",
    "\n",
    "# =========================\n",
    "# 6. Uncertainty 直方图\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.hist(uncertainty_scores, bins=50)\n",
    "plt.xlabel(\"Uncertainty\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.title(\"Uncertainty Distribution\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93b68233",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list1 = np.array(data_list)\n",
    "\n",
    "# np.random.shuffle(data_list1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d05e843e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 运行配置\n",
    "cfg = BPACConfig(epsilon=0.1, alpha=0.1, rho=0.1,warm_up=0)\n",
    "\n",
    "# 执行模拟\n",
    "df_result, model = run_simulation(data_list1, cfg)\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "def plot_bpac_results(df_logs, config):\n",
    "    \"\"\"\n",
    "    Draw the four-panel B-PAC result dashboard.\n",
    "\n",
    "    Panels, top to bottom, share the x axis (``df_logs['step']``):\n",
    "      1. threshold        - ``df_logs['threshold']``\n",
    "      2. risk control     - ``df_logs['avg_risk']`` against ``config.epsilon``\n",
    "      3. efficiency       - ``df_logs['token_ratio']`` and ``df_logs['expert_call_ratio']``\n",
    "      4. wealth process   - ``df_logs['wealth']`` (log scale) against ``1 / config.alpha``\n",
    "\n",
    "    Args:\n",
    "        df_logs: DataFrame holding the columns listed above.\n",
    "        config: object exposing ``alpha`` and ``epsilon``.\n",
    "\n",
    "    Returns:\n",
    "        None. Prints a message and returns early when ``df_logs`` is empty;\n",
    "        otherwise shows the figure.\n",
    "    \"\"\"\n",
    "    if df_logs.empty:\n",
    "        print(\"Log is empty. Check warmup settings or data.\")\n",
    "        return\n",
    "\n",
    "    fig, axes = plt.subplots(4, 1, figsize=(12, 16), sharex=True)\n",
    "    step_axis = df_logs['step']\n",
    "\n",
    "    # ----------------------------------\n",
    "    # Subplot 1: Threshold Adaptation\n",
    "    # ----------------------------------\n",
    "    ax = axes[0]\n",
    "    ax.plot(step_axis, df_logs['threshold'], label='Threshold ($u_t$)', color='#1f77b4', linewidth=2)\n",
    "    # Optional: scatter the raw uncertainty scores in the background to show the input distribution.\n",
    "    # ax.scatter(df_logs['step'], df_logs['uncertainty'], alpha=0.1, color='gray', s=1, label='Input Uncertainty')\n",
    "    ax.set_ylabel('Uncertainty Score')\n",
    "    ax.set_title(f'Threshold Adaptation (Confidence {1-config.alpha:.0%})')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "    ax.set_ylim(-0.05, 1.05)\n",
    "\n",
    "    # ----------------------------------\n",
    "    # Subplot 2: Safety (Risk Control)\n",
    "    # ----------------------------------\n",
    "    ax = axes[1]\n",
    "    ax.plot(step_axis, df_logs['avg_risk'], color='#d62728', label='Cumulative Avg Risk', linewidth=2)\n",
    "    # Raw f-string: '\\e' is not a valid Python escape, so the mathtext backslash\n",
    "    # must bypass escape processing (the rendered label is unchanged).\n",
    "    ax.axhline(y=config.epsilon, color='black', linestyle='--', linewidth=2, label=rf'Target Risk ($\\epsilon={config.epsilon}$)')\n",
    "    ax.set_ylabel('Risk Rate')\n",
    "    ax.set_title('Safety Guarantee: Realized Risk vs. Target')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "    # ----------------------------------\n",
    "    # Subplot 3: Efficiency (Token & Expert Ratio)\n",
    "    # ----------------------------------\n",
    "    ax = axes[2]\n",
    "    ax.plot(step_axis, df_logs['token_ratio'], color='#2ca02c', label='Token Ratio (vs. All-Expert)', linewidth=2)\n",
    "    ax.plot(step_axis, df_logs['expert_call_ratio'], color='#ff7f0e', linestyle='-.', label='Expert Call Ratio', linewidth=2)\n",
    "    ax.set_ylabel('Ratio')\n",
    "    ax.set_title('Efficiency: Cost Reduction')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "    # ----------------------------------\n",
    "    # Subplot 4: Stability (Wealth Process)\n",
    "    # ----------------------------------\n",
    "    ax = axes[3]\n",
    "    ax.plot(step_axis, df_logs['wealth'], color='#9467bd', label='Wealth of Selected Threshold ($K_t$)', linewidth=1.5)\n",
    "    # Reference line at 1/alpha.\n",
    "    safe_wealth = 1.0 / config.alpha\n",
    "    ax.axhline(y=safe_wealth, color='purple', linestyle=':', label=f'Safety Barrier ($1/\\\\alpha={safe_wealth:.1f}$)')\n",
    "\n",
    "    ax.set_ylabel('Wealth Value')\n",
    "    ax.set_xlabel('Simulation Step (post-warmup)')\n",
    "    ax.set_title('Martingale Wealth Process')\n",
    "    ax.set_yscale('log')  # wealth tends to grow exponentially, so a log axis reads better\n",
    "    ax.legend(loc='upper left')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "plot_bpac_results(df_result, cfg)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15f45ec4",
   "metadata": {},
   "source": [
    "### compare to pac"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "205ee580",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm \n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 0. 准备全量数据 (只做一次)\n",
    "# -------------------------------------------------------------------------\n",
    "# 假设 data_list 已经准备好\n",
    "y_solved = [item['expert_correct'] for item in data_list]\n",
    "y_hat_solved = [item['instant_correct'] for item in data_list]\n",
    "uncertainty_values = [item['uncertainty'] for item in data_list]\n",
    "y_token_list = [item['expert_token'] for item in data_list]\n",
    "y_hat_token_list = [item['instant_token'] for item in data_list]\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100         # 重复次数\n",
    "TARGET_EPSILON = 0.05   # 目标 Risk\n",
    "TARGET_ALPHA = 0.05     # 置信度\n",
    "CALIB_NUM = 500        # 校准集大小\n",
    "TOTAL_NUM = len(y_solved)\n",
    "calib_ratio = CALIB_NUM / TOTAL_NUM\n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 1. 循环运行 Simulation\n",
    "# -------------------------------------------------------------------------\n",
    "\n",
    "# 用于存储每次实验的完整序列\n",
    "# 维度: (100, N_samples)\n",
    "all_risks = []\n",
    "all_token_ratios = []\n",
    "all_expert_ratios = []\n",
    "all_u_hats = []\n",
    "\n",
    "print(f\"Starting {NUM_RUNS} runs simulation...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # 每次使用不同的随机种子 (seed=0, 1, 2, ..., 99)\n",
    "    # 这样 run_baseline_continuous 内部的 shuffle 结果每次都不一样\n",
    "    df, u_hat = run_baseline_continuous(\n",
    "        y_solved, \n",
    "        y_hat_solved, \n",
    "        uncertainty_values, \n",
    "        y_token_list, \n",
    "        y_hat_token_list,\n",
    "        calib_ratio=calib_ratio, \n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=TARGET_ALPHA,\n",
    "        seed=seed\n",
    "    )\n",
    "    \n",
    "    # 记录整条曲线 (df['avg_risk'] 是一个 Series)\n",
    "    all_risks.append(df['avg_risk'].values)\n",
    "    all_token_ratios.append(df['token_ratio'].values)\n",
    "    all_expert_ratios.append(df['expert_call_ratio'].values)\n",
    "    all_u_hats.append(u_hat)\n",
    "\n",
    "# 转为 Numpy Array 方便计算均值方差\n",
    "# shape = (100, N)\n",
    "arr_risks = np.array(all_risks)\n",
    "arr_token_ratios = np.array(all_token_ratios)\n",
    "arr_expert_ratios = np.array(all_expert_ratios)\n",
    "arr_thresholds = np.array(all_u_hats)\n",
    "\n",
    "print(\"\\nSimulation Finished!\")\n",
    "print(f\"Average u_hat across runs: {np.mean(all_u_hats):.4f} (std: {np.std(all_u_hats):.4f})\")\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100        # 重复 100 次\n",
    "WARM_UP_STEPS = 0   # BPAC 的 Warm-up 步数 (对应 Baseline 的校准集大小)\n",
    "TARGET_EPSILON = 0.05  # 目标 Risk\n",
    "CFG_ALPHA= 0.05       # BPAC 参数\n",
    "CFG_RHO = 0.1         # BPAC 参数\n",
    "BPA_beta = 1\n",
    "\n",
    "bpac_risks = []\n",
    "bpac_token_ratios = []\n",
    "bpac_expert_ratios = []\n",
    "bpac_wealths = []\n",
    "bpac_thresholds = []\n",
    "\n",
    "print(f\"Starting BPAC Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # -----------------------------------------------------------\n",
    "    # 1. 严格的数据对齐 (Data Alignment)\n",
    "    # -----------------------------------------------------------\n",
    "    # 使用与 Baseline 相同的种子生成随机数生成器\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # 生成打乱的索引\n",
    "    n = len(data_list)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # 关键修正：根据 indices 重排 data_list\n",
    "    # 注意：data_list 是 list of dicts，不能直接用 indices 索引，除非转 numpy\n",
    "    # 这里用列表推导式最稳妥\n",
    "    shuffled_data = [data_list[i] for i in indices]\n",
    "    \n",
    "    # -----------------------------------------------------------\n",
    "    # 2. 配置并运行 BPAC\n",
    "    # -----------------------------------------------------------\n",
    "    # 你的 BPACConfig\n",
    "    cfg = BPACConfig(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "        beta=BPA_beta, \n",
    "        c_clip=0.9\n",
    "    )\n",
    "    \n",
    "    # 运行模拟 (传入打乱后的数据)\n",
    "    df_result, _ = run_simulation(shuffled_data, cfg)\n",
    "    \n",
    "    bpac_risks.append(df_result['avg_risk'].values)\n",
    "    bpac_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpac_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpac_wealths.append(df_result['wealth'].values)\n",
    "    bpac_thresholds.append(df_result['threshold'].values)\n",
    "\n",
    "\n",
    "# 转换为 Numpy 数组方便计算均值\n",
    "bpac_risks_arr = np.array(bpac_risks)       # Shape: (100, N)\n",
    "bpac_token_ratios_arr = np.array(bpac_token_ratios) # Shape: (100, N)\n",
    "bpac_expert_ratios_arr = np.array(bpac_expert_ratios) # Shape: (100, N)\n",
    "bpac_wealths_arr = np.array(bpac_wealths)   # Shape: (100, N)\n",
    "bpac_thresholds_arr = np.array(bpac_thresholds) # Shape: (100, N)\n",
    "print(\"BPAC Simulation Finished!\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eccac243",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Baseline 数据\n",
    "base_mean_risk = np.mean(arr_risks, axis=0)\n",
    "base_std_risk = np.std(arr_risks, axis=0)\n",
    "\n",
    "base_mean_token = np.mean(arr_token_ratios, axis=0)\n",
    "base_std_token = np.std(arr_token_ratios, axis=0)\n",
    "\n",
    "base_mean_expert = np.mean(arr_expert_ratios, axis=0)\n",
    "base_std_expert = np.std(arr_expert_ratios, axis=0)\n",
    "\n",
    "# 获取总时间步数\n",
    "N_STEPS = bpac_risks_arr.shape[1] \n",
    "\n",
    "# A. 重构 Baseline Threshold 序列 (因为你只存了最终的一个 u_hat)\n",
    "# 逻辑：在校准期 (前 CALIB_NUM 步) 阈值设为 0，之后设为 u_hat\n",
    "# arr_thresholds 是 (100,) 的标量数组\n",
    "base_threshold_seqs = np.zeros((NUM_RUNS, N_STEPS))\n",
    "for i in range(NUM_RUNS):\n",
    "    base_threshold_seqs[i, :CALIB_NUM] = 0.0  # 校准期阈值通常视为0 (强制Action=1)\n",
    "    base_threshold_seqs[i, CALIB_NUM:] = arr_thresholds[i] # 测试期应用 u_hat\n",
    "\n",
    "base_mean_threshold_seq = np.mean(base_threshold_seqs, axis=0)\n",
    "base_std_threshold_seq = np.std(base_threshold_seqs, axis=0)\n",
    "\n",
    "\n",
    "# BPAC 数据\n",
    "bpac_mean_risk = np.mean(bpac_risks_arr, axis=0)\n",
    "bpac_std_risk = np.std(bpac_risks_arr, axis=0)\n",
    "\n",
    "bpac_mean_token = np.mean(bpac_token_ratios_arr, axis=0)\n",
    "bpac_std_token = np.std(bpac_token_ratios_arr, axis=0)\n",
    "\n",
    "bpac_mean_expert = np.mean(bpac_expert_ratios_arr, axis=0)\n",
    "bpac_std_expert = np.std(bpac_expert_ratios_arr, axis=0)\n",
    "\n",
    "# C. 确保其他 BPAC 统计量已准备好 (以防万一)\n",
    "bpac_mean_threshold = np.mean(bpac_thresholds_arr, axis=0)\n",
    "bpac_std_threshold = np.std(bpac_thresholds_arr, axis=0)\n",
    "\n",
    "# B. 计算 BPAC Wealth 统计 (你存了 bpac_wealths_arr 但还没算均值)\n",
    "bpac_mean_wealth = np.mean(bpac_wealths_arr, axis=0)\n",
    "bpac_std_wealth = np.std(bpac_wealths_arr, axis=0)\n",
    "\n",
    "\n",
    "fig, axes = plt.subplots(5, 1, figsize=(12, 24), sharex=True)\n",
    "(ax1, ax2, ax3, ax4, ax5) = axes\n",
    "\n",
    "# X轴\n",
    "steps = np.arange(N_STEPS)\n",
    "\n",
    "# === 子图 1: Average Risk (风险控制) ===\n",
    "ax1.plot(steps, base_mean_risk, label='Baseline', color='blue', linestyle='--')\n",
    "ax1.fill_between(steps, base_mean_risk - base_std_risk, base_mean_risk + base_std_risk, color='blue', alpha=0.1)\n",
    "\n",
    "ax1.plot(steps, bpac_mean_risk, label='BPAC (Ours)', color='red', linewidth=2)\n",
    "ax1.fill_between(steps, bpac_mean_risk - bpac_std_risk, bpac_mean_risk + bpac_std_risk, color='red', alpha=0.1)\n",
    "\n",
    "# Raw f-strings: '\\e' is not a valid Python escape, so the mathtext backslash must\n",
    "# not go through escape processing (the rendered text is unchanged).\n",
    "ax1.axhline(y=TARGET_EPSILON, color='green', linestyle='-', linewidth=2, label=rf'Target $\\epsilon={TARGET_EPSILON}$')\n",
    "ax1.axvline(x=CALIB_NUM, color='gray', linestyle=':', label='Calibration End')\n",
    "\n",
    "ax1.set_ylabel('Avg Risk')\n",
    "ax1.set_title(rf'1. Risk Control (Target $\\epsilon < {TARGET_EPSILON}$)')\n",
    "ax1.legend(loc='upper right')\n",
    "ax1.grid(True, alpha=0.3)\n",
    "ax1.set_ylim(0, TARGET_EPSILON * 3.0) # 聚焦在 Epsilon 附近\n",
    "\n",
    "# === 子图 2: Token Ratio (成本节省) ===\n",
    "ax2.plot(steps, base_mean_token, label='Baseline', color='blue', linestyle='--')\n",
    "ax2.fill_between(steps, base_mean_token - base_std_token, base_mean_token + base_std_token, color='blue', alpha=0.1)\n",
    "\n",
    "ax2.plot(steps, bpac_mean_token, label='BPAC', color='red', linewidth=2)\n",
    "ax2.fill_between(steps, bpac_mean_token - bpac_std_token, bpac_mean_token + bpac_std_token, color='red', alpha=0.1)\n",
    "\n",
    "ax2.axhline(y=1.0, color='black', linestyle=':')\n",
    "ax2.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax2.set_ylabel('Token Ratio')\n",
    "ax2.set_title('2. Cost Efficiency (Lower is Better)')\n",
    "ax2.grid(True, alpha=0.3)\n",
    "\n",
    "# === 子图 3: Expert Call Ratio (专家调用率) ===\n",
    "ax3.plot(steps, base_mean_expert, label='Baseline', color='blue', linestyle='--')\n",
    "ax3.fill_between(steps, base_mean_expert - base_std_expert, base_mean_expert + base_std_expert, color='blue', alpha=0.1)\n",
    "\n",
    "ax3.plot(steps, bpac_mean_expert, label='BPAC', color='red', linewidth=2)\n",
    "ax3.fill_between(steps, bpac_mean_expert - bpac_std_expert, bpac_mean_expert + bpac_std_expert, color='red', alpha=0.1)\n",
    "\n",
    "ax3.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax3.set_ylabel('Expert Call Ratio')\n",
    "ax3.set_title('3. Expert Usage Behavior')\n",
    "ax3.grid(True, alpha=0.3)\n",
    "ax3.set_ylim(0, 1.1)\n",
    "\n",
    "# === 子图 4: Threshold Evolution (阈值变化) ===\n",
    "# Baseline: 0 during calibration, then the fixed calibrated threshold.\n",
    "# Raw string so the mathtext '\\hat' is not parsed as an (invalid) Python escape.\n",
    "ax4.plot(steps, base_mean_threshold_seq, label=r'Baseline $\\hat{u}$', color='blue', linestyle='--')\n",
    "ax4.fill_between(steps, \n",
    "                 base_mean_threshold_seq - base_std_threshold_seq, \n",
    "                 base_mean_threshold_seq + base_std_threshold_seq, \n",
    "                 color='blue', alpha=0.1)\n",
    "\n",
    "# BPAC: 动态变化\n",
    "ax4.plot(steps, bpac_mean_threshold, label='BPAC $u_t$', color='red', linewidth=1.5)\n",
    "ax4.fill_between(steps, \n",
    "                 bpac_mean_threshold - bpac_std_threshold, \n",
    "                 bpac_mean_threshold + bpac_std_threshold, \n",
    "                 color='red', alpha=0.1)\n",
    "\n",
    "ax4.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax4.set_ylabel('Uncertainty Threshold')\n",
    "ax4.set_title('4. Threshold Adaptation (Dynamic vs Fixed)')\n",
    "ax4.legend(loc='upper right')\n",
    "ax4.grid(True, alpha=0.3)\n",
    "ax4.set_ylim(0, 1.0) \n",
    "\n",
    "# === 子图 5: Wealth Evolution (财富积累) ===\n",
    "# 只有 BPAC 有 Wealth\n",
    "ax5.plot(steps, bpac_mean_wealth, label='BPAC Wealth', color='purple', linewidth=1.5)\n",
    "ax5.fill_between(steps, \n",
    "                 bpac_mean_wealth - bpac_std_wealth, \n",
    "                 bpac_mean_wealth + bpac_std_wealth, \n",
    "                 color='purple', alpha=0.1)\n",
    "\n",
    "ax5.axhline(y=1.0, color='black', linestyle='--', label='Initial Wealth (1.0)')\n",
    "ax5.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "# 如果财富增长非常快，建议开启对数坐标\n",
    "# ax5.set_yscale('log') \n",
    "\n",
    "ax5.set_ylabel('Wealth')\n",
    "ax5.set_xlabel('Time Step')\n",
    "ax5.set_title('5. BPAC Wealth Accumulation')\n",
    "ax5.legend(loc='upper left')\n",
    "ax5.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "66adb5da",
   "metadata": {},
   "source": [
    "#### paper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94a82f01",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "\n",
    "# ==========================================\n",
    "# 1. 设置阴影类型 (ICML 关键设置)\n",
    "# ==========================================\n",
    "# 选项: \n",
    "# 'std': 标准差 (Mean ± Std) -> 阴影最大，容易超线\n",
    "# 'sem': 标准误 (Mean ± Std/sqrt(N)) -> 阴影更窄，聚焦于均值的准确性 (推荐!)\n",
    "# 'ci':  95% 置信区间 (Bootstrap/t-distribution) -> 严谨的统计学区间\n",
    "# 'percentile': 分位数 (10% - 90%) -> 展示数据分布的真实范围\n",
    "\n",
    "SHADOW_TYPE = 'sem' \n",
    "NUM_RUNS = 100  # 你的实验次数\n",
    "\n",
    "def get_error_bounds(data_array, type='sem'):\n",
    "    \"\"\"根据选择的类型计算上下界\"\"\"\n",
    "    mean_val = np.mean(data_array, axis=0)\n",
    "    \n",
    "    if type == 'std':\n",
    "        std_val = np.std(data_array, axis=0)\n",
    "        lower = mean_val - std_val\n",
    "        upper = mean_val + std_val\n",
    "        \n",
    "    elif type == 'sem':\n",
    "        # 标准误 = Std / sqrt(N)\n",
    "        sem_val = stats.sem(data_array, axis=0)\n",
    "        lower = mean_val - sem_val\n",
    "        upper = mean_val + sem_val\n",
    "        \n",
    "    elif type == 'ci':\n",
    "        # 95% t-distribution 置信区间\n",
    "        sem_val = stats.sem(data_array, axis=0)\n",
    "        # ppf(0.975) 对应双侧 95%\n",
    "        ci_scale = stats.t.ppf(0.975, df=NUM_RUNS-1) \n",
    "        lower = mean_val - sem_val * ci_scale\n",
    "        upper = mean_val + sem_val * ci_scale\n",
    "        \n",
    "    elif type == 'percentile':\n",
    "        # 10% - 90% 分位数 (剔除最极端的异常值)\n",
    "        lower = np.percentile(data_array, 10, axis=0)\n",
    "        upper = np.percentile(data_array, 90, axis=0)\n",
    "        \n",
    "    else:\n",
    "        raise ValueError(\"Unknown shadow type\")\n",
    "        \n",
    "    return mean_val, lower, upper\n",
    "\n",
    "# ==========================================\n",
    "# 2. 计算绘图数据\n",
    "# ==========================================\n",
    "steps = np.arange(bpac_risks_arr.shape[1])\n",
    "\n",
    "# --- 计算 Risk ---\n",
    "base_mean, base_low, base_high = get_error_bounds(arr_risks, SHADOW_TYPE)\n",
    "bpac_mean, bpac_low, bpac_high = get_error_bounds(bpac_risks_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 计算 ECP ---\n",
    "base_ecp_mean, base_ecp_low, base_ecp_high = get_error_bounds(arr_expert_ratios, SHADOW_TYPE)\n",
    "bpac_ecp_mean, bpac_ecp_low, bpac_ecp_high = get_error_bounds(bpac_expert_ratios_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 计算 TCP ---\n",
    "base_tcp_mean, base_tcp_low, base_tcp_high = get_error_bounds(arr_token_ratios, SHADOW_TYPE)\n",
    "bpac_tcp_mean, bpac_tcp_low, bpac_tcp_high = get_error_bounds(bpac_token_ratios_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 物理截断 (Clipping) ---\n",
    "# 无论用什么统计方法，Risk 和 Ratio 都不可能小于 0\n",
    "def clip_bounds(low, high, min_v=0, max_v=None):\n",
    "    low = np.maximum(low, min_v)\n",
    "    if max_v is not None:\n",
    "        high = np.minimum(high, max_v)\n",
    "    return low, high\n",
    "\n",
    "base_low, base_high = clip_bounds(base_low, base_high, 0, 1.0) # Risk 通常<1，也可不设上限\n",
    "bpac_low, bpac_high = clip_bounds(bpac_low, bpac_high, 0, 1.0)\n",
    "\n",
    "base_ecp_low, base_ecp_high = clip_bounds(base_ecp_low, base_ecp_high, 0, 1.0)\n",
    "bpac_ecp_low, bpac_ecp_high = clip_bounds(bpac_ecp_low, bpac_ecp_high, 0, 1.0)\n",
    "\n",
    "base_tcp_low, base_tcp_high = clip_bounds(base_tcp_low, base_tcp_high, 0, None)\n",
    "bpac_tcp_low, bpac_tcp_high = clip_bounds(bpac_tcp_low, bpac_tcp_high, 0, None)\n",
    "\n",
    "\n",
    "# ==========================================\n",
    "# 3. 绘图 (ICML Style)\n",
    "# ==========================================\n",
    "# 设置字体和线宽\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,\n",
    "    'axes.labelsize': 16, 'axes.titlesize': 16,\n",
    "    'xtick.labelsize': 14, 'ytick.labelsize': 14,\n",
    "    'legend.fontsize': 13, 'lines.linewidth': 2.5,\n",
    "    'axes.grid': True, 'grid.alpha': 0.3, 'grid.linestyle': '--'\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "# 颜色\n",
    "C_BASE = '#1f77b4'  # Blue\n",
    "C_OURS = '#d62728'  # Red\n",
    "C_TGT = '#2ca02c'   # Green\n",
    "C_WARM = 'gray'\n",
    "\n",
    "# --- (a) Risk ---\n",
    "ax = axes[0]\n",
    "ax.plot(steps, base_mean, color=C_BASE, linestyle='--', label='PAC (Baseline)')\n",
    "ax.fill_between(steps, base_low, base_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_mean, color=C_OURS, linestyle='-', label='B-PAC (Ours)')\n",
    "ax.fill_between(steps, bpac_low, bpac_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=TARGET_EPSILON, color=C_TGT, linestyle='-', linewidth=2, label=r'Target $\\epsilon$')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "# ax.set_title('(a) Cumulative Risk')\n",
    "ax.set_ylabel('ER')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, TARGET_EPSILON * 2.0) # 视野聚焦\n",
    "# 图例中说明阴影含义\n",
    "if SHADOW_TYPE == 'sem':\n",
    "    shadow_label = 'Shaded: SEM'\n",
    "elif SHADOW_TYPE == 'std':\n",
    "    shadow_label = 'Shaded: Std. Dev.'\n",
    "elif SHADOW_TYPE == 'ci':\n",
    "    shadow_label = 'Shaded: 95% CI'\n",
    "else:\n",
    "    shadow_label = 'Shaded: 10-90% Pctl'\n",
    "    \n",
    "# 创建一个空的 handle 来显示阴影说明\n",
    "from matplotlib.lines import Line2D\n",
    "handles, labels = ax.get_legend_handles_labels()\n",
    "handles.append(Line2D([0], [0], color='gray', alpha=0.3, linewidth=10))\n",
    "# labels.append(shadow_label)\n",
    "ax.legend(handles, labels, loc='upper right', frameon=True, framealpha=0.9)\n",
    "\n",
    "\n",
    "# --- (b) ECP ---\n",
    "ax = axes[1]\n",
    "ax.plot(steps, base_ecp_mean, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_ecp_low, base_ecp_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_ecp_mean, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_ecp_low, bpac_ecp_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "# ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2, label='Warm-up End')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "# ax.set_title('(b) Expert Call Rate (ECP)')\n",
    "ax.set_ylabel('ECP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 1.05)\n",
    "ax.legend(loc='upper right')\n",
    "\n",
    "# --- (c) TCP ---\n",
    "ax = axes[2]\n",
    "ax.plot(steps, base_tcp_mean, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_tcp_low, base_tcp_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_tcp_mean, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_tcp_low, bpac_tcp_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=1.0, color='black', linestyle='-.', linewidth=1.5, label='Full Expert')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "# ax.set_title('(c) Token Cost Ratio (TCP)')\n",
    "ax.set_ylabel('TCP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('icml_final_plot.pdf', bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "61dbdd98",
   "metadata": {},
   "source": [
    "## gemma3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa35341b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "expert_data3 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/qwen3-think.json\")\n",
    "\n",
    "expert_data = expert_data3[expert_data3[\"matched\"] == True]\n",
    "session_ids = list(expert_data[\"session_id\"])\n",
    "\n",
    "instant_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/gemma3-ins.json\")\n",
    "instant_data = instant_data1[instant_data1[\"session_id\"].isin(session_ids)]\n",
    "len(list(set(instant_data[\"session_id\"]))), len(list(set(expert_data[\"session_id\"])))\n",
    "instant_data.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b08ddcfe",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "data_list =[]\n",
    "selcet_ids =[]\n",
    "for i,row in expert_data.iterrows():\n",
    "    session_id = row['session_id']\n",
    "    if session_id not in instant_data['session_id'].values:\n",
    "        print(f\"Instant data missing at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict = {}\n",
    "    instant_row = instant_data[instant_data['session_id'] == session_id].iloc[0]\n",
    "    selcet_ids.append(session_id)\n",
    "    if instant_row[\"question\"] != row[\"question\"]:\n",
    "        print(f\"Question mismatch at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict['uncertainty'] = 1- instant_row['verbalized_prob']\n",
    "    tmp_dict['instant_correct'] = int(instant_row['matched']) if instant_row['matched'] != \"No answer extracted\" else 0\n",
    "    tmp_dict['expert_correct'] = int(expert_data.loc[i, 'matched'])\n",
    "    tmp_dict['instant_token'] = instant_row['gen_token_count']\n",
    "    tmp_dict['expert_token'] = expert_data.loc[i, 'gen_token_count']\n",
    "    data_list.append(tmp_dict)\n",
    "\n",
    "print(f\"Total samples prepared: {len(data_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "220a449b",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "# =========================\n",
    "# 1. 读取与筛选数据\n",
    "# =========================\n",
    "data1 = pd.read_json(\n",
    "    \"/home/-/-/pac/zeroeval/result_dirs_parsed/bbh/gemma3-ins.json\"\n",
    ")\n",
    "\n",
    "data2 = data1[data1[\"session_id\"].isin(selcet_ids)]\n",
    "df = data2.copy()\n",
    "\n",
    "# 统一使用一个 scalar 置信度\n",
    "df[\"token_prob_scalar\"] = df[\"verbalized_prob\"]\n",
    "\n",
    "# =========================\n",
    "# 2. 按 matched 分组\n",
    "# =========================\n",
    "tp_true = df.loc[df[\"matched\"] == True, \"token_prob_scalar\"].dropna()\n",
    "tp_false = df.loc[df[\"matched\"] == False, \"token_prob_scalar\"].dropna()\n",
    "\n",
    "# =========================\n",
    "# 3. 置信度分布直方图\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.hist(tp_true, bins=30, alpha=0.6, label=\"matched=True\")\n",
    "plt.hist(tp_false, bins=30, alpha=0.6, label=\"matched=False\")\n",
    "plt.xlabel(\"token_prob_scalar\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.title(\"Token Probability Distribution by Matched\")\n",
    "plt.legend()\n",
    "plt.show()\n",
    "\n",
    "# =========================\n",
    "# 4. Boxplot\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.boxplot([tp_true, tp_false], labels=[\"Matched=True\", \"Matched=False\"])\n",
    "plt.ylabel(\"token_prob_scalar\")\n",
    "plt.title(\"Token Probability by Matched\")\n",
    "plt.show()\n",
    "\n",
    "\n",
    "# =====================================================================\n",
    "# 第二部分：Instant / Expert 不确定性分析\n",
    "# =====================================================================\n",
    "\n",
    "# data_list: list[dict]\n",
    "instant_tokens = [item[\"instant_token\"] for item in data_list]\n",
    "expert_tokens = [item[\"expert_token\"] for item in data_list]\n",
    "\n",
    "uncertainty_scores = [item[\"uncertainty\"] for item in data_list]\n",
    "correct_instant = [item[\"instant_correct\"] for item in data_list]\n",
    "correct_expert = [item[\"expert_correct\"] for item in data_list]\n",
    "\n",
    "# =========================\n",
    "# 5. 准确率\n",
    "# =========================\n",
    "accuracy_instant = sum(correct_instant) / len(correct_instant)\n",
    "accuracy_expert = sum(correct_expert) / len(correct_expert)\n",
    "\n",
    "print(f\"Instant Model Accuracy: {accuracy_instant:.4f}\")\n",
    "print(f\"Expert Model Accuracy:  {accuracy_expert:.4f}\")\n",
    "\n",
    "expert_tokens = [item['expert_token'] for item in data_list]\n",
    "instant_tokens = [item['instant_token'] for item in data_list]\n",
    "mean_expert_tokens = np.mean(expert_tokens)\n",
    "mean_instant_tokens = np.mean(instant_tokens)\n",
    "print(f\"Mean Expert Tokens: {mean_expert_tokens}, Mean Instant Tokens: {mean_instant_tokens}\")\n",
    "\n",
    "\n",
    "# =========================\n",
    "# 6. Uncertainty 直方图\n",
    "# =========================\n",
    "plt.figure()\n",
    "plt.hist(uncertainty_scores, bins=50)\n",
    "plt.xlabel(\"Uncertainty\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.title(\"Uncertainty Distribution\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "411ee638",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# 运行配置\n",
    "cfg = BPACConfig(epsilon=0.1, alpha=0.1, rho=0.1,warm_up=0)\n",
    "\n",
    "# 执行模拟\n",
    "df_result, model = run_simulation(data_list1, cfg)\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "def plot_bpac_results(df_logs, config):\n",
    "    \"\"\"\n",
    "    绘制 B-PAC 实验结果面板\n",
    "    \"\"\"\n",
    "    if df_logs.empty:\n",
    "        print(\"Log is empty. Check warmup settings or data.\")\n",
    "        return\n",
    "\n",
    "    fig, axes = plt.subplots(4, 1, figsize=(12, 16), sharex=True)\n",
    "    \n",
    "    # ----------------------------------\n",
    "    # Subplot 1: Threshold Adaptation\n",
    "    # ----------------------------------\n",
    "    ax = axes[0]\n",
    "    ax.plot(df_logs['step'], df_logs['threshold'], label='Threshold ($u_t$)', color='#1f77b4', linewidth=2)\n",
    "    # 可以在背景里画出 uncertainty 的散点，展示数据分布（可选）\n",
    "    # ax.scatter(df_logs['step'], df_logs['uncertainty'], alpha=0.1, color='gray', s=1, label='Input Uncertainty')\n",
    "    ax.set_ylabel('Uncertainty Score')\n",
    "    ax.set_title(f'Threshold Adaptation (Confidence {1-config.alpha:.0%})')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "    ax.set_ylim(-0.05, 1.05)\n",
    "\n",
    "    # ----------------------------------\n",
    "    # Subplot 2: Safety (Risk Control)\n",
    "    # ----------------------------------\n",
    "    ax = axes[1]\n",
    "    ax.plot(df_logs['step'], df_logs['avg_risk'], color='#d62728', label='Cumulative Avg Risk', linewidth=2)\n",
    "    ax.axhline(y=config.epsilon, color='black', linestyle='--', linewidth=2, label=f'Target Risk ($\\epsilon={config.epsilon}$)')\n",
    "    ax.set_ylabel('Risk Rate')\n",
    "    ax.set_title('Safety Guarantee: Realized Risk vs. Target')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "    \n",
    "    # ----------------------------------\n",
    "    # Subplot 3: Efficiency (Token & Expert Ratio)\n",
    "    # ----------------------------------\n",
    "    ax = axes[2]\n",
    "    ax.plot(df_logs['step'], df_logs['token_ratio'], color='#2ca02c', label='Token Ratio (vs. All-Expert)', linewidth=2)\n",
    "    ax.plot(df_logs['step'], df_logs['expert_call_ratio'], color='#ff7f0e', linestyle='-.', label='Expert Call Ratio', linewidth=2)\n",
    "    ax.set_ylabel('Ratio')\n",
    "    ax.set_title('Efficiency: Cost Reduction')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "    \n",
    "    # ----------------------------------\n",
    "    # Subplot 4: Stability (Wealth Process)\n",
    "    # ----------------------------------\n",
    "    ax = axes[3]\n",
    "    ax.plot(df_logs['step'], df_logs['wealth'], color='#9467bd', label='Wealth of Selected Threshold ($K_t$)', linewidth=1.5)\n",
    "    # 画出安全线 1/alpha\n",
    "    safe_wealth = 1.0 / config.alpha\n",
    "    ax.axhline(y=safe_wealth, color='purple', linestyle=':', label=f'Safety Barrier ($1/\\\\alpha={safe_wealth:.1f}$)')\n",
    "    \n",
    "    ax.set_ylabel('Wealth Value')\n",
    "    ax.set_xlabel('Simulation Step (post-warmup)')\n",
    "    ax.set_title('Martingale Wealth Process')\n",
    "    ax.set_yscale('log') # 财富值通常是指数增长的，用对数坐标更好看\n",
    "    ax.legend(loc='upper left')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "plot_bpac_results(df_result, cfg)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b424399a",
   "metadata": {},
   "source": [
    "### compare to pac"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f109c064",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm \n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 0. 准备全量数据 (只做一次)\n",
    "# -------------------------------------------------------------------------\n",
    "# 假设 data_list 已经准备好\n",
    "y_solved = [item['expert_correct'] for item in data_list]\n",
    "y_hat_solved = [item['instant_correct'] for item in data_list]\n",
    "uncertainty_values = [item['uncertainty'] for item in data_list]\n",
    "y_token_list = [item['expert_token'] for item in data_list]\n",
    "y_hat_token_list = [item['instant_token'] for item in data_list]\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100         # 重复次数\n",
    "TARGET_EPSILON = 0.05   # 目标 Risk\n",
    "TARGET_ALPHA = 0.05     # 置信度\n",
    "CALIB_NUM = 500        # 校准集大小\n",
    "TOTAL_NUM = len(y_solved)\n",
    "calib_ratio = CALIB_NUM / TOTAL_NUM\n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 1. 循环运行 Simulation\n",
    "# -------------------------------------------------------------------------\n",
    "\n",
    "# 用于存储每次实验的完整序列\n",
    "# 维度: (100, N_samples)\n",
    "all_risks = []\n",
    "all_token_ratios = []\n",
    "all_expert_ratios = []\n",
    "all_u_hats = []\n",
    "\n",
    "print(f\"Starting {NUM_RUNS} runs simulation...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # 每次使用不同的随机种子 (seed=0, 1, 2, ..., 99)\n",
    "    # 这样 run_baseline_continuous 内部的 shuffle 结果每次都不一样\n",
    "    df, u_hat = run_baseline_continuous(\n",
    "        y_solved, \n",
    "        y_hat_solved, \n",
    "        uncertainty_values, \n",
    "        y_token_list, \n",
    "        y_hat_token_list,\n",
    "        calib_ratio=calib_ratio, \n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=TARGET_ALPHA,\n",
    "        seed=seed\n",
    "    )\n",
    "    \n",
    "    # 记录整条曲线 (df['avg_risk'] 是一个 Series)\n",
    "    all_risks.append(df['avg_risk'].values)\n",
    "    all_token_ratios.append(df['token_ratio'].values)\n",
    "    all_expert_ratios.append(df['expert_call_ratio'].values)\n",
    "    all_u_hats.append(u_hat)\n",
    "\n",
    "# 转为 Numpy Array 方便计算均值方差\n",
    "# shape = (100, N)\n",
    "arr_risks = np.array(all_risks)\n",
    "arr_token_ratios = np.array(all_token_ratios)\n",
    "arr_expert_ratios = np.array(all_expert_ratios)\n",
    "arr_thresholds = np.array(all_u_hats)\n",
    "\n",
    "print(\"\\nSimulation Finished!\")\n",
    "print(f\"Average u_hat across runs: {np.mean(all_u_hats):.4f} (std: {np.std(all_u_hats):.4f})\")\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100        # 重复 100 次\n",
    "WARM_UP_STEPS = 0   # BPAC 的 Warm-up 步数 (对应 Baseline 的校准集大小)\n",
    "TARGET_EPSILON = 0.05  # 目标 Risk\n",
    "CFG_ALPHA= 0.05       # BPAC 参数\n",
    "CFG_RHO = 0.1         # BPAC 参数\n",
    "BPA_beta = 1\n",
    "\n",
    "bpac_risks = []\n",
    "bpac_token_ratios = []\n",
    "bpac_expert_ratios = []\n",
    "bpac_wealths = []\n",
    "bpac_thresholds = []\n",
    "\n",
    "print(f\"Starting BPAC Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # -----------------------------------------------------------\n",
    "    # 1. 严格的数据对齐 (Data Alignment)\n",
    "    # -----------------------------------------------------------\n",
    "    # 使用与 Baseline 相同的种子生成随机数生成器\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # 生成打乱的索引\n",
    "    n = len(data_list)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # 关键修正：根据 indices 重排 data_list\n",
    "    # 注意：data_list 是 list of dicts，不能直接用 indices 索引，除非转 numpy\n",
    "    # 这里用列表推导式最稳妥\n",
    "    shuffled_data = [data_list[i] for i in indices]\n",
    "    \n",
    "    # -----------------------------------------------------------\n",
    "    # 2. 配置并运行 BPAC\n",
    "    # -----------------------------------------------------------\n",
    "    # 你的 BPACConfig\n",
    "    cfg = BPACConfig(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "        beta=BPA_beta, \n",
    "        c_clip=0.9\n",
    "    )\n",
    "    \n",
    "    # 运行模拟 (传入打乱后的数据)\n",
    "    df_result, _ = run_simulation(shuffled_data, cfg)\n",
    "    \n",
    "    bpac_risks.append(df_result['avg_risk'].values)\n",
    "    bpac_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpac_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpac_wealths.append(df_result['wealth'].values)\n",
    "    bpac_thresholds.append(df_result['threshold'].values)\n",
    "\n",
    "\n",
    "# 转换为 Numpy 数组方便计算均值\n",
    "bpac_risks_arr = np.array(bpac_risks)       # Shape: (100, N)\n",
    "bpac_token_ratios_arr = np.array(bpac_token_ratios) # Shape: (100, N)\n",
    "bpac_expert_ratios_arr = np.array(bpac_expert_ratios) # Shape: (100, N)\n",
    "bpac_wealths_arr = np.array(bpac_wealths)   # Shape: (100, N)\n",
    "bpac_thresholds_arr = np.array(bpac_thresholds) # Shape: (100, N)\n",
    "print(\"BPAC Simulation Finished!\")\n",
    "\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Baseline 数据\n",
    "base_mean_risk = np.mean(arr_risks, axis=0)\n",
    "base_std_risk = np.std(arr_risks, axis=0)\n",
    "\n",
    "base_mean_token = np.mean(arr_token_ratios, axis=0)\n",
    "base_std_token = np.std(arr_token_ratios, axis=0)\n",
    "\n",
    "base_mean_expert = np.mean(arr_expert_ratios, axis=0)\n",
    "base_std_expert = np.std(arr_expert_ratios, axis=0)\n",
    "\n",
    "# 获取总时间步数\n",
    "N_STEPS = bpac_risks_arr.shape[1] \n",
    "\n",
    "# A. 重构 Baseline Threshold 序列 (因为你只存了最终的一个 u_hat)\n",
    "# 逻辑：在校准期 (前 CALIB_NUM 步) 阈值设为 0，之后设为 u_hat\n",
    "# arr_thresholds 是 (100,) 的标量数组\n",
    "base_threshold_seqs = np.zeros((NUM_RUNS, N_STEPS))\n",
    "for i in range(NUM_RUNS):\n",
    "    base_threshold_seqs[i, :CALIB_NUM] = 0.0  # 校准期阈值通常视为0 (强制Action=1)\n",
    "    base_threshold_seqs[i, CALIB_NUM:] = arr_thresholds[i] # 测试期应用 u_hat\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ce52500",
   "metadata": {},
   "outputs": [],
   "source": [
    "base_mean_threshold_seq = np.mean(base_threshold_seqs, axis=0)\n",
    "base_std_threshold_seq = np.std(base_threshold_seqs, axis=0)\n",
    "\n",
    "\n",
    "# BPAC 数据\n",
    "bpac_mean_risk = np.mean(bpac_risks_arr, axis=0)\n",
    "bpac_std_risk = np.std(bpac_risks_arr, axis=0)\n",
    "\n",
    "bpac_mean_token = np.mean(bpac_token_ratios_arr, axis=0)\n",
    "bpac_std_token = np.std(bpac_token_ratios_arr, axis=0)\n",
    "\n",
    "bpac_mean_expert = np.mean(bpac_expert_ratios_arr, axis=0)\n",
    "bpac_std_expert = np.std(bpac_expert_ratios_arr, axis=0)\n",
    "\n",
    "# C. 确保其他 BPAC 统计量已准备好 (以防万一)\n",
    "bpac_mean_threshold = np.mean(bpac_thresholds_arr, axis=0)\n",
    "bpac_std_threshold = np.std(bpac_thresholds_arr, axis=0)\n",
    "\n",
    "# B. 计算 BPAC Wealth 统计 (你存了 bpac_wealths_arr 但还没算均值)\n",
    "bpac_mean_wealth = np.mean(bpac_wealths_arr, axis=0)\n",
    "bpac_std_wealth = np.std(bpac_wealths_arr, axis=0)\n",
    "\n",
    "\n",
    "fig, axes = plt.subplots(5, 1, figsize=(12, 24), sharex=True)\n",
    "(ax1, ax2, ax3, ax4, ax5) = axes\n",
    "\n",
    "# X轴\n",
    "steps = np.arange(N_STEPS)\n",
    "\n",
    "# === 子图 1: Average Risk (风险控制) ===\n",
    "ax1.plot(steps, base_mean_risk, label='Baseline', color='blue', linestyle='--')\n",
    "ax1.fill_between(steps, base_mean_risk - base_std_risk, base_mean_risk + base_std_risk, color='blue', alpha=0.1)\n",
    "\n",
    "ax1.plot(steps, bpac_mean_risk, label='BPAC (Ours)', color='red', linewidth=2)\n",
    "ax1.fill_between(steps, bpac_mean_risk - bpac_std_risk, bpac_mean_risk + bpac_std_risk, color='red', alpha=0.1)\n",
    "\n",
    "ax1.axhline(y=TARGET_EPSILON, color='green', linestyle='-', linewidth=2, label=f'Target $\\epsilon={TARGET_EPSILON}$')\n",
    "ax1.axvline(x=CALIB_NUM, color='gray', linestyle=':', label='Calibration End')\n",
    "\n",
    "ax1.set_ylabel('Avg Risk')\n",
    "ax1.set_title(f'1. Risk Control (Target $\\epsilon < {TARGET_EPSILON}$)')\n",
    "ax1.legend(loc='upper right')\n",
    "ax1.grid(True, alpha=0.3)\n",
    "ax1.set_ylim(0, TARGET_EPSILON * 3.0) # 聚焦在 Epsilon 附近\n",
    "\n",
    "# === 子图 2: Token Ratio (成本节省) ===\n",
    "ax2.plot(steps, base_mean_token, label='Baseline', color='blue', linestyle='--')\n",
    "ax2.fill_between(steps, base_mean_token - base_std_token, base_mean_token + base_std_token, color='blue', alpha=0.1)\n",
    "\n",
    "ax2.plot(steps, bpac_mean_token, label='BPAC', color='red', linewidth=2)\n",
    "ax2.fill_between(steps, bpac_mean_token - bpac_std_token, bpac_mean_token + bpac_std_token, color='red', alpha=0.1)\n",
    "\n",
    "ax2.axhline(y=1.0, color='black', linestyle=':')\n",
    "ax2.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax2.set_ylabel('Token Ratio')\n",
    "ax2.set_title('2. Cost Efficiency (Lower is Better)')\n",
    "ax2.grid(True, alpha=0.3)\n",
    "\n",
    "# === 子图 3: Expert Call Ratio (专家调用率) ===\n",
    "ax3.plot(steps, base_mean_expert, label='Baseline', color='blue', linestyle='--')\n",
    "ax3.fill_between(steps, base_mean_expert - base_std_expert, base_mean_expert + base_std_expert, color='blue', alpha=0.1)\n",
    "\n",
    "ax3.plot(steps, bpac_mean_expert, label='BPAC', color='red', linewidth=2)\n",
    "ax3.fill_between(steps, bpac_mean_expert - bpac_std_expert, bpac_mean_expert + bpac_std_expert, color='red', alpha=0.1)\n",
    "\n",
    "ax3.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax3.set_ylabel('Expert Call Ratio')\n",
    "ax3.set_title('3. Expert Usage Behavior')\n",
    "ax3.grid(True, alpha=0.3)\n",
    "ax3.set_ylim(0, 1.1)\n",
    "\n",
    "# === 子图 4: Threshold Evolution (阈值变化) ===\n",
    "# Baseline: 前期是0，后期是固定值\n",
    "ax4.plot(steps, base_mean_threshold_seq, label='Baseline $\\hat{u}$', color='blue', linestyle='--')\n",
    "ax4.fill_between(steps, \n",
    "                 base_mean_threshold_seq - base_std_threshold_seq, \n",
    "                 base_mean_threshold_seq + base_std_threshold_seq, \n",
    "                 color='blue', alpha=0.1)\n",
    "\n",
    "# BPAC: 动态变化\n",
    "ax4.plot(steps, bpac_mean_threshold, label='BPAC $u_t$', color='red', linewidth=1.5)\n",
    "ax4.fill_between(steps, \n",
    "                 bpac_mean_threshold - bpac_std_threshold, \n",
    "                 bpac_mean_threshold + bpac_std_threshold, \n",
    "                 color='red', alpha=0.1)\n",
    "\n",
    "ax4.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax4.set_ylabel('Uncertainty Threshold')\n",
    "ax4.set_title('4. Threshold Adaptation (Dynamic vs Fixed)')\n",
    "ax4.legend(loc='upper right')\n",
    "ax4.grid(True, alpha=0.3)\n",
    "ax4.set_ylim(0, 1.0) \n",
    "\n",
    "# === 子图 5: Wealth Evolution (财富积累) ===\n",
    "# 只有 BPAC 有 Wealth\n",
    "ax5.plot(steps, bpac_mean_wealth, label='BPAC Wealth', color='purple', linewidth=1.5)\n",
    "ax5.fill_between(steps, \n",
    "                 bpac_mean_wealth - bpac_std_wealth, \n",
    "                 bpac_mean_wealth + bpac_std_wealth, \n",
    "                 color='purple', alpha=0.1)\n",
    "\n",
    "ax5.axhline(y=1.0, color='black', linestyle='--', label='Initial Wealth (1.0)')\n",
    "ax5.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "# 如果财富增长非常快，建议开启对数坐标\n",
    "# ax5.set_yscale('log') \n",
    "\n",
    "ax5.set_ylabel('Wealth')\n",
    "ax5.set_xlabel('Time Step')\n",
    "ax5.set_title('5. BPAC Wealth Accumulation')\n",
    "ax5.legend(loc='upper left')\n",
    "ax5.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1af43c88",
   "metadata": {},
   "source": [
    "#### paper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58d8cacb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "\n",
    "# ==========================================\n",
    "# 1. 设置阴影类型 (ICML 关键设置)\n",
    "# ==========================================\n",
    "# 选项: \n",
    "# 'std': 标准差 (Mean ± Std) -> 阴影最大，容易超线\n",
    "# 'sem': 标准误 (Mean ± Std/sqrt(N)) -> 阴影更窄，聚焦于均值的准确性 (推荐!)\n",
    "# 'ci':  95% 置信区间 (Bootstrap/t-distribution) -> 严谨的统计学区间\n",
    "# 'percentile': 分位数 (10% - 90%) -> 展示数据分布的真实范围\n",
    "\n",
    "SHADOW_TYPE = 'sem' \n",
    "NUM_RUNS = 100  # 你的实验次数\n",
    "\n",
    "def get_error_bounds(data_array, type='sem'):\n",
    "    \"\"\"根据选择的类型计算上下界\"\"\"\n",
    "    mean_val = np.mean(data_array, axis=0)\n",
    "    \n",
    "    if type == 'std':\n",
    "        std_val = np.std(data_array, axis=0)\n",
    "        lower = mean_val - std_val\n",
    "        upper = mean_val + std_val\n",
    "        \n",
    "    elif type == 'sem':\n",
    "        # 标准误 = Std / sqrt(N)\n",
    "        sem_val = stats.sem(data_array, axis=0)\n",
    "        lower = mean_val - sem_val\n",
    "        upper = mean_val + sem_val\n",
    "        \n",
    "    elif type == 'ci':\n",
    "        # 95% t-distribution 置信区间\n",
    "        sem_val = stats.sem(data_array, axis=0)\n",
    "        # ppf(0.975) 对应双侧 95%\n",
    "        ci_scale = stats.t.ppf(0.975, df=NUM_RUNS-1) \n",
    "        lower = mean_val - sem_val * ci_scale\n",
    "        upper = mean_val + sem_val * ci_scale\n",
    "        \n",
    "    elif type == 'percentile':\n",
    "        # 10% - 90% 分位数 (剔除最极端的异常值)\n",
    "        lower = np.percentile(data_array, 10, axis=0)\n",
    "        upper = np.percentile(data_array, 90, axis=0)\n",
    "        \n",
    "    else:\n",
    "        raise ValueError(\"Unknown shadow type\")\n",
    "        \n",
    "    return mean_val, lower, upper\n",
    "\n",
    "# ==========================================\n",
    "# 2. 计算绘图数据\n",
    "# ==========================================\n",
    "steps = np.arange(bpac_risks_arr.shape[1])\n",
    "\n",
    "# --- 计算 Risk ---\n",
    "base_mean, base_low, base_high = get_error_bounds(arr_risks, SHADOW_TYPE)\n",
    "bpac_mean, bpac_low, bpac_high = get_error_bounds(bpac_risks_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 计算 ECP ---\n",
    "base_ecp_mean, base_ecp_low, base_ecp_high = get_error_bounds(arr_expert_ratios, SHADOW_TYPE)\n",
    "bpac_ecp_mean, bpac_ecp_low, bpac_ecp_high = get_error_bounds(bpac_expert_ratios_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 计算 TCP ---\n",
    "base_tcp_mean, base_tcp_low, base_tcp_high = get_error_bounds(arr_token_ratios, SHADOW_TYPE)\n",
    "bpac_tcp_mean, bpac_tcp_low, bpac_tcp_high = get_error_bounds(bpac_token_ratios_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- 物理截断 (Clipping) ---\n",
    "# 无论用什么统计方法，Risk 和 Ratio 都不可能小于 0\n",
    "def clip_bounds(low, high, min_v=0, max_v=None):\n",
    "    low = np.maximum(low, min_v)\n",
    "    if max_v is not None:\n",
    "        high = np.minimum(high, max_v)\n",
    "    return low, high\n",
    "\n",
    "base_low, base_high = clip_bounds(base_low, base_high, 0, 1.0) # Risk 通常<1，也可不设上限\n",
    "bpac_low, bpac_high = clip_bounds(bpac_low, bpac_high, 0, 1.0)\n",
    "\n",
    "base_ecp_low, base_ecp_high = clip_bounds(base_ecp_low, base_ecp_high, 0, 1.0)\n",
    "bpac_ecp_low, bpac_ecp_high = clip_bounds(bpac_ecp_low, bpac_ecp_high, 0, 1.0)\n",
    "\n",
    "base_tcp_low, base_tcp_high = clip_bounds(base_tcp_low, base_tcp_high, 0, None)\n",
    "bpac_tcp_low, bpac_tcp_high = clip_bounds(bpac_tcp_low, bpac_tcp_high, 0, None)\n",
    "\n",
    "\n",
    "# ==========================================\n",
    "# 3. 绘图 (ICML Style)\n",
    "# ==========================================\n",
    "# 设置字体和线宽\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,\n",
    "    'axes.labelsize': 16, 'axes.titlesize': 16,\n",
    "    'xtick.labelsize': 14, 'ytick.labelsize': 14,\n",
    "    'legend.fontsize': 13, 'lines.linewidth': 2.5,\n",
    "    'axes.grid': True, 'grid.alpha': 0.3, 'grid.linestyle': '--'\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "# 颜色\n",
    "C_BASE = '#1f77b4'  # Blue\n",
    "C_OURS = '#d62728'  # Red\n",
    "C_TGT = '#2ca02c'   # Green\n",
    "C_WARM = 'gray'\n",
    "\n",
    "# --- (a) Risk ---\n",
    "ax = axes[0]\n",
    "ax.plot(steps, base_mean, color=C_BASE, linestyle='--', label='PAC (Baseline)')\n",
    "ax.fill_between(steps, base_low, base_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_mean, color=C_OURS, linestyle='-', label='B-PAC (Ours)')\n",
    "ax.fill_between(steps, bpac_low, bpac_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=TARGET_EPSILON, color=C_TGT, linestyle='-', linewidth=2, label=r'Target $\\epsilon$')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "# ax.set_title('(a) Cumulative Risk')\n",
    "ax.set_ylabel('ER')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, TARGET_EPSILON * 2.0) # 视野聚焦\n",
    "# 图例中说明阴影含义\n",
    "if SHADOW_TYPE == 'sem':\n",
    "    shadow_label = 'Shaded: SEM'\n",
    "elif SHADOW_TYPE == 'std':\n",
    "    shadow_label = 'Shaded: Std. Dev.'\n",
    "elif SHADOW_TYPE == 'ci':\n",
    "    shadow_label = 'Shaded: 95% CI'\n",
    "else:\n",
    "    shadow_label = 'Shaded: 10-90% Pctl'\n",
    "    \n",
    "# 创建一个空的 handle 来显示阴影说明\n",
    "from matplotlib.lines import Line2D\n",
    "handles, labels = ax.get_legend_handles_labels()\n",
    "handles.append(Line2D([0], [0], color='gray', alpha=0.3, linewidth=10))\n",
    "# labels.append(shadow_label)\n",
    "ax.legend(handles, labels, loc='upper right', frameon=True, framealpha=0.9)\n",
    "\n",
    "\n",
    "# --- (b) ECP ---\n",
    "ax = axes[1]\n",
    "ax.plot(steps, base_ecp_mean, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_ecp_low, base_ecp_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_ecp_mean, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_ecp_low, bpac_ecp_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "# ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2, label='Warm-up End')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "# ax.set_title('(b) Expert Call Rate (ECP)')\n",
    "ax.set_ylabel('ECP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 1.05)\n",
    "ax.legend(loc='upper right')\n",
    "\n",
    "# --- (c) TCP ---\n",
    "ax = axes[2]\n",
    "ax.plot(steps, base_tcp_mean, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_tcp_low, base_tcp_high, color=C_BASE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_tcp_mean, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_tcp_low, bpac_tcp_high, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=1.0, color='black', linestyle='-.', linewidth=1.5, label='Full Expert')\n",
    "ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "# ax.set_title('(c) Token Cost Ratio (TCP)')\n",
    "ax.set_ylabel('TCP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('icml_final_plot.pdf', bbox_inches='tight')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "vllm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
