{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "54640d6f",
   "metadata": {},
   "source": [
    "# 算法核心"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c822a45f",
   "metadata": {},
   "source": [
    "## Ours (B-PAC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70350301",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from dataclasses import dataclass\n",
    "from typing import List, Optional, Tuple, Dict\n",
    "\n",
    "@dataclass\n",
    "class BPACConfig:\n",
    "    \"\"\"\n",
    "    Hyperparameters for the B-PAC algorithm.\n",
    "\n",
    "    Note: run_simulation sets the exploration probability used at each step to\n",
    "    rho_1 before change_point and to rho_0 afterwards, overriding rho.\n",
    "    \"\"\"\n",
    "    # Failure probability (1 - confidence), e.g. 0.1 means 90% confidence.\n",
    "    # BPAC.update treats a threshold as safe once its wealth reaches 1 / alpha.\n",
    "    alpha: float = 0.1\n",
    "    # Risk budget (error tolerance), e.g. 0.05 allows a 5% performance loss.\n",
    "    epsilon: float = 0.1\n",
    "    # Minimum exploration probability, between 0 and 1: the chance of calling the\n",
    "    # expert when the uncertainty is below the current threshold.\n",
    "    rho: float = 0.1\n",
    "    # FTRL regularization parameter, from 0 to infinity; added to the sum of\n",
    "    # squared payoffs in the denominator of the betting fraction.\n",
    "    beta: float = 1.0\n",
    "    # Bet clipping constant, between 0 and 1; the betting fraction is capped at\n",
    "    # c_clip / M_t in BPAC.update.\n",
    "    c_clip: float = 0.9\n",
    "    # Resolution of the threshold search space: number of candidates evenly\n",
    "    # spaced on [0, 1].\n",
    "    num_thresholds: int = 1001\n",
    "    # Number of initial warm-up steps; run_simulation still updates the model on\n",
    "    # them but leaves them out of the logged metrics.\n",
    "    warm_up: int = 50\n",
    "    # Exploration probability used by run_simulation from step change_point on;\n",
    "    # also the constant in the (1 - rho_0) scaling inside BPAC.update.\n",
    "    rho_0: float = 0.05\n",
    "    # Exploration probability used by run_simulation before step change_point.\n",
    "    rho_1: float = 0.6\n",
    "    # Step index at which run_simulation switches from rho_1 to rho_0.\n",
    "    change_point: int = 200\n",
    "\n",
    "def compute_step_loss(y_correct_t: int, y_hat_correct_t: int, max_val: float) -> float:\n",
    "    \"\"\"\n",
    "    单步 Loss 计算（修改版）\n",
    "    \n",
    "    新公式：\n",
    "    loss = (y_correct_t - y_hat_correct_t) / max_val\n",
    "    \n",
    "    参数:\n",
    "        y_correct_t: int, 1-10, 表示专家的分数\n",
    "        y_hat_correct_t: int, 1-10 表示小模型的分数\n",
    "        max_val: float, 用于归一化的分母（通常 > 0），当前最大的可能分数差值。\n",
    "    \n",
    "    返回:\n",
    "        float, 计算得到的单步 loss 值\n",
    "    \"\"\"\n",
    "    if max_val == 0:\n",
    "        raise ValueError(\"max_val cannot be zero to avoid division by zero\")\n",
    "    y_correct_t = np.asanyarray(y_correct_t,dtype=float)\n",
    "    y_hat_correct_t = np.asanyarray(y_hat_correct_t,dtype=float)\n",
    "    loss = np.sqrt((y_correct_t - y_hat_correct_t) / max_val)\n",
    "    # k = 1\n",
    "    # loss = (1-np.exp(-k*(y_correct_t - y_hat_correct_t))) / (1 - np.exp(-k*max_val))\n",
    "\n",
    "    return loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d92b113c",
   "metadata": {},
   "outputs": [],
   "source": [
    "class BPAC:\n",
    "    \"\"\"\n",
    "    Betting-based selection of an uncertainty threshold under bandit feedback.\n",
    "\n",
    "    One wealth process is kept per candidate threshold u. Inputs whose uncertainty\n",
    "    is at least the current threshold always go to the expert; the others go to\n",
    "    the expert with probability cfg.rho.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, config: BPACConfig):\n",
    "        self.cfg = config\n",
    "        n_candidates = self.cfg.num_thresholds\n",
    "        self.threshold_candidates = np.linspace(0, 1, n_candidates)\n",
    "\n",
    "        # Start at the first candidate (u = 0).\n",
    "        self.current_u_idx = 0\n",
    "        self.current_u = self.threshold_candidates[self.current_u_idx]\n",
    "        # Every wealth process starts at K_0 = 1.\n",
    "        self.wealth = np.ones(n_candidates)\n",
    "\n",
    "        # FTRL statistics: running sums of payoffs and of squared payoffs.\n",
    "        self.sum_D = np.zeros(n_candidates)\n",
    "        self.sum_D_sq = np.zeros(n_candidates)\n",
    "\n",
    "    def get_action(self, uncertainty_score: float):\n",
    "        \"\"\"\n",
    "        Sample the routing decision for one input.\n",
    "\n",
    "        Policy: pi_t = I(U >= u) + rho * I(U < u)\n",
    "\n",
    "        Returns:\n",
    "            action (int): 1 (Expert), 0 (Instant)\n",
    "            propensity (float): probability of choosing Expert (pi_t)\n",
    "        \"\"\"\n",
    "        if not (uncertainty_score >= self.current_u):\n",
    "            # Below the threshold: use the small model, but explore with prob. rho.\n",
    "            explore_prob = self.cfg.rho\n",
    "            draw = np.random.rand()\n",
    "            return (1 if draw < self.cfg.rho else 0), explore_prob\n",
    "\n",
    "        # At or above the threshold: the expert is always called.\n",
    "        return 1, 1.0\n",
    "\n",
    "    def update(self, uncertainty_score: float, action: int, observed_loss: Optional[float]):\n",
    "        \"\"\"\n",
    "        Update all wealth processes from one round of bandit feedback and\n",
    "        re-select the threshold.\n",
    "\n",
    "        Args:\n",
    "            uncertainty_score: uncertainty U_t of the current input.\n",
    "            action: 1 if the expert was called, 0 otherwise.\n",
    "            observed_loss: loss seen when the expert was called; None is treated\n",
    "                as 0.0 (the term is multiplied by action anyway).\n",
    "        \"\"\"\n",
    "        cfg = self.cfg\n",
    "\n",
    "        # 1. Inputs of the estimator.\n",
    "        loss_t = 0.0 if observed_loss is None else observed_loss\n",
    "        queried = action\n",
    "\n",
    "        # 2. I(U_t < u) for every candidate u, and the propensity implied by the\n",
    "        #    threshold that is current when update() is called.\n",
    "        below_mask = (uncertainty_score < self.threshold_candidates).astype(float)\n",
    "        propensity = cfg.rho if uncertainty_score < self.current_u else 1.0\n",
    "\n",
    "        # 3. Payoff D_t(u) = epsilon - (1 - rho_0) * l_t * xi_t * I(U_t < u) / pi_t\n",
    "        ips_loss = (1-cfg.rho_0)*(loss_t * queried * below_mask) / propensity\n",
    "        payoff = cfg.epsilon - ips_loss\n",
    "\n",
    "        # 4. FTRL betting fraction, computed from the statistics of earlier rounds.\n",
    "        regularized = self.sum_D_sq + cfg.beta\n",
    "        regularized[regularized == 0] = 1e-9  # guard against division by zero\n",
    "        raw_bet = self.sum_D / regularized\n",
    "\n",
    "        payoff_bound = max(cfg.epsilon, ((1.0-cfg.rho_0)/cfg.rho)-cfg.epsilon)\n",
    "        bet = np.clip(raw_bet, 0, cfg.c_clip / payoff_bound)\n",
    "\n",
    "        # 5. Wealth update, then fold this round into the running statistics.\n",
    "        self.wealth = self.wealth * (1.0 + bet * payoff)\n",
    "        self.sum_D += payoff\n",
    "        self.sum_D_sq += (payoff ** 2)\n",
    "\n",
    "        # 6. Threshold selection: the longest prefix of candidates whose wealth\n",
    "        #    has reached 1 / alpha; take the last index of that prefix.\n",
    "        safe_prefix = np.logical_and.accumulate(self.wealth >= (1.0 / cfg.alpha))\n",
    "        safe_idx = np.where(safe_prefix)[0]\n",
    "\n",
    "        if len(safe_idx) == 0:\n",
    "            # Fall back to the safest choice (all expert).\n",
    "            self.current_u_idx = 0\n",
    "            self.current_u = 0.0\n",
    "        else:\n",
    "            self.current_u_idx = safe_idx[-1]\n",
    "            self.current_u = self.threshold_candidates[self.current_u_idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c241747",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_simulation(data_sequence: List[Dict], config: BPACConfig):\n",
    "    \"\"\"\n",
    "    Replay data_sequence through a BPAC model and log cumulative metrics.\n",
    "\n",
    "    Each item is a dict with keys \"uncertainty\", \"instant_correct\",\n",
    "    \"expert_correct\", \"instant_token\" and \"expert_token\".\n",
    "\n",
    "    The exploration probability is config.rho_1 for steps before\n",
    "    config.change_point and config.rho_0 afterwards. The model is updated on\n",
    "    every step, but the first config.warm_up steps are left out of the logs and\n",
    "    of the cumulative metrics.\n",
    "\n",
    "    The model runs on a shallow copy of config, so the rho schedule does not\n",
    "    modify the caller's object.\n",
    "\n",
    "    Returns:\n",
    "        (logs, model): a DataFrame with one row per post-warm-up step and the\n",
    "        BPAC instance. The \"threshold\" and \"wealth\" columns hold the values\n",
    "        after that step's update. An empty data_sequence yields an empty\n",
    "        DataFrame.\n",
    "    \"\"\"\n",
    "    import copy  # local import; only used to protect the caller's config\n",
    "\n",
    "    model = BPAC(copy.copy(config))\n",
    "    logs = []\n",
    "    warm_up = config.warm_up\n",
    "\n",
    "    # Nothing to replay: max() below would raise on an empty sequence.\n",
    "    if len(data_sequence) == 0:\n",
    "        return pd.DataFrame(logs), model\n",
    "\n",
    "    # Largest expert-minus-instant score gap; used to normalize the loss.\n",
    "    max_diff = max(item['expert_correct'] - item['instant_correct'] for item in data_sequence)\n",
    "\n",
    "    # Running totals over the post-warm-up steps.\n",
    "    total_actual_tokens = 0\n",
    "    total_baseline_tokens = 0\n",
    "    cumulative_loss = 0\n",
    "    expert_calls = 0\n",
    "\n",
    "    for t, item in enumerate(data_sequence):\n",
    "        # 1. Extract the features of this step.\n",
    "        u_t = item['uncertainty']\n",
    "        inst_corr = item['instant_correct']\n",
    "        exp_corr = item['expert_correct']\n",
    "        inst_tok = item['instant_token']\n",
    "        exp_tok = item['expert_token']\n",
    "\n",
    "        # 2. Decision. Exploration schedule: rho_1 before the change point,\n",
    "        #    rho_0 from then on.\n",
    "        model.cfg.rho = model.cfg.rho_1 if t < model.cfg.change_point else model.cfg.rho_0\n",
    "        action, _ = model.get_action(u_t)\n",
    "\n",
    "        # 3. Losses.\n",
    "        # (A) True loss: oracle view, used for evaluation and plotting only.\n",
    "        true_loss = compute_step_loss(exp_corr, inst_corr, max_diff)\n",
    "        # (B) Observed loss: bandit feedback, visible only when the expert was\n",
    "        #     called; None otherwise (the model treats it as 0).\n",
    "        observed_loss = true_loss if action == 1 else None\n",
    "\n",
    "        # 4. Model update (also during warm-up).\n",
    "        model.update(u_t, action, observed_loss)\n",
    "        if t < warm_up:\n",
    "            continue\n",
    "\n",
    "        # 5. Token cost. Baseline: expert on every step. Actual: the instant\n",
    "        #    model always runs, and the expert runs on top of it when action == 1.\n",
    "        if action == 1:\n",
    "            step_actual_tokens = inst_tok + exp_tok\n",
    "            expert_calls += 1\n",
    "        else:\n",
    "            step_actual_tokens = inst_tok\n",
    "\n",
    "        total_actual_tokens += step_actual_tokens\n",
    "        total_baseline_tokens += exp_tok\n",
    "\n",
    "        # Cumulative token ratio (guarded against a zero baseline).\n",
    "        current_token_ratio = total_actual_tokens / total_baseline_tokens if total_baseline_tokens > 0 else 1.0\n",
    "        # Cumulative share of steps on which the expert was called.\n",
    "        current_expert_ratio = expert_calls / (t - warm_up + 1)\n",
    "        # Cumulative average risk: a loss is incurred only when the instant\n",
    "        # answer is used; calling the expert contributes zero.\n",
    "        if action == 0:\n",
    "            cumulative_loss += true_loss\n",
    "        current_avg_risk = cumulative_loss / (t + 1 - warm_up)\n",
    "\n",
    "        # Wealth of the currently selected threshold.\n",
    "        wealth = model.wealth[model.current_u_idx]\n",
    "\n",
    "        # 6. Log the step.\n",
    "        logs.append({\n",
    "            \"step\": t,\n",
    "            \"uncertainty\": u_t,\n",
    "            \"threshold\": model.current_u,      # current threshold\n",
    "            \"action\": action,                  # 1=Expert, 0=Instant\n",
    "            \"true_loss\": true_loss,            # true loss (oracle view)\n",
    "            \"observed_loss\": observed_loss if observed_loss is not None else np.nan,\n",
    "            \"avg_risk\": current_avg_risk,      # cumulative average risk\n",
    "            \"token_ratio\": current_token_ratio, # cumulative token ratio\n",
    "            \"expert_call_ratio\": current_expert_ratio, # cumulative expert-call ratio\n",
    "            \"wealth\": wealth                   # current wealth level\n",
    "        })\n",
    "\n",
    "    return pd.DataFrame(logs), model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2bf6776a",
   "metadata": {},
   "source": [
    "## O-Naive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33c78e1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from dataclasses import dataclass\n",
    "from typing import List, Optional, Tuple, Dict\n",
    "\n",
    "@dataclass\n",
    "class BPACConfignaive:\n",
    "    \"\"\"\n",
    "    Hyperparameters for the O-Naive baseline.\n",
    "    \"\"\"\n",
    "    # Failure probability (1 - confidence), e.g. 0.1 means 90% confidence.\n",
    "    # NOTE(review): not referenced by the O-Naive code visible in this cell.\n",
    "    alpha: float = 0.1\n",
    "    # Risk budget (error tolerance), e.g. 0.05 allows a 5% performance loss.\n",
    "    epsilon: float = 0.1\n",
    "    # Minimum exploration probability, between 0 and 1: the chance of calling the\n",
    "    # expert when the uncertainty is below the current threshold.\n",
    "    rho: float = 0.1\n",
    "    # Resolution of the threshold search space: number of candidates evenly\n",
    "    # spaced on [0, 1].\n",
    "    num_thresholds: int = 1001\n",
    "    # Number of initial warm-up steps; run_simulation_onaive still updates the\n",
    "    # model on them but leaves them out of the logged metrics.\n",
    "    warm_up: int = 50\n",
    "\n",
    "def compute_step_loss(y_correct_t: int, y_hat_correct_t: int, max_val: float) -> float:\n",
    "    \"\"\"\n",
    "    单步 Loss 计算（修改版）\n",
    "    \n",
    "    新公式：\n",
    "    loss = (y_correct_t - y_hat_correct_t) / max_val\n",
    "    \n",
    "    参数:\n",
    "        y_correct_t: int, 1-10, 表示专家的分数\n",
    "        y_hat_correct_t: int, 1-10 表示小模型的分数\n",
    "        max_val: float, 用于归一化的分母（通常 > 0），当前最大的可能分数差值。\n",
    "    \n",
    "    返回:\n",
    "        float, 计算得到的单步 loss 值\n",
    "    \"\"\"\n",
    "    if max_val == 0:\n",
    "        raise ValueError(\"max_val cannot be zero to avoid division by zero\")\n",
    "    y_correct_t = np.asanyarray(y_correct_t,dtype=float)\n",
    "    y_hat_correct_t = np.asanyarray(y_hat_correct_t,dtype=float)\n",
    "    loss = np.sqrt((y_correct_t - y_hat_correct_t) / max_val)\n",
    "    # k = 1\n",
    "    # loss = (1-np.exp(-k*(y_correct_t - y_hat_correct_t))) / (1 - np.exp(-k*max_val))\n",
    "\n",
    "    return loss\n",
    "\n",
    "class Onaive:\n",
    "    \"\"\"\n",
    "    O-Naive baseline: selects the largest candidate threshold whose running\n",
    "    average of observed losses is within epsilon. The estimate uses neither\n",
    "    importance weighting nor a confidence bound.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, config: BPACConfignaive):\n",
    "        self.cfg = config\n",
    "        self.threshold_candidates = np.linspace(0, 1, self.cfg.num_thresholds)\n",
    "\n",
    "        # Start at the first candidate (u = 0).\n",
    "        self.current_u_idx = 0\n",
    "        self.current_u = self.threshold_candidates[self.current_u_idx]\n",
    "        # Number of update() calls so far. It starts at 0 because update()\n",
    "        # increments it before dividing; starting at 1 would divide the sum of n\n",
    "        # terms by n + 1 and underestimate the risk.\n",
    "        self.t = 0\n",
    "        # Per-candidate running sum of xi * l * I(U < u).\n",
    "        self.sum_risk_terms = np.zeros(self.cfg.num_thresholds)\n",
    "\n",
    "    def get_action(self, uncertainty_score: float):\n",
    "        \"\"\"\n",
    "        Sample the routing decision for one input.\n",
    "\n",
    "        Policy: pi_t = I(U >= u) + rho * I(U < u)\n",
    "\n",
    "        Returns:\n",
    "            action (int): 1 (Expert), 0 (Instant)\n",
    "            propensity (float): probability of choosing Expert (pi_t)\n",
    "        \"\"\"\n",
    "        if uncertainty_score >= self.current_u:\n",
    "            # At or above the threshold: the expert is always called.\n",
    "            return 1, 1.0\n",
    "\n",
    "        # Below the threshold: use the small model, but explore with prob. rho.\n",
    "        propensity = self.cfg.rho\n",
    "        is_exploring = np.random.rand() < self.cfg.rho\n",
    "        action = 1 if is_exploring else 0\n",
    "        return action, propensity\n",
    "\n",
    "    def update(self, uncertainty_score: float, action: int, observed_loss: Optional[float]):\n",
    "        \"\"\"\n",
    "        Update the risk estimate and re-select the threshold.\n",
    "\n",
    "        R_hat(u) = (1/t) * sum over rounds of xi * l * I(U < u)\n",
    "\n",
    "        Args:\n",
    "            uncertainty_score: uncertainty U_t of the current input.\n",
    "            action: xi, 1 if the expert was called, 0 otherwise.\n",
    "            observed_loss: loss seen when the expert was called; None is treated\n",
    "                as 0.0.\n",
    "        \"\"\"\n",
    "        self.t += 1\n",
    "\n",
    "        # 1. Single-round term xi * l * I(U < u).\n",
    "        xi = action\n",
    "        l_t = observed_loss if observed_loss is not None else 0.0\n",
    "        # Non-zero only when the expert was called and a loss was observed.\n",
    "        scalar_term = xi * l_t\n",
    "\n",
    "        # 0/1 vector of shape (num_thresholds,): only candidates above the\n",
    "        # current uncertainty can accumulate risk in this round.\n",
    "        indicator_less = (uncertainty_score < self.threshold_candidates).astype(float)\n",
    "        self.sum_risk_terms += (scalar_term * indicator_less)\n",
    "\n",
    "        # 2. Average accumulated risk R_hat(u).\n",
    "        estimated_risk = self.sum_risk_terms / self.t\n",
    "\n",
    "        # 3. Threshold selection: max { u : R_hat(u) <= epsilon }.\n",
    "        valid_indices = np.where(estimated_risk <= self.cfg.epsilon)[0]\n",
    "\n",
    "        if len(valid_indices) > 0:\n",
    "            # Greedily take the largest admissible candidate.\n",
    "            self.current_u_idx = valid_indices[-1]\n",
    "            self.current_u = self.threshold_candidates[self.current_u_idx]\n",
    "        else:\n",
    "            # No candidate meets the budget: fall back to the most conservative\n",
    "            # policy (all expert).\n",
    "            self.current_u_idx = 0\n",
    "            self.current_u = 0.0\n",
    "        \n",
    "\n",
    "def run_simulation_onaive(data_sequence: List[Dict], config: BPACConfignaive):\n",
    "    \"\"\"\n",
    "    Replay data_sequence through an Onaive model and log cumulative metrics.\n",
    "\n",
    "    Each item is a dict with keys \"uncertainty\", \"instant_correct\",\n",
    "    \"expert_correct\", \"instant_token\" and \"expert_token\".\n",
    "\n",
    "    The model is updated on every step, but the first config.warm_up steps are\n",
    "    left out of the logs and of the cumulative metrics.\n",
    "\n",
    "    Returns:\n",
    "        (logs, model): a DataFrame with one row per post-warm-up step and the\n",
    "        Onaive instance. The \"threshold\" column holds the value after that\n",
    "        step's update; \"wealth\" is always 0 for this baseline. An empty\n",
    "        data_sequence yields an empty DataFrame.\n",
    "    \"\"\"\n",
    "    model = Onaive(config)\n",
    "    logs = []\n",
    "    warm_up = config.warm_up\n",
    "\n",
    "    # Nothing to replay: max() below would raise on an empty sequence.\n",
    "    if len(data_sequence) == 0:\n",
    "        return pd.DataFrame(logs), model\n",
    "\n",
    "    # Largest expert-minus-instant score gap; used to normalize the loss.\n",
    "    max_diff = max(item['expert_correct'] - item['instant_correct'] for item in data_sequence)\n",
    "\n",
    "    # Running totals over the post-warm-up steps.\n",
    "    total_actual_tokens = 0\n",
    "    total_baseline_tokens = 0\n",
    "    cumulative_loss = 0\n",
    "    expert_calls = 0\n",
    "\n",
    "    for t, item in enumerate(data_sequence):\n",
    "        # 1. Extract the features of this step.\n",
    "        u_t = item['uncertainty']\n",
    "        inst_corr = item['instant_correct']\n",
    "        exp_corr = item['expert_correct']\n",
    "        inst_tok = item['instant_token']\n",
    "        exp_tok = item['expert_token']\n",
    "\n",
    "        # 2. Decision.\n",
    "        action, _ = model.get_action(u_t)\n",
    "\n",
    "        # 3. Losses.\n",
    "        # (A) True loss: oracle view, used for evaluation and plotting only.\n",
    "        true_loss = compute_step_loss(exp_corr, inst_corr, max_diff)\n",
    "        # (B) Observed loss: bandit feedback, visible only when the expert was\n",
    "        #     called; None otherwise (the model treats it as 0).\n",
    "        observed_loss = true_loss if action == 1 else None\n",
    "\n",
    "        # 4. Model update (also during warm-up).\n",
    "        model.update(u_t, action, observed_loss)\n",
    "        if t < warm_up:\n",
    "            continue\n",
    "\n",
    "        # 5. Token cost. Baseline: expert on every step. Actual: the instant\n",
    "        #    model always runs, and the expert runs on top of it when action == 1.\n",
    "        if action == 1:\n",
    "            step_actual_tokens = inst_tok + exp_tok\n",
    "            expert_calls += 1\n",
    "        else:\n",
    "            step_actual_tokens = inst_tok\n",
    "\n",
    "        total_actual_tokens += step_actual_tokens\n",
    "        total_baseline_tokens += exp_tok\n",
    "\n",
    "        # Cumulative token ratio (guarded against a zero baseline).\n",
    "        current_token_ratio = total_actual_tokens / total_baseline_tokens if total_baseline_tokens > 0 else 1.0\n",
    "        # Cumulative share of steps on which the expert was called.\n",
    "        current_expert_ratio = expert_calls / (t - warm_up + 1)\n",
    "        # Cumulative average risk: a loss is incurred only when the instant\n",
    "        # answer is used; calling the expert contributes zero.\n",
    "        if action == 0:\n",
    "            cumulative_loss += true_loss\n",
    "        current_avg_risk = cumulative_loss / (t + 1 - warm_up)\n",
    "\n",
    "        # 6. Log the step.\n",
    "        logs.append({\n",
    "            \"step\": t,\n",
    "            \"uncertainty\": u_t,\n",
    "            \"threshold\": model.current_u,      # current threshold\n",
    "            \"action\": action,                  # 1=Expert, 0=Instant\n",
    "            \"true_loss\": true_loss,            # true loss (oracle view)\n",
    "            \"observed_loss\": observed_loss if observed_loss is not None else np.nan,\n",
    "            \"avg_risk\": current_avg_risk,      # cumulative average risk\n",
    "            \"token_ratio\": current_token_ratio, # cumulative token ratio\n",
    "            \"expert_call_ratio\": current_expert_ratio, # cumulative expert-call ratio\n",
    "            \"wealth\": 0                   # no wealth process in this baseline\n",
    "        })\n",
    "\n",
    "    return pd.DataFrame(logs), model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "81bcc787",
   "metadata": {},
   "source": [
    "## bpac-ips"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e362398",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from typing import List, Dict, Optional\n",
    "\n",
    "@dataclass\n",
    "class BPACConfigIPS:\n",
    "    \"\"\"\n",
    "    B-PAC 算法超参数配置\n",
    "    \"\"\"\n",
    "    alpha: float = 0.1          # 容错概率 (1-Confidence), 例如 0.1 代表 90% 置信度\n",
    "    epsilon: float = 0.1      # 容忍的风险上限 (Error Tolerance), 例如 0.05 代表允许 5% 的性能损失\n",
    "    rho: float = 0.1           # 最小探索概率 (Minimum Exploration Probability),0到1\n",
    "    num_thresholds: int = 1001   # 阈值搜索空间的精细度\n",
    "    warm_up: int = 50          # 初始预热步数\n",
    "\n",
    "class IPSHoeffding:\n",
    "    \"\"\"\n",
    "    Baseline that selects the threshold from an IPS risk estimate plus a\n",
    "    Hoeffding-style confidence width.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, config: BPACConfigIPS):\n",
    "        self.cfg = config\n",
    "        n_candidates = self.cfg.num_thresholds\n",
    "        self.threshold_candidates = np.linspace(0, 1, n_candidates)\n",
    "\n",
    "        # Start at the first candidate (u = 0).\n",
    "        self.current_u_idx = 0\n",
    "        self.current_u = self.threshold_candidates[self.current_u_idx]\n",
    "\n",
    "        # Number of updates seen and per-candidate running sum of IPS terms.\n",
    "        self.time_step = 0\n",
    "        self.sum_Z = np.zeros(n_candidates)\n",
    "\n",
    "        # Range constant M_tilde = (1 - rho) / rho, fixed at construction time.\n",
    "        rho = self.cfg.rho\n",
    "        self.M_tilde = (1.0 - rho) / rho\n",
    "\n",
    "        # Number of candidates N; stored, but update() does not use it.\n",
    "        self.N_thresholds = n_candidates\n",
    "\n",
    "    def get_action(self, uncertainty_score: float):\n",
    "        \"\"\"\n",
    "        Sample the routing decision for one input; same rule as BPAC.\n",
    "\n",
    "        Policy: pi_t = I(U >= u) + rho * I(U < u)\n",
    "\n",
    "        Returns:\n",
    "            action (int): 1 (Expert), 0 (Instant)\n",
    "            propensity (float): probability of choosing Expert (pi_t)\n",
    "        \"\"\"\n",
    "        if not (uncertainty_score >= self.current_u):\n",
    "            # Below the threshold: exploratory expert call with probability rho.\n",
    "            explore_prob = self.cfg.rho\n",
    "            draw = np.random.rand()\n",
    "            return (1 if draw < self.cfg.rho else 0), explore_prob\n",
    "\n",
    "        # At or above the threshold: the expert is always called.\n",
    "        return 1, 1.0\n",
    "\n",
    "    def update(self, uncertainty_score: float, action: int, observed_loss: Optional[float]):\n",
    "        \"\"\"\n",
    "        Update the IPS sums and re-select the threshold from the upper bound.\n",
    "\n",
    "        Args:\n",
    "            uncertainty_score: uncertainty U_t of the current input.\n",
    "            action: 1 if the expert was called, 0 otherwise.\n",
    "            observed_loss: loss seen when the expert was called; None is treated\n",
    "                as 0.0.\n",
    "        \"\"\"\n",
    "        self.time_step += 1\n",
    "        n_seen = self.time_step\n",
    "\n",
    "        # 1. Inputs of the estimator.\n",
    "        loss_t = 0.0 if observed_loss is None else observed_loss\n",
    "        queried = action\n",
    "\n",
    "        # 2. I(U_t < u) for every candidate u, and the propensity implied by the\n",
    "        #    threshold that is current when update() is called.\n",
    "        below_mask = (uncertainty_score < self.threshold_candidates).astype(float)\n",
    "        propensity = self.cfg.rho if uncertainty_score < self.current_u else 1.0\n",
    "\n",
    "        # 3. Scaled IPS term Z_t(u) = (1 - rho) * l_t * xi_t * I(U_t < u) / pi_t\n",
    "        ips_term = (1.0 - self.cfg.rho) * (loss_t * queried * below_mask) / propensity\n",
    "        self.sum_Z += ips_term\n",
    "\n",
    "        # 4. Upper confidence bound = mean of Z + M_tilde * sqrt(log(1/alpha_t) / 2t),\n",
    "        #    with alpha_t = 6 * alpha / (pi^2 * t^2). The log uses 1 / alpha_t\n",
    "        #    (no factor N); 1e-9 is added inside the log as in the original code.\n",
    "        alpha_n = (6 * self.cfg.alpha) / (np.pi**2 * n_seen**2)\n",
    "        width = self.M_tilde * np.sqrt(np.log(1 / alpha_n + 1e-9) / (2 * n_seen))\n",
    "        upper_bound = self.sum_Z / n_seen + width\n",
    "\n",
    "        # 5. Threshold selection: largest u whose bound is within epsilon.\n",
    "        feasible = np.where(upper_bound <= self.cfg.epsilon)[0]\n",
    "\n",
    "        if len(feasible) == 0:\n",
    "            # Fall back to the safest choice (u = 0, all expert).\n",
    "            self.current_u_idx = 0\n",
    "            self.current_u = 0.0\n",
    "        else:\n",
    "            self.current_u_idx = feasible[-1]\n",
    "            self.current_u = self.threshold_candidates[self.current_u_idx]\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from dataclasses import dataclass\n",
    "from typing import List, Optional, Tuple, Dict\n",
    "\n",
    "def compute_step_loss(y_correct_t: int, y_hat_correct_t: int, max_val: float) -> float:\n",
    "    \"\"\"\n",
    "    单步 Loss 计算（修改版）\n",
    "    \n",
    "    新公式：\n",
    "    loss = (y_correct_t - y_hat_correct_t) / max_val\n",
    "    \n",
    "    参数:\n",
    "        y_correct_t: int, 1-10, 表示专家的分数\n",
    "        y_hat_correct_t: int, 1-10 表示小模型的分数\n",
    "        max_val: float, 用于归一化的分母（通常 > 0），当前最大的可能分数差值。\n",
    "    \n",
    "    返回:\n",
    "        float, 计算得到的单步 loss 值\n",
    "    \"\"\"\n",
    "    if max_val == 0:\n",
    "        raise ValueError(\"max_val cannot be zero to avoid division by zero\")\n",
    "    y_correct_t = np.asanyarray(y_correct_t,dtype=float)\n",
    "    y_hat_correct_t = np.asanyarray(y_hat_correct_t,dtype=float)\n",
    "    loss = np.sqrt((y_correct_t - y_hat_correct_t) / max_val)\n",
    "    # k = 1\n",
    "    # loss = (1-np.exp(-k*(y_correct_t - y_hat_correct_t))) / (1 - np.exp(-k*max_val))\n",
    "\n",
    "    return loss\n",
    "\n",
    "def run_simulation_ips(data_sequence: List[Dict], config: BPACConfigIPS):\n",
    "    \"\"\"\n",
    "    Replay data_sequence through an IPSHoeffding model and log cumulative metrics.\n",
    "\n",
    "    Each item is a dict with keys \"uncertainty\", \"instant_correct\",\n",
    "    \"expert_correct\", \"instant_token\" and \"expert_token\".\n",
    "\n",
    "    The model is updated on every step, but the first config.warm_up steps are\n",
    "    left out of the logs and of the cumulative metrics.\n",
    "\n",
    "    Args:\n",
    "        data_sequence: list of item dicts.\n",
    "        config: hyperparameters (a BPACConfigIPS, not a model instance).\n",
    "\n",
    "    Returns:\n",
    "        (logs, model): a DataFrame with one row per post-warm-up step and the\n",
    "        IPSHoeffding instance. The \"threshold\" column holds the value after that\n",
    "        step's update; \"wealth\" is always 0 for this baseline. An empty\n",
    "        data_sequence yields an empty DataFrame.\n",
    "    \"\"\"\n",
    "    model = IPSHoeffding(config)\n",
    "    logs = []\n",
    "    warm_up = config.warm_up\n",
    "\n",
    "    # Nothing to replay: max() below would raise on an empty sequence.\n",
    "    if len(data_sequence) == 0:\n",
    "        return pd.DataFrame(logs), model\n",
    "\n",
    "    # Largest expert-minus-instant score gap; used to normalize the loss.\n",
    "    max_diff = max(item['expert_correct'] - item['instant_correct'] for item in data_sequence)\n",
    "\n",
    "    # Running totals over the post-warm-up steps.\n",
    "    total_actual_tokens = 0\n",
    "    total_baseline_tokens = 0\n",
    "    cumulative_loss = 0\n",
    "    expert_calls = 0\n",
    "\n",
    "    for t, item in enumerate(data_sequence):\n",
    "        # 1. Extract the features of this step.\n",
    "        u_t = item['uncertainty']\n",
    "        inst_corr = item['instant_correct']\n",
    "        exp_corr = item['expert_correct']\n",
    "        inst_tok = item['instant_token']\n",
    "        exp_tok = item['expert_token']\n",
    "\n",
    "        # 2. Decision.\n",
    "        action, _ = model.get_action(u_t)\n",
    "\n",
    "        # 3. Losses.\n",
    "        # (A) True loss: oracle view, used for evaluation and plotting only.\n",
    "        true_loss = compute_step_loss(exp_corr, inst_corr, max_diff)\n",
    "        # (B) Observed loss: bandit feedback, visible only when the expert was\n",
    "        #     called; None otherwise (the model treats it as 0).\n",
    "        observed_loss = true_loss if action == 1 else None\n",
    "\n",
    "        # 4. Model update (also during warm-up).\n",
    "        model.update(u_t, action, observed_loss)\n",
    "        if t < warm_up:\n",
    "            continue\n",
    "\n",
    "        # 5. Token cost. Baseline: expert on every step. Actual: the instant\n",
    "        #    model always runs, and the expert runs on top of it when action == 1.\n",
    "        if action == 1:\n",
    "            step_actual_tokens = inst_tok + exp_tok\n",
    "            expert_calls += 1\n",
    "        else:\n",
    "            step_actual_tokens = inst_tok\n",
    "\n",
    "        total_actual_tokens += step_actual_tokens\n",
    "        total_baseline_tokens += exp_tok\n",
    "\n",
    "        # Cumulative token ratio (guarded against a zero baseline).\n",
    "        current_token_ratio = total_actual_tokens / total_baseline_tokens if total_baseline_tokens > 0 else 1.0\n",
    "        # Cumulative share of steps on which the expert was called.\n",
    "        current_expert_ratio = expert_calls / (t - warm_up + 1)\n",
    "        # Cumulative average risk: a loss is incurred only when the instant\n",
    "        # answer is used; calling the expert contributes zero.\n",
    "        if action == 0:\n",
    "            cumulative_loss += true_loss\n",
    "        current_avg_risk = cumulative_loss / (t + 1 - warm_up)\n",
    "\n",
    "        # 6. Log the step.\n",
    "        logs.append({\n",
    "            \"step\": t,\n",
    "            \"uncertainty\": u_t,\n",
    "            \"threshold\": model.current_u,      # current threshold\n",
    "            \"action\": action,                  # 1=Expert, 0=Instant\n",
    "            \"true_loss\": true_loss,            # true loss (oracle view)\n",
    "            \"observed_loss\": observed_loss if observed_loss is not None else np.nan,\n",
    "            \"avg_risk\": current_avg_risk,      # cumulative average risk\n",
    "            \"token_ratio\": current_token_ratio, # cumulative token ratio\n",
    "            \"expert_call_ratio\": current_expert_ratio, # cumulative expert-call ratio\n",
    "            \"wealth\": 0                   # no wealth process in this baseline\n",
    "        })\n",
    "\n",
    "    return pd.DataFrame(logs), model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "81e0c01a",
   "metadata": {},
   "source": [
    "## Baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f1dcf4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy.stats import norm\n",
    "\n",
    "def compute_pac_loss(y_correct, y_hat_correct, max_value):\n",
    "    if max_value == 0:\n",
    "        raise ValueError(\"max_value cannot be zero\")\n",
    "\n",
    "    y_correct = np.asarray(y_correct, dtype=float)\n",
    "    y_hat_correct = np.asarray(y_hat_correct, dtype=float)\n",
    "\n",
    "    diff = (y_correct - y_hat_correct) / max_value\n",
    "    # k = 1\n",
    "    # loss = (1-np.exp(-k*(y_correct - y_hat_correct))) / (1 - np.exp(-k*max_value))\n",
    "    # 数值稳定 & 理论更合理（防负数）\n",
    "    loss = np.sqrt(diff)\n",
    "\n",
    "    return loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa77ba3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy.stats import norm\n",
    "\n",
    "def split_calib_test(y, calib_ratio=None, calib_num=None, seed=None):\n",
    "    \"\"\"\n",
    "    将数据随机划分为修正集 (calibration set) 和测试集 (test set)\n",
    "\n",
    "    参数:\n",
    "      y           : 数据 (长度 n)\n",
    "      calib_ratio : 修正集占比 (0, 1)\n",
    "      calib_num   : 修正集样本数 (int)\n",
    "      seed        : 随机种子\n",
    "\n",
    "    返回:\n",
    "      calib_idx, test_idx\n",
    "    \"\"\"\n",
    "    # -------- 参数合法性检查 --------\n",
    "    if calib_ratio is None and calib_num is None:\n",
    "        raise ValueError(\"必须指定 calib_ratio 或 calib_num 其中之一\")\n",
    "\n",
    "    if calib_ratio is not None and calib_num is not None:\n",
    "        raise ValueError(\"calib_ratio 和 calib_num 不能同时指定\")\n",
    "\n",
    "    if calib_ratio is not None:\n",
    "        if not (0 < calib_ratio < 1):\n",
    "            raise ValueError(\"calib_ratio 必须在 (0, 1) 之间\")\n",
    "\n",
    "    if calib_num is not None:\n",
    "        if not isinstance(calib_num, int) or calib_num <= 0:\n",
    "            raise ValueError(\"calib_num 必须是正整数\")\n",
    "\n",
    "    # -------- 随机数生成器 --------\n",
    "    rng = np.random.default_rng(seed)\n",
    "\n",
    "    n = len(y)\n",
    "\n",
    "    # -------- 计算 calib_size --------\n",
    "    if calib_ratio is not None:\n",
    "        calib_size = int(n * calib_ratio)\n",
    "    else:\n",
    "        calib_size = calib_num\n",
    "\n",
    "    if calib_size >= n:\n",
    "        raise ValueError(\"calibration set 的大小必须小于数据总量\")\n",
    "\n",
    "    # -------- 打乱并划分 --------\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "\n",
    "    calib_idx = indices[:calib_size]\n",
    "    test_idx = indices[calib_size:]\n",
    "\n",
    "    return calib_idx, test_idx\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy.stats import norm\n",
    "\n",
    "def run_baseline_continuous(\n",
    "    y_solved, y_hat_solved, U, y_token, y_hat_token,\n",
    "    calib_ratio=0.5, epsilon=0.05, alpha=0.05, seed=42,\n",
    "    pi=0.5, m=None # 新增 m 参数，默认为 None (即等于 calib_size)\n",
    "):\n",
    "    \"\"\"\n",
    "    连续模拟 Two-Stage Baseline，其中计算 u_hat 的逻辑严格复刻原始代码。\n",
    "    \"\"\"\n",
    "    # -------------------------------------------------------------------------\n",
    "    # 1. 数据准备与重排 (Calibration -> Test)\n",
    "    # -------------------------------------------------------------------------\n",
    "    diffs = []\n",
    "\n",
    "    for i in range(len(y_solved)):\n",
    "\n",
    "        diff = y_solved[i] - y_hat_solved[i]\n",
    "\n",
    "        diffs.append(diff)\n",
    "\n",
    "    max_diff = max(diffs)\n",
    "\n",
    "    n = len(y_solved)\n",
    "    calib_size = int(n * calib_ratio)\n",
    "    \n",
    "    # 设定随机种子\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # 打乱索引\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # 重排索引：前 calib_size 放前面，剩下的放后面\n",
    "    sorted_indices = np.concatenate([indices[:calib_size], indices[calib_size:]])\n",
    "    \n",
    "    # 辅助重排函数\n",
    "    def reorder(arr):\n",
    "        return [arr[i] for i in sorted_indices]\n",
    "    \n",
    "    # 重排数据\n",
    "    y_s = reorder(y_solved)       # 专家正确性\n",
    "    y_h = reorder(y_hat_solved)   # 小模型正确性\n",
    "    u_s_list = reorder(U)         # 不确定性 (list)\n",
    "    tok_s = reorder(y_token)      # 专家Token\n",
    "    tok_h = reorder(y_hat_token)  # 小模型Token\n",
    "    \n",
    "    # -------------------------------------------------------------------------\n",
    "    # 2. 计算固定阈值 u_hat (完全保留原始逻辑)\n",
    "    # -------------------------------------------------------------------------\n",
    "    \n",
    "    # (A) 提取校准数据 (前 calib_size 个)\n",
    "    y_solved_c = np.array(y_s[:calib_size], dtype=object)\n",
    "    y_hat_solved_c = np.array(y_h[:calib_size], dtype=object)\n",
    "    U_c = np.asarray(u_s_list[:calib_size], dtype=float)\n",
    "    # 这里造一个 dummy 的 gt_anss，因为 compute_pac_loss 需要占位符\n",
    "    gt_anss_c = np.array([None] * calib_size, dtype=object)\n",
    "    \n",
    "    n_c = len(y_solved_c)\n",
    "    \n",
    "    # 如果没指定 m，默认使用校准集大小 (和原始代码中 calib_idx.shape[0] 对应)\n",
    "    if m is None:\n",
    "        m = n_c\n",
    "\n",
    "    # (B) 计算 Full Loss\n",
    "    loss_full_c = compute_pac_loss(y_solved_c, y_hat_solved_c, max_diff)\n",
    "\n",
    "    # (C) 放回抽样 (Bootstrap) --- [原始逻辑]\n",
    "    sample_idx = rng.choice(n_c, size=m, replace=True)\n",
    "    U_s_sampled = U_c[sample_idx] # 注意变量名避免冲突\n",
    "    loss_s_sampled = loss_full_c[sample_idx]\n",
    "\n",
    "    # (D) Bernoulli(pi) 审计 --- [原始逻辑]\n",
    "    phi = (rng.random(m) < pi).astype(float)\n",
    "    weights = phi / pi\n",
    "\n",
    "    # (E) 定义 UCB 函数 --- [原始逻辑]\n",
    "    z = norm.ppf(1 - alpha)\n",
    "    \n",
    "    def upper_conf_bound(u):\n",
    "        mask = (U_s_sampled <= u).astype(float)\n",
    "        X = loss_s_sampled * mask * weights\n",
    "        mean = X.mean()\n",
    "        std = X.std(ddof=1) if X.size > 1 else 0.0\n",
    "        return mean + z * std / np.sqrt(X.size)\n",
    "\n",
    "    # (F) Grid Search 与 u_hat 选择 --- [原始逻辑]\n",
    "    # 原始代码使用 np.sort(U_c) 作为网格\n",
    "    u_grid = np.sort(U_c)\n",
    "    \n",
    "    # 计算所有点的 UCB\n",
    "    ucbs = np.array([upper_conf_bound(u) for u in u_grid])\n",
    "    \n",
    "    # 找到满足条件的最大的 u\n",
    "    ok_idx = np.where(ucbs <= epsilon)[0]\n",
    "    if ok_idx.size > 0:\n",
    "        u_hat = float(u_grid[ok_idx.max()])\n",
    "    else:\n",
    "        # 如果都不满足，取最小的（最保守）\n",
    "        u_hat = float(u_grid[0])\n",
    "        \n",
    "    # print(f\"Run Seed={seed}: u_hat = {u_hat:.4f}\")\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    # 3. 连续流模拟 (生成 Logs)\n",
    "    # -------------------------------------------------------------------------\n",
    "    logs = []\n",
    "    \n",
    "    total_actual_tokens = 0\n",
    "    total_baseline_tokens = 0\n",
    "    cumulative_loss = 0\n",
    "    expert_calls = 0\n",
    "    \n",
    "    for t in range(n):\n",
    "        cur_u = u_s_list[t]\n",
    "        cur_y_exp = y_s[t]\n",
    "        cur_y_inst = y_h[t]\n",
    "        cur_tok_exp = tok_s[t]\n",
    "        cur_tok_inst = tok_h[t]\n",
    "        \n",
    "        # 判断阶段\n",
    "        is_in_calibration = (t < calib_size)\n",
    "        \n",
    "        if is_in_calibration:\n",
    "            # === 校准阶段 ===\n",
    "            action = 1 \n",
    "            threshold = 0.0 # 占位\n",
    "            \n",
    "            # 校准阶段虽然强制调用专家，但为了对齐 Risk 曲线的起点，\n",
    "            # 我们通常认为此时 Loss=0 (因为获得了专家/真实标签)\n",
    "            true_loss = 0.0 \n",
    "            observed_loss = 0.0\n",
    "        else:\n",
    "            # === 测试阶段 ===\n",
    "            threshold = u_hat\n",
    "            # 应用计算出的 u_hat\n",
    "            if cur_u >= u_hat:\n",
    "                action = 1 # Expert\n",
    "            else:\n",
    "                action = 0 # Instant\n",
    "            \n",
    "            # Loss 计算\n",
    "            if action == 1:\n",
    "                true_loss = 0.0\n",
    "                observed_loss = 0.0\n",
    "            else:\n",
    "                # 没调专家：如果专家对(1)且小模型错(0)，则 loss=1\n",
    "                true_loss = compute_step_loss(cur_y_exp, cur_y_inst, max_diff)\n",
    "                observed_loss = None\n",
    "\n",
    "        # 统计\n",
    "        step_baseline = cur_tok_exp\n",
    "        if action == 1:\n",
    "            step_actual = cur_tok_inst + cur_tok_exp\n",
    "            expert_calls += 1\n",
    "        else:\n",
    "            step_actual = cur_tok_inst\n",
    "            \n",
    "        total_actual_tokens += step_actual\n",
    "        total_baseline_tokens += step_baseline\n",
    "        cumulative_loss += true_loss\n",
    "        \n",
    "        # 实时指标\n",
    "        current_token_ratio = total_actual_tokens / total_baseline_tokens if total_baseline_tokens > 0 else 1.0\n",
    "        current_expert_ratio = expert_calls / (t + 1)\n",
    "        current_avg_risk = cumulative_loss / (t + 1)\n",
    "        \n",
    "        logs.append({\n",
    "            \"step\": t,\n",
    "            \"phase\": \"Calibration\" if is_in_calibration else \"Test\",\n",
    "            \"uncertainty\": cur_u,\n",
    "            \"threshold\": threshold,\n",
    "            \"action\": action,\n",
    "            \"true_loss\": true_loss,\n",
    "            \"observed_loss\": observed_loss,\n",
    "            \"avg_risk\": current_avg_risk,\n",
    "            \"token_ratio\": current_token_ratio,\n",
    "            \"expert_call_ratio\": current_expert_ratio,\n",
    "            \"wealth\": 1.0 # 占位，方便画图\n",
    "        })\n",
    "        \n",
    "    return pd.DataFrame(logs), u_hat"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b8dd0032",
   "metadata": {},
   "source": [
    "# logits score"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "69b1b007",
   "metadata": {},
   "source": [
    "## Qwen3-ins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43a91d7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "instant = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs/magpie/qwen3-ins.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0cca01e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "expert  = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs/magpie/qwen3-think.json\")\n",
    "instant = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs/magpie/qwen3-ins.json\")\n",
    "\n",
    "# ======================\n",
    "# 2. 合并（按 index）\n",
    "# ======================\n",
    "df = expert.merge(\n",
    "    instant[[\"gpt4_score\"]],\n",
    "    left_index=True,\n",
    "    right_index=True,\n",
    "    suffixes=(\"_expert\", \"_instant\")\n",
    ")\n",
    "\n",
    "# ======================\n",
    "# 3. 条件 mask\n",
    "# ======================\n",
    "valid_mask = (\n",
    "    df[\"gpt4_score_expert\"].notna() &\n",
    "    df[\"gpt4_score_instant\"].notna()\n",
    ")\n",
    "\n",
    "better_mask = valid_mask & (\n",
    "    df[\"gpt4_score_expert\"] >= df[\"gpt4_score_instant\"]\n",
    ")\n",
    "\n",
    "# ======================\n",
    "# 4. 筛选\n",
    "# ======================\n",
    "df_selected = df.loc[better_mask]\n",
    "\n",
    "# 拆回原 dataframe（保留整行）\n",
    "expert_data  = expert.loc[df_selected.index]\n",
    "instant_data = instant.loc[df_selected.index]\n",
    "\n",
    "# ======================\n",
    "# 5. 统计信息\n",
    "# ======================\n",
    "total_valid = valid_mask.sum()\n",
    "num_better  = better_mask.sum()\n",
    "\n",
    "print(f\"Total valid paired samples (after remove NaN): {total_valid}\")\n",
    "print(f\"Cases where Think/Expert score >= Instant score: {num_better}\")\n",
    "print(f\"Ratio: {num_better / total_valid:.4f}\")\n",
    "print(f\"Percentage: {num_better / total_valid * 100:.1f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b08a4dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "instant_data.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9090a4e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list =[]\n",
    "selcet_ids =[]\n",
    "for i,row in expert_data.iterrows():\n",
    "    session_id = row['session_id']\n",
    "    if session_id not in instant_data['session_id'].values:\n",
    "        print(f\"Instant data missing at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict = {}\n",
    "    instant_row = instant_data[instant_data['session_id'] == session_id].iloc[0]\n",
    "    selcet_ids.append(session_id)\n",
    "    if instant_row[\"question\"] != row[\"question\"]:\n",
    "        print(f\"Question mismatch at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict['uncertainty'] = 1- instant_row['token_probs'][0]\n",
    "    tmp_dict['instant_correct'] = float(instant_row['gpt4_score'])\n",
    "    tmp_dict['expert_correct'] = float(expert_data.loc[i, \"gpt4_score\"])\n",
    "    tmp_dict['instant_token'] = instant_row['gen_token_count']\n",
    "    tmp_dict['expert_token'] = expert_data.loc[i, 'gen_token_count']\n",
    "    data_list.append(tmp_dict)\n",
    "\n",
    "print(f\"Total samples prepared: {len(data_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e9b2644",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "diff = np.array([\n",
    "    d[\"instant_correct\"] - d[\"expert_correct\"]\n",
    "    for d in data_list\n",
    "])\n",
    "\n",
    "q1 = np.percentile(diff, 25)\n",
    "q3 = np.percentile(diff, 75)\n",
    "iqr = q3 - q1\n",
    "\n",
    "lower = q1 - 1.5 * iqr\n",
    "upper = q3 + 1.5 * iqr\n",
    "\n",
    "filtered_data_list = [\n",
    "    d for d in data_list\n",
    "    if lower <= (d[\"instant_correct\"] - d[\"expert_correct\"]) <= upper\n",
    "]\n",
    "\n",
    "print(\"原始样本数:\", len(data_list))\n",
    "print(\"删除后样本数:\", len(filtered_data_list))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2e4dca1",
   "metadata": {},
   "outputs": [],
   "source": [
    "diff = np.array([\n",
    "    d[\"instant_correct\"] - d[\"expert_correct\"]\n",
    "    for d in filtered_data_list\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37305433",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.hist(diff, bins=30, edgecolor='black')\n",
    "plt.title(\"Distribution of Score Differences (Instant - Expert)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d3307821",
   "metadata": {},
   "source": [
    "### compare to pac"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "65d2ef62",
   "metadata": {},
   "source": [
    "#### bpac"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b9d7ffb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm \n",
    "\n",
    "\n",
    "# 假设 data_list 已经准备好\n",
    "y_solved = [item['expert_correct'] for item in data_list]\n",
    "y_hat_solved = [item['instant_correct'] for item in data_list]\n",
    "uncertainty_values = [item['uncertainty'] for item in data_list]\n",
    "y_token_list = [item['expert_token'] for item in data_list]\n",
    "y_hat_token_list = [item['instant_token'] for item in data_list]\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100         # 重复次数\n",
    "TARGET_EPSILON = 0.08   # 目标 Risk\n",
    "TARGET_ALPHA = 0.1     # 置信度\n",
    "CALIB_NUM = 500        # 校准集大小\n",
    "TOTAL_NUM = len(y_solved)\n",
    "calib_ratio = CALIB_NUM / TOTAL_NUM\n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 1. 循环运行 Simulation\n",
    "# -------------------------------------------------------------------------\n",
    "\n",
    "# 用于存储每次实验的完整序列\n",
    "# 维度: (100, N_samples)\n",
    "all_risks = []\n",
    "all_token_ratios = []\n",
    "all_expert_ratios = []\n",
    "all_u_hats = []\n",
    "\n",
    "print(f\"Starting {NUM_RUNS} runs simulation...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # 每次使用不同的随机种子 (seed=0, 1, 2, ..., 99)\n",
    "    # 这样 run_baseline_continuous 内部的 shuffle 结果每次都不一样\n",
    "    df, u_hat = run_baseline_continuous(\n",
    "        y_solved, \n",
    "        y_hat_solved, \n",
    "        uncertainty_values, \n",
    "        y_token_list, \n",
    "        y_hat_token_list,\n",
    "        calib_ratio=calib_ratio, \n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=TARGET_ALPHA,\n",
    "        seed=seed\n",
    "    )\n",
    "    \n",
    "    # 记录整条曲线 (df['avg_risk'] 是一个 Series)\n",
    "    all_risks.append(df['avg_risk'].values)\n",
    "    all_token_ratios.append(df['token_ratio'].values)\n",
    "    all_expert_ratios.append(df['expert_call_ratio'].values)\n",
    "    all_u_hats.append(u_hat)\n",
    "\n",
    "# 转为 Numpy Array 方便计算均值方差\n",
    "# shape = (100, N)\n",
    "arr_risks = np.array(all_risks)\n",
    "arr_token_ratios = np.array(all_token_ratios)\n",
    "arr_expert_ratios = np.array(all_expert_ratios)\n",
    "arr_thresholds = np.array(all_u_hats)\n",
    "\n",
    "print(\"\\nSimulation Finished!\")\n",
    "print(f\"Average u_hat across runs: {np.mean(all_u_hats):.4f} (std: {np.std(all_u_hats):.4f})\")\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# 参数设置\n",
    "NUM_RUNS = 100        # 重复 100 次\n",
    "WARM_UP_STEPS = 0   # BPAC 的 Warm-up 步数 (对应 Baseline 的校准集大小)\n",
    "TARGET_EPSILON = 0.08  # 目标 Risk\n",
    "CFG_ALPHA= 0.1       # BPAC 参数\n",
    "CFG_RHO = 0      # BPAC 参数\n",
    "BPA_beta = 1\n",
    "CFG_RHO_0 = 0.05\n",
    "CFG_RHO_1 = 0.7\n",
    "CFG_CHANGE = 200\n",
    "C_CLIP = 0.9\n",
    "\n",
    "bpac_risks = []\n",
    "bpac_token_ratios = []\n",
    "bpac_expert_ratios = []\n",
    "bpac_wealths = []\n",
    "bpac_thresholds = []\n",
    "\n",
    "print(f\"Starting BPAC Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # -----------------------------------------------------------\n",
    "    # 1. 严格的数据对齐 (Data Alignment)\n",
    "    # -----------------------------------------------------------\n",
    "    # 使用与 Baseline 相同的种子生成随机数生成器\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # 生成打乱的索引\n",
    "    n = len(data_list)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # 关键修正：根据 indices 重排 data_list\n",
    "    # 注意：data_list 是 list of dicts，不能直接用 indices 索引，除非转 numpy\n",
    "    # 这里用列表推导式最稳妥\n",
    "    shuffled_data = [data_list[i] for i in indices]\n",
    "    \n",
    "    # -----------------------------------------------------------\n",
    "    # 2. 配置并运行 BPAC\n",
    "    # -----------------------------------------------------------\n",
    "    # 你的 BPACConfig\n",
    "    cfg = BPACConfig(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "        beta=BPA_beta, \n",
    "        c_clip=C_CLIP,\n",
    "        rho_0=CFG_RHO_0,\n",
    "        rho_1=CFG_RHO_1,\n",
    "        change_point=CFG_CHANGE\n",
    "    )\n",
    "    \n",
    "    # 运行模拟 (传入打乱后的数据)\n",
    "    df_result, _ = run_simulation(shuffled_data, cfg)\n",
    "    \n",
    "    bpac_risks.append(df_result['avg_risk'].values)\n",
    "    bpac_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpac_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpac_wealths.append(df_result['wealth'].values)\n",
    "    bpac_thresholds.append(df_result['threshold'].values)\n",
    "\n",
    "\n",
    "# 转换为 Numpy 数组方便计算均值\n",
    "bpac_risks_arr = np.array(bpac_risks)       # Shape: (100, N)\n",
    "bpac_token_ratios_arr = np.array(bpac_token_ratios) # Shape: (100, N)\n",
    "bpac_expert_ratios_arr = np.array(bpac_expert_ratios) # Shape: (100, N)\n",
    "bpac_wealths_arr = np.array(bpac_wealths)   # Shape: (100, N)\n",
    "bpac_thresholds_arr = np.array(bpac_thresholds) # Shape: (100, N)\n",
    "print(\"BPAC Simulation Finished!\")\n",
    "bpac_risks_arr.shape, bpac_token_ratios_arr.shape, bpac_expert_ratios_arr.shape\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eb01cf8d",
   "metadata": {},
   "source": [
    "##### 图1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5da0b717",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Baseline 数据\n",
    "base_mean_risk = np.mean(arr_risks, axis=0)\n",
    "base_std_risk = np.std(arr_risks, axis=0)\n",
    "\n",
    "base_mean_token = np.mean(arr_token_ratios, axis=0)\n",
    "base_std_token = np.std(arr_token_ratios, axis=0)\n",
    "\n",
    "base_mean_expert = np.mean(arr_expert_ratios, axis=0)\n",
    "base_std_expert = np.std(arr_expert_ratios, axis=0)\n",
    "\n",
    "# 获取总时间步数\n",
    "N_STEPS = bpac_risks_arr.shape[1] \n",
    "\n",
    "# A. 重构 Baseline Threshold 序列 (因为你只存了最终的一个 u_hat)\n",
    "# 逻辑：在校准期 (前 CALIB_NUM 步) 阈值设为 0，之后设为 u_hat\n",
    "# arr_thresholds 是 (100,) 的标量数组\n",
    "base_threshold_seqs = np.zeros((NUM_RUNS, N_STEPS))\n",
    "for i in range(NUM_RUNS):\n",
    "    base_threshold_seqs[i, :CALIB_NUM] = 0.0  # 校准期阈值通常视为0 (强制Action=1)\n",
    "    base_threshold_seqs[i, CALIB_NUM:] = arr_thresholds[i] # 测试期应用 u_hat\n",
    "\n",
    "base_mean_threshold_seq = np.mean(base_threshold_seqs, axis=0)\n",
    "base_std_threshold_seq = np.std(base_threshold_seqs, axis=0)\n",
    "\n",
    "\n",
    "# BPAC 数据\n",
    "bpac_mean_risk = np.mean(bpac_risks_arr, axis=0)\n",
    "bpac_std_risk = np.std(bpac_risks_arr, axis=0)\n",
    "\n",
    "bpac_mean_token = np.mean(bpac_token_ratios_arr, axis=0)\n",
    "bpac_std_token = np.std(bpac_token_ratios_arr, axis=0)\n",
    "\n",
    "bpac_mean_expert = np.mean(bpac_expert_ratios_arr, axis=0)\n",
    "bpac_std_expert = np.std(bpac_expert_ratios_arr, axis=0)\n",
    "\n",
    "# C. 确保其他 BPAC 统计量已准备好 (以防万一)\n",
    "bpac_mean_threshold = np.mean(bpac_thresholds_arr, axis=0)\n",
    "bpac_std_threshold = np.std(bpac_thresholds_arr, axis=0)\n",
    "\n",
    "# B. 计算 BPAC Wealth 统计 (你存了 bpac_wealths_arr 但还没算均值)\n",
    "bpac_mean_wealth = np.mean(bpac_wealths_arr, axis=0)\n",
    "bpac_std_wealth = np.std(bpac_wealths_arr, axis=0)\n",
    "\n",
    "\n",
    "fig, axes = plt.subplots(5, 1, figsize=(12, 24), sharex=True)\n",
    "(ax1, ax2, ax3, ax4, ax5) = axes\n",
    "\n",
    "# X轴\n",
    "steps = np.arange(N_STEPS)\n",
    "\n",
    "# === 子图 1: Average Risk (风险控制) ===\n",
    "ax1.plot(steps, base_mean_risk, label='Baseline', color='blue', linestyle='--')\n",
    "ax1.fill_between(steps, base_mean_risk - base_std_risk, base_mean_risk + base_std_risk, color='blue', alpha=0.1)\n",
    "\n",
    "ax1.plot(steps, bpac_mean_risk, label='BPAC (Ours)', color='red', linewidth=2)\n",
    "ax1.fill_between(steps, bpac_mean_risk - bpac_std_risk, bpac_mean_risk + bpac_std_risk, color='red', alpha=0.1)\n",
    "\n",
    "ax1.axhline(y=TARGET_EPSILON, color='green', linestyle='-', linewidth=2, label=f'Target $\\epsilon={TARGET_EPSILON}$')\n",
    "ax1.axvline(x=CALIB_NUM, color='gray', linestyle=':', label='Calibration End')\n",
    "\n",
    "ax1.set_ylabel('Avg Risk')\n",
    "ax1.set_title(f'1. Risk Control (Target $\\epsilon < {TARGET_EPSILON}$)')\n",
    "ax1.legend(loc='upper right')\n",
    "ax1.grid(True, alpha=0.3)\n",
    "ax1.set_ylim(0, TARGET_EPSILON * 3.0) # 聚焦在 Epsilon 附近\n",
    "\n",
    "# === 子图 2: Token Ratio (成本节省) ===\n",
    "ax2.plot(steps, base_mean_token, label='Baseline', color='blue', linestyle='--')\n",
    "ax2.fill_between(steps, base_mean_token - base_std_token, base_mean_token + base_std_token, color='blue', alpha=0.1)\n",
    "\n",
    "ax2.plot(steps, bpac_mean_token, label='BPAC', color='red', linewidth=2)\n",
    "ax2.fill_between(steps, bpac_mean_token - bpac_std_token, bpac_mean_token + bpac_std_token, color='red', alpha=0.1)\n",
    "\n",
    "ax2.axhline(y=1.0, color='black', linestyle=':')\n",
    "ax2.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax2.set_ylabel('Token Ratio')\n",
    "ax2.set_title('2. Cost Efficiency (Lower is Better)')\n",
    "ax2.grid(True, alpha=0.3)\n",
    "\n",
    "# === 子图 3: Expert Call Ratio (专家调用率) ===\n",
    "ax3.plot(steps, base_mean_expert, label='Baseline', color='blue', linestyle='--')\n",
    "ax3.fill_between(steps, base_mean_expert - base_std_expert, base_mean_expert + base_std_expert, color='blue', alpha=0.1)\n",
    "\n",
    "ax3.plot(steps, bpac_mean_expert, label='BPAC', color='red', linewidth=2)\n",
    "ax3.fill_between(steps, bpac_mean_expert - bpac_std_expert, bpac_mean_expert + bpac_std_expert, color='red', alpha=0.1)\n",
    "\n",
    "ax3.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax3.set_ylabel('Expert Call Ratio')\n",
    "ax3.set_title('3. Expert Usage Behavior')\n",
    "ax3.grid(True, alpha=0.3)\n",
    "ax3.set_ylim(0, 1.1)\n",
    "\n",
    "# === 子图 4: Threshold Evolution (阈值变化) ===\n",
    "# Baseline: 前期是0，后期是固定值\n",
    "ax4.plot(steps, base_mean_threshold_seq, label='Baseline $\\hat{u}$', color='blue', linestyle='--')\n",
    "ax4.fill_between(steps, \n",
    "                 base_mean_threshold_seq - base_std_threshold_seq, \n",
    "                 base_mean_threshold_seq + base_std_threshold_seq, \n",
    "                 color='blue', alpha=0.1)\n",
    "\n",
    "# BPAC: 动态变化\n",
    "ax4.plot(steps, bpac_mean_threshold, label='BPAC $u_t$', color='red', linewidth=1.5)\n",
    "ax4.fill_between(steps, \n",
    "                 bpac_mean_threshold - bpac_std_threshold, \n",
    "                 bpac_mean_threshold + bpac_std_threshold, \n",
    "                 color='red', alpha=0.1)\n",
    "\n",
    "ax4.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax4.set_ylabel('Uncertainty Threshold')\n",
    "ax4.set_title('4. Threshold Adaptation (Dynamic vs Fixed)')\n",
    "ax4.legend(loc='upper right')\n",
    "ax4.grid(True, alpha=0.3)\n",
    "ax4.set_ylim(0, 1.0) \n",
    "\n",
    "# === 子图 5: Wealth Evolution (财富积累) ===\n",
    "# 只有 BPAC 有 Wealth\n",
    "ax5.plot(steps, bpac_mean_wealth, label='BPAC Wealth', color='purple', linewidth=1.5)\n",
    "ax5.fill_between(steps, \n",
    "                 bpac_mean_wealth - bpac_std_wealth, \n",
    "                 bpac_mean_wealth + bpac_std_wealth, \n",
    "                 color='purple', alpha=0.1)\n",
    "\n",
    "ax5.axhline(y=1.0, color='black', linestyle='--', label='Initial Wealth (1.0)')\n",
    "ax5.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "# 如果财富增长非常快，建议开启对数坐标\n",
    "# ax5.set_yscale('log') \n",
    "\n",
    "ax5.set_ylabel('Wealth')\n",
    "ax5.set_xlabel('Time Step')\n",
    "ax5.set_title('5. BPAC Wealth Accumulation')\n",
    "ax5.legend(loc='upper left')\n",
    "ax5.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0d4fccbc",
   "metadata": {},
   "source": [
    "##### 图2（4）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f6a8b9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "\n",
    "# ==========================================\n",
    "# 1. 统计与辅助函数\n",
    "# ==========================================\n",
    "SHADOW_TYPE = 'std'  \n",
    "CALIB_NUM = 500\n",
    "\n",
    "def get_error_bounds(data_array, type='std', scale=1):\n",
    "    \"\"\"计算均值和阴影上下界\"\"\"\n",
    "    if data_array.ndim == 1 or data_array.shape[0] <= 1:\n",
    "        mean_val = data_array.flatten() * scale\n",
    "        return mean_val, mean_val, mean_val\n",
    "\n",
    "    mean_val = np.mean(data_array, axis=0) * scale\n",
    "    \n",
    "    if type == 'std':\n",
    "        dev_val = np.std(data_array, axis=0) * scale\n",
    "    elif type == 'sem':\n",
    "        dev_val = stats.sem(data_array, axis=0) * scale\n",
    "    else:\n",
    "        dev_val = np.std(data_array, axis=0) * scale\n",
    "        \n",
    "    lower = mean_val - dev_val\n",
    "    upper = mean_val + dev_val\n",
    "        \n",
    "    return mean_val, lower, upper\n",
    "\n",
    "# ==========================================\n",
    "# 2. 数据预处理\n",
    "# ==========================================\n",
    "# 获取维度\n",
    "NUM_RUNS, N_STEPS = bpac_risks_arr.shape\n",
    "steps = np.arange(N_STEPS)\n",
    "\n",
    "# --- 2.1 构建 Baseline 阈值序列 ---\n",
    "# 逻辑：前 CALIB_NUM 步是 0，之后是 arr_thresholds[i]\n",
    "base_threshold_seqs = np.zeros((NUM_RUNS, N_STEPS))\n",
    "for i in range(NUM_RUNS):\n",
    "    base_threshold_seqs[i, :CALIB_NUM] = 0.0\n",
    "    # arr_thresholds 是标量数组 (100,)\n",
    "    base_threshold_seqs[i, CALIB_NUM:] = arr_thresholds[i]\n",
    "\n",
    "# --- 2.2 计算统计量 ---\n",
    "\n",
    "# Risk (ER) - 保持 0-0.2\n",
    "base_risk_m, base_risk_l, base_risk_h = get_error_bounds(arr_risks, SHADOW_TYPE)\n",
    "bpac_risk_m, bpac_risk_l, bpac_risk_h = get_error_bounds(bpac_risks_arr, SHADOW_TYPE)\n",
    "\n",
    "# ECP (Expert Call %) - 放大 100 倍\n",
    "base_ecp_m, base_ecp_l, base_ecp_h = get_error_bounds(arr_expert_ratios, SHADOW_TYPE, scale=100)\n",
    "bpac_ecp_m, bpac_ecp_l, bpac_ecp_h = get_error_bounds(bpac_expert_ratios_arr, SHADOW_TYPE, scale=100)\n",
    "\n",
    "# TCP (Token Cost %) - 放大 100 倍\n",
    "base_tcp_m, base_tcp_l, base_tcp_h = get_error_bounds(arr_token_ratios, SHADOW_TYPE, scale=100)\n",
    "bpac_tcp_m, bpac_tcp_l, bpac_tcp_h = get_error_bounds(bpac_token_ratios_arr, SHADOW_TYPE, scale=100)\n",
    "\n",
    "# Threshold ($u_t$) - 保持 0-1\n",
    "base_th_m, base_th_l, base_th_h = get_error_bounds(base_threshold_seqs, SHADOW_TYPE)\n",
    "bpac_th_m, bpac_th_l, bpac_th_h = get_error_bounds(bpac_thresholds_arr, SHADOW_TYPE)\n",
    "\n",
    "\n",
    "# ==========================================\n",
    "# 3. 绘图 (ICML 风格 - 4 Subplots)\n",
    "# ==========================================\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,\n",
    "    'axes.labelsize': 20,\n",
    "    'xtick.labelsize': 16,\n",
    "    'ytick.labelsize': 16,\n",
    "    'legend.fontsize': 16,\n",
    "    'lines.linewidth': 3.0,\n",
    "    'axes.grid': True, \n",
    "    'grid.alpha': 0.3, \n",
    "    'grid.linestyle': '--',\n",
    "    'figure.autolayout': True,\n",
    "    'savefig.dpi': 300\n",
    "})\n",
    "\n",
    "# 1行4列，宽一点\n",
    "fig, axes = plt.subplots(1, 4, figsize=(26, 5), sharex=True)\n",
    "\n",
    "# 颜色定义\n",
    "C_BASE = '#1f77b4'  # Blue\n",
    "C_OURS = '#d62728'  # Red\n",
    "C_TGT = '#2ca02c'   # Green (或者黑色 'black')\n",
    "C_WARM = 'gray'\n",
    "\n",
    "# --- (a) Risk (ER) ---\n",
    "ax = axes[0]\n",
    "# Baseline\n",
    "ax.plot(steps, base_risk_m, color=C_BASE, linestyle='--', label='PAC (Baseline)')\n",
    "ax.fill_between(steps, base_risk_l, base_risk_h, color=C_BASE, alpha=0.25)\n",
    "# BPAC\n",
    "ax.plot(steps, bpac_risk_m, color=C_OURS, linestyle='-', label='B-PAC (Ours)')\n",
    "ax.fill_between(steps, bpac_risk_l, bpac_risk_h, color=C_OURS, alpha=0.25)\n",
    "\n",
    "# Target Line\n",
    "ax.axhline(y=TARGET_EPSILON, color=C_TGT, linestyle='-', linewidth=2, label='Tolerance')\n",
    "if CALIB_NUM > 0:\n",
    "    ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel('ER', fontsize=22)\n",
    "ax.set_xlabel('Time Step')\n",
    "# 自动调整 ylim\n",
    "ax.set_ylim(0, max(0.15, TARGET_EPSILON * 2.0))\n",
    "ax.yaxis.set_major_locator(MultipleLocator(0.04))\n",
    "\n",
    "# 图例放在第一个图\n",
    "ax.legend(loc='upper right', frameon=True, framealpha=0.95, fontsize=14)\n",
    "\n",
    "\n",
    "# --- (b) ECP (%) ---\n",
    "ax = axes[1]\n",
    "# Baseline\n",
    "ax.plot(steps, base_ecp_m, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_ecp_l, base_ecp_h, color=C_BASE, alpha=0.25)\n",
    "# BPAC\n",
    "ax.plot(steps, bpac_ecp_m, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_ecp_l, bpac_ecp_h, color=C_OURS, alpha=0.25)\n",
    "\n",
    "if CALIB_NUM > 0:\n",
    "    ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel('ECP (%)', fontsize=20)\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 105)\n",
    "\n",
    "\n",
    "# --- (c) TCP (%) ---\n",
    "ax = axes[2]\n",
    "# Baseline\n",
    "ax.plot(steps, base_tcp_m, color=C_BASE, linestyle='--')\n",
    "ax.fill_between(steps, base_tcp_l, base_tcp_h, color=C_BASE, alpha=0.25)\n",
    "# BPAC\n",
    "ax.plot(steps, bpac_tcp_m, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_tcp_l, bpac_tcp_h, color=C_OURS, alpha=0.25)\n",
    "\n",
    "# Full Expert Reference\n",
    "ax.axhline(y=100, color='black', linestyle='-.', linewidth=1.5, label='Full Expert')\n",
    "if CALIB_NUM > 0:\n",
    "    ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel('TP (%)', fontsize=20) # 或者 'TP(%)'\n",
    "ax.set_xlabel('Time Step')\n",
    "\n",
    "\n",
    "# --- (d) Threshold (u_t) ---\n",
    "ax = axes[3]\n",
    "# Baseline (hat{u})\n",
    "ax.plot(steps, base_th_m, color=C_BASE, linestyle='--', label=r'Baseline $\\hat{u}$')\n",
    "ax.fill_between(steps, base_th_l, base_th_h, color=C_BASE, alpha=0.25)\n",
    "\n",
    "# BPAC (u_t)\n",
    "ax.plot(steps, bpac_th_m, color=C_OURS, linestyle='-', label=r'BPAC $u_t$')\n",
    "ax.fill_between(steps, bpac_th_l, bpac_th_h, color=C_OURS, alpha=0.25)\n",
    "\n",
    "if CALIB_NUM > 0:\n",
    "    ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel(r'Threshold ($\\hat{u}_t$)', fontsize=20)\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(-0.05, 1.05)\n",
    "# ax.legend(loc='lower right', frameon=True, framealpha=0.95, fontsize=14)\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('threshold_magpie.pdf', bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b1e2d6f",
   "metadata": {},
   "source": [
    "#### onaive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8bda2039",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Experiment parameters\n",
    "NUM_RUNS = 100        # number of repeated runs (one shuffle seed per run)\n",
    "WARM_UP_STEPS = 0   # warm-up steps passed to the config (original note: corresponds to the baseline's calibration-set size)\n",
    "TARGET_EPSILON = 0.08  # target risk, passed to the config as epsilon\n",
    "CFG_ALPHA= 0.1      # passed to the config as alpha\n",
    "CFG_RHO = 0.05        # passed to the config as rho\n",
    "\n",
    "\n",
    "# One entry is appended to each of these lists per run.\n",
    "bpaconaive_risks = []\n",
    "bpaconaive_token_ratios = []\n",
    "bpaconaive_expert_ratios = []\n",
    "bpaconaive_wealths = []\n",
    "bpaconaive_thresholds = []\n",
    "print(f\"Starting BPAC-Naive Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # -----------------------------------------------------------\n",
    "    # 1. Data alignment\n",
    "    # -----------------------------------------------------------\n",
    "    # Seed the generator with the run index. The other simulation cells in this\n",
    "    # notebook build their shuffles the same way, so run k uses the same ordering there.\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # Build a shuffled index permutation\n",
    "    n = len(data_list)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # Reorder data_list according to the permutation. A list comprehension is used\n",
    "    # because a plain Python list cannot be indexed with a NumPy index array\n",
    "    # (original note: data_list is a list of dicts).\n",
    "    shuffled_data = [data_list[i] for i in indices]\n",
    "    \n",
    "    # -----------------------------------------------------------\n",
    "    # 2. Configure and run the naive variant\n",
    "    # -----------------------------------------------------------\n",
    "    # Build the config for this run from the constants above\n",
    "    cfg = BPACConfignaive(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "    )\n",
    "    \n",
    "    # Run the simulation on the shuffled data; the second return value is not used here\n",
    "    df_result, _ = run_simulation_onaive(shuffled_data, cfg)\n",
    "    \n",
    "    bpaconaive_risks.append(df_result['avg_risk'].values)\n",
    "    bpaconaive_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpaconaive_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpaconaive_wealths.append(df_result['wealth'].values)\n",
    "    bpaconaive_thresholds.append(df_result['threshold'].values) \n",
    "\n",
    "# Stack the per-run trajectories into arrays so statistics can be taken across runs.\n",
    "# Expected shape is (NUM_RUNS, N) -- assumes every run returns the same number of rows.\n",
    "bpaconaive_risks_arr = np.array(bpaconaive_risks)       # shape: (NUM_RUNS, N)\n",
    "bpaconaive_token_ratios_arr = np.array(bpaconaive_token_ratios) # shape: (NUM_RUNS, N)\n",
    "bpaconaive_expert_ratios_arr = np.array(bpaconaive_expert_ratios) # shape: (NUM_RUNS, N)\n",
    "bpaconaive_wealths_arr = np.array(bpaconaive_wealths)   # shape: (NUM_RUNS, N)\n",
    "bpaconaive_thresholds_arr = np.array(bpaconaive_thresholds) # shape: (NUM_RUNS, N)\n",
    "print(\"BPAC-Naive Simulation Finished!\")\n",
    "bpaconaive_risks_arr.shape, bpaconaive_token_ratios_arr.shape, bpaconaive_expert_ratios_arr.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cb0f3a3b",
   "metadata": {},
   "source": [
    "#### BPAC-IPS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2dfb3dd4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Experiment parameters\n",
    "NUM_RUNS = 100        # number of repeated runs (one shuffle seed per run)\n",
    "WARM_UP_STEPS = 0   # warm-up steps passed to the config (original note: corresponds to the baseline's calibration-set size)\n",
    "TARGET_EPSILON = 0.08  # target risk, passed to the config as epsilon\n",
    "CFG_ALPHA= 0.1      # passed to the config as alpha\n",
    "CFG_RHO = 0.05        # passed to the config as rho\n",
    "\n",
    "\n",
    "# One entry is appended to each of these lists per run.\n",
    "bpacips_risks = []\n",
    "bpacips_token_ratios = []\n",
    "bpacips_expert_ratios = []\n",
    "bpacips_wealths = []\n",
    "bpacips_thresholds = []\n",
    "print(f\"Starting BPAC-IPS Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # -----------------------------------------------------------\n",
    "    # 1. Data alignment\n",
    "    # -----------------------------------------------------------\n",
    "    # Seed the generator with the run index. The other simulation cells in this\n",
    "    # notebook build their shuffles the same way, so run k uses the same ordering there.\n",
    "    rng = np.random.default_rng(seed)\n",
    "    \n",
    "    # Build a shuffled index permutation\n",
    "    n = len(data_list)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # Reorder data_list according to the permutation. A list comprehension is used\n",
    "    # because a plain Python list cannot be indexed with a NumPy index array\n",
    "    # (original note: data_list is a list of dicts).\n",
    "    shuffled_data = [data_list[i] for i in indices]\n",
    "    \n",
    "    # -----------------------------------------------------------\n",
    "    # 2. Configure and run the IPS variant\n",
    "    # -----------------------------------------------------------\n",
    "    # Build the config for this run from the constants above\n",
    "    cfg = BPACConfigIPS(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "    )\n",
    "    \n",
    "    # Run the simulation on the shuffled data; the second return value is not used here\n",
    "    df_result, _ = run_simulation_ips(shuffled_data, cfg)\n",
    "    \n",
    "    bpacips_risks.append(df_result['avg_risk'].values)\n",
    "    bpacips_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpacips_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpacips_wealths.append(df_result['wealth'].values)\n",
    "    bpacips_thresholds.append(df_result['threshold'].values) \n",
    "\n",
    "# Stack the per-run trajectories into arrays so statistics can be taken across runs.\n",
    "# Expected shape is (NUM_RUNS, N) -- assumes every run returns the same number of rows.\n",
    "bpacips_risks_arr = np.array(bpacips_risks)       # shape: (NUM_RUNS, N)\n",
    "bpacips_token_ratios_arr = np.array(bpacips_token_ratios) # shape: (NUM_RUNS, N)\n",
    "bpacips_expert_ratios_arr = np.array(bpacips_expert_ratios) # shape: (NUM_RUNS, N)\n",
    "bpacips_wealths_arr = np.array(bpacips_wealths)   # shape: (NUM_RUNS, N)\n",
    "bpacips_thresholds_arr = np.array(bpacips_thresholds) # shape: (NUM_RUNS, N)\n",
    "print(\"BPAC-IPS Simulation Finished!\")\n",
    "bpacips_risks_arr.shape, bpacips_token_ratios_arr.shape, bpacips_expert_ratios_arr.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8c0bffa2",
   "metadata": {},
   "source": [
    "#### 画图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b204a94",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "def calc_stats(arr):\n",
    "    \"\"\"Return the mean and standard deviation of ``arr`` along axis 0.\n",
    "\n",
    "    ``np.std`` runs with its default ``ddof=0``.\n",
    "    \"\"\"\n",
    "    return np.mean(arr, axis=0), np.std(arr, axis=0)\n",
    "\n",
    "# --- BPAC (ours) statistics ---\n",
    "bpac_mean_risk, bpac_std_risk = calc_stats(bpac_risks_arr)\n",
    "bpac_mean_token, bpac_std_token = calc_stats(bpac_token_ratios_arr)\n",
    "bpac_mean_expert, bpac_std_expert = calc_stats(bpac_expert_ratios_arr)\n",
    "bpac_mean_threshold, bpac_std_threshold = calc_stats(bpac_thresholds_arr)\n",
    "bpac_mean_wealth, bpac_std_wealth = calc_stats(bpac_wealths_arr)\n",
    "\n",
    "# --- BPAC-Naive statistics ---\n",
    "naive_mean_risk, naive_std_risk = calc_stats(bpaconaive_risks_arr)\n",
    "naive_mean_token, naive_std_token = calc_stats(bpaconaive_token_ratios_arr)\n",
    "naive_mean_expert, naive_std_expert = calc_stats(bpaconaive_expert_ratios_arr)\n",
    "naive_mean_threshold, naive_std_threshold = calc_stats(bpaconaive_thresholds_arr)\n",
    "# Wealth is deliberately not summarised for the naive variant\n",
    "\n",
    "# --- BPAC-IPS statistics ---\n",
    "ips_mean_risk, ips_std_risk = calc_stats(bpacips_risks_arr)\n",
    "ips_mean_token, ips_std_token = calc_stats(bpacips_token_ratios_arr)\n",
    "ips_mean_expert, ips_std_expert = calc_stats(bpacips_expert_ratios_arr)\n",
    "ips_mean_threshold, ips_std_threshold = calc_stats(bpacips_thresholds_arr)\n",
    "ips_mean_wealth, ips_std_wealth = calc_stats(bpacips_wealths_arr)\n",
    "\n",
    "# ==========================================\n",
    "# 2. Plotting\n",
    "# ==========================================\n",
    "# Time axis: one point per column of the stacked BPAC array\n",
    "N_STEPS = bpac_risks_arr.shape[1]\n",
    "steps = np.arange(N_STEPS)\n",
    "\n",
    "# Five stacked subplots sharing the x axis\n",
    "fig, axes = plt.subplots(5, 1, figsize=(12, 22), sharex=True)\n",
    "(ax1, ax2, ax3, ax4, ax5) = axes\n",
    "\n",
    "def plot_with_std(ax, x, mean, std, label, color, linestyle='-'):\n",
    "    \"\"\"Plot ``mean`` against ``x`` on ``ax`` with a translucent ``mean +/- std`` band.\"\"\"\n",
    "    ax.plot(x, mean, label=label, color=color, linestyle=linestyle, linewidth=2)\n",
    "    # Low alpha keeps overlapping bands from hiding each other\n",
    "    ax.fill_between(x, mean - std, mean + std, color=color, alpha=0.1)\n",
    "\n",
    "# === Subplot 1: average risk (risk control) ===\n",
    "plot_with_std(ax1, steps, naive_mean_risk, naive_std_risk, 'BPAC-Naive', 'blue', '--')\n",
    "plot_with_std(ax1, steps, ips_mean_risk, ips_std_risk, 'BPAC-IPS', 'green', '-.')\n",
    "plot_with_std(ax1, steps, bpac_mean_risk, bpac_std_risk, 'BPAC (Ours)', 'red')\n",
    "\n",
    "# Raw f-strings keep the mathtext backslash literal (no invalid '\\e' escape sequence)\n",
    "ax1.axhline(y=TARGET_EPSILON, color='black', linestyle='-', linewidth=1.5, label=rf'Target $\\epsilon={TARGET_EPSILON}$')\n",
    "if WARM_UP_STEPS > 0:\n",
    "    ax1.axvline(x=WARM_UP_STEPS, color='gray', linestyle=':', label='Warm-up End')\n",
    "\n",
    "ax1.set_ylabel('Avg Risk')\n",
    "ax1.set_title(rf'1. Risk Control (Target $\\epsilon < {TARGET_EPSILON}$)')\n",
    "ax1.legend(loc='upper right')\n",
    "ax1.grid(True, alpha=0.3)\n",
    "# Size the y range from the last 100 steps of every plotted method so early\n",
    "# fluctuations do not dominate and no curve (including the naive one) is cut off.\n",
    "max_risk_show = max(\n",
    "    TARGET_EPSILON * 3.0,\n",
    "    np.max(naive_mean_risk[-100:]),\n",
    "    np.max(ips_mean_risk[-100:]),\n",
    "    np.max(bpac_mean_risk[-100:]),\n",
    ")\n",
    "ax1.set_ylim(0, max_risk_show)\n",
    "\n",
    "# === Subplot 2: token ratio (cost saving) ===\n",
    "plot_with_std(ax2, steps, naive_mean_token, naive_std_token, 'BPAC-Naive', 'blue', '--')\n",
    "plot_with_std(ax2, steps, ips_mean_token, ips_std_token, 'BPAC-IPS', 'green', '-.')\n",
    "plot_with_std(ax2, steps, bpac_mean_token, bpac_std_token, 'BPAC (Ours)', 'red')\n",
    "\n",
    "ax2.axhline(y=1.0, color='black', linestyle=':')\n",
    "if WARM_UP_STEPS > 0:\n",
    "    ax2.axvline(x=WARM_UP_STEPS, color='gray', linestyle=':')\n",
    "\n",
    "ax2.set_ylabel('Token Ratio')\n",
    "ax2.set_title('2. Cost Efficiency (Lower is Better)')\n",
    "ax2.legend(loc='upper right')\n",
    "ax2.grid(True, alpha=0.3)\n",
    "\n",
    "# === Subplot 3: expert call ratio ===\n",
    "plot_with_std(ax3, steps, naive_mean_expert, naive_std_expert, 'BPAC-Naive', 'blue', '--')\n",
    "plot_with_std(ax3, steps, ips_mean_expert, ips_std_expert, 'BPAC-IPS', 'green', '-.')\n",
    "plot_with_std(ax3, steps, bpac_mean_expert, bpac_std_expert, 'BPAC (Ours)', 'red')\n",
    "\n",
    "if WARM_UP_STEPS > 0:\n",
    "    ax3.axvline(x=WARM_UP_STEPS, color='gray', linestyle=':')\n",
    "\n",
    "ax3.set_ylabel('Expert Call Ratio')\n",
    "ax3.set_title('3. Expert Usage Behavior')\n",
    "ax3.grid(True, alpha=0.3)\n",
    "ax3.set_ylim(-0.05, 1.05)\n",
    "\n",
    "# === Subplot 4: threshold evolution ===\n",
    "plot_with_std(ax4, steps, naive_mean_threshold, naive_std_threshold, r'BPAC-Naive $\\hat{u}$', 'blue', '--')\n",
    "plot_with_std(ax4, steps, ips_mean_threshold, ips_std_threshold, 'BPAC-IPS $u_t$', 'green', '-.')\n",
    "plot_with_std(ax4, steps, bpac_mean_threshold, bpac_std_threshold, 'BPAC $u_t$', 'red')\n",
    "\n",
    "if WARM_UP_STEPS > 0:\n",
    "    ax4.axvline(x=WARM_UP_STEPS, color='gray', linestyle=':')\n",
    "\n",
    "ax4.set_ylabel('Threshold')\n",
    "ax4.set_title('4. Threshold Adaptation Strategy')\n",
    "ax4.legend(loc='center right')\n",
    "ax4.grid(True, alpha=0.3)\n",
    "ax4.set_ylim(-0.05, 1.05)\n",
    "\n",
    "# === Subplot 5: wealth evolution (BPAC vs. IPS) ===\n",
    "# The naive variant is left out of this panel; only BPAC and IPS are drawn\n",
    "plot_with_std(ax5, steps, ips_mean_wealth, ips_std_wealth, 'BPAC-IPS Wealth', 'green', '-.')\n",
    "plot_with_std(ax5, steps, bpac_mean_wealth, bpac_std_wealth, 'BPAC Wealth', 'red')\n",
    "\n",
    "ax5.axhline(y=1.0, color='black', linestyle='--', label='Initial Wealth')\n",
    "if WARM_UP_STEPS > 0:\n",
    "    ax5.axvline(x=WARM_UP_STEPS, color='gray', linestyle=':')\n",
    "\n",
    "# Log scale, per the original note that wealth tends to grow exponentially\n",
    "ax5.set_yscale('log')\n",
    "\n",
    "ax5.set_ylabel('Wealth (log scale)')\n",
    "ax5.set_xlabel('Time Step')\n",
    "ax5.set_title('5. Martingale Wealth Process Comparison')\n",
    "ax5.legend(loc='upper left')\n",
    "ax5.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ebc57f48",
   "metadata": {},
   "source": [
    "#### paper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a12f0b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "\n",
    "# ==========================================\n",
    "# 1. Statistics and helper functions\n",
    "# ==========================================\n",
    "SHADOW_TYPE = 'sem'  # band half-width: 'std' or 'sem' (any other value falls back to 'std')\n",
    "CALIB_NUM =0\n",
    "def get_error_bounds(data_array, type='sem', scale=1):\n",
    "    \"\"\"Return ``(mean, lower, upper)`` along axis 0, each multiplied by ``scale``.\n",
    "\n",
    "    ``type`` selects the band half-width: 'sem' uses ``scipy.stats.sem``; 'std' and\n",
    "    any other value use ``np.std``. The signature accepts ``scale`` so that it stays\n",
    "    call-compatible with the other ``get_error_bounds`` definitions in this notebook,\n",
    "    which are called with ``scale=100``.\n",
    "    A 1-D input or a single run has no spread, so the band collapses onto the data.\n",
    "    \"\"\"\n",
    "    data_array = np.asarray(data_array)\n",
    "    if data_array.ndim == 1 or data_array.shape[0] <= 1:\n",
    "        mean_val = data_array.flatten() * scale\n",
    "        return mean_val, mean_val, mean_val\n",
    "\n",
    "    mean_val = np.mean(data_array, axis=0) * scale\n",
    "    if type == 'sem':\n",
    "        dev_val = stats.sem(data_array, axis=0) * scale\n",
    "    else:\n",
    "        dev_val = np.std(data_array, axis=0) * scale\n",
    "    lower = mean_val - dev_val\n",
    "    upper = mean_val + dev_val\n",
    "        \n",
    "    return mean_val, lower, upper\n",
    "\n",
    "def clip_bounds(low, high, min_v=0, max_v=None):\n",
    "    \"\"\"Clamp ``low`` from below at ``min_v`` and, if ``max_v`` is given, ``high`` from above.\"\"\"\n",
    "    low = np.maximum(low, min_v)\n",
    "    if max_v is not None:\n",
    "        high = np.minimum(high, max_v)\n",
    "    return low, high\n",
    "\n",
    "# ==========================================\n",
    "# 2. Compute the plotted series\n",
    "# ==========================================\n",
    "# Assumes every stacked array has the same number of columns as bpac_risks_arr\n",
    "steps = np.arange(bpac_risks_arr.shape[1])\n",
    "\n",
    "# --- Risk (ER), kept as a fraction ---\n",
    "naive_risk_m, naive_risk_l, naive_risk_h = get_error_bounds(bpaconaive_risks_arr, SHADOW_TYPE)\n",
    "ips_risk_m, ips_risk_l, ips_risk_h = get_error_bounds(bpacips_risks_arr, SHADOW_TYPE)\n",
    "bpac_risk_m, bpac_risk_l, bpac_risk_h = get_error_bounds(bpac_risks_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- ECP (expert call %), scaled to percent to match the axis label ---\n",
    "naive_ecp_m, naive_ecp_l, naive_ecp_h = get_error_bounds(bpaconaive_expert_ratios_arr, SHADOW_TYPE, scale=100)\n",
    "ips_ecp_m, ips_ecp_l, ips_ecp_h = get_error_bounds(bpacips_expert_ratios_arr, SHADOW_TYPE, scale=100)\n",
    "bpac_ecp_m, bpac_ecp_l, bpac_ecp_h = get_error_bounds(bpac_expert_ratios_arr, SHADOW_TYPE, scale=100)\n",
    "\n",
    "# --- TCP (token cost %), scaled to percent to match the axis label ---\n",
    "naive_tcp_m, naive_tcp_l, naive_tcp_h = get_error_bounds(bpaconaive_token_ratios_arr, SHADOW_TYPE, scale=100)\n",
    "ips_tcp_m, ips_tcp_l, ips_tcp_h = get_error_bounds(bpacips_token_ratios_arr, SHADOW_TYPE, scale=100)\n",
    "bpac_tcp_m, bpac_tcp_l, bpac_tcp_h = get_error_bounds(bpac_token_ratios_arr, SHADOW_TYPE, scale=100)\n",
    "\n",
    "# Clamp bands to the valid range of each quantity\n",
    "naive_risk_l, naive_risk_h = clip_bounds(naive_risk_l, naive_risk_h, 0, 1.0)\n",
    "ips_risk_l, ips_risk_h = clip_bounds(ips_risk_l, ips_risk_h, 0, 1.0)\n",
    "bpac_risk_l, bpac_risk_h = clip_bounds(bpac_risk_l, bpac_risk_h, 0, 1.0)\n",
    "\n",
    "naive_ecp_l, naive_ecp_h = clip_bounds(naive_ecp_l, naive_ecp_h, 0, 100.0)\n",
    "ips_ecp_l, ips_ecp_h = clip_bounds(ips_ecp_l, ips_ecp_h, 0, 100.0)\n",
    "bpac_ecp_l, bpac_ecp_h = clip_bounds(bpac_ecp_l, bpac_ecp_h, 0, 100.0)\n",
    "\n",
    "naive_tcp_l, naive_tcp_h = clip_bounds(naive_tcp_l, naive_tcp_h, 0, None)\n",
    "ips_tcp_l, ips_tcp_h = clip_bounds(ips_tcp_l, ips_tcp_h, 0, None)\n",
    "bpac_tcp_l, bpac_tcp_h = clip_bounds(bpac_tcp_l, bpac_tcp_h, 0, None)\n",
    "\n",
    "# ==========================================\n",
    "# 3. Plotting (ICML style, three methods)\n",
    "# ==========================================\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,\n",
    "    'axes.labelsize': 20,\n",
    "    'xtick.labelsize': 16,\n",
    "    'ytick.labelsize': 16,\n",
    "    'legend.fontsize': 16,\n",
    "    'lines.linewidth': 3.0,\n",
    "    'axes.grid': True, \n",
    "    'grid.alpha': 0.3, \n",
    "    'grid.linestyle': '--',\n",
    "    'figure.autolayout': True,\n",
    "    'savefig.dpi': 300\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "# Colour definitions\n",
    "C_NAIVE = '#1f77b4'  # Blue\n",
    "C_IPS = '#2ca02c'    # Green\n",
    "C_OURS = '#d62728'   # Red\n",
    "C_TGT = 'black'      # Tolerance Line\n",
    "C_WARM = 'gray'      # Warmup Line\n",
    "\n",
    "# --- (a) Risk ---\n",
    "ax = axes[0]\n",
    "# 1. Naive\n",
    "ax.plot(steps, naive_risk_m, color=C_NAIVE, linestyle='--', label='O-Naive')\n",
    "ax.fill_between(steps, naive_risk_l, naive_risk_h, color=C_NAIVE, alpha=0.15, edgecolor='none')\n",
    "# 2. IPS\n",
    "ax.plot(steps, ips_risk_m, color=C_IPS, linestyle='-.', label='IPS+Hoeff')\n",
    "ax.fill_between(steps, ips_risk_l, ips_risk_h, color=C_IPS, alpha=0.15, edgecolor='none')\n",
    "# 3. Ours\n",
    "ax.plot(steps, bpac_risk_m, color=C_OURS, linestyle='-', label='B-PAC (Ours)')\n",
    "ax.fill_between(steps, bpac_risk_l, bpac_risk_h, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.axhline(y=TARGET_EPSILON, color=C_TGT, linestyle='--', linewidth=2, label='Tolerance')\n",
    "# Only mark the calibration boundary when there is one (CALIB_NUM == 0 would draw on the y axis)\n",
    "if CALIB_NUM > 0:\n",
    "    ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel('ER', fontsize=22)\n",
    "ax.set_xlabel('Time Step')\n",
    "# y limit: 1.1 x the larger of 1.5 x target and the last-50-step maximum of all three methods\n",
    "max_risk_show = max(\n",
    "    TARGET_EPSILON * 1.5,\n",
    "    np.max(naive_risk_m[-50:]),\n",
    "    np.max(ips_risk_m[-50:]),\n",
    "    np.max(bpac_risk_m[-50:]),\n",
    ") * 1.1\n",
    "ax.set_ylim(0, max_risk_show)\n",
    "\n",
    "# Legend\n",
    "ax.legend(loc='upper right', frameon=True, framealpha=0.95, fontsize=14)\n",
    "\n",
    "# --- (b) ECP ---\n",
    "ax = axes[1]\n",
    "ax.plot(steps, naive_ecp_m, color=C_NAIVE, linestyle='--')\n",
    "ax.fill_between(steps, naive_ecp_l, naive_ecp_h, color=C_NAIVE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, ips_ecp_m, color=C_IPS, linestyle='-.')\n",
    "ax.fill_between(steps, ips_ecp_l, ips_ecp_h, color=C_IPS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_ecp_m, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_ecp_l, bpac_ecp_h, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "if CALIB_NUM > 0:\n",
    "    ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "ax.set_ylabel('ECP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 105)\n",
    "\n",
    "# --- (c) TCP ---\n",
    "ax = axes[2]\n",
    "ax.plot(steps, naive_tcp_m, color=C_NAIVE, linestyle='--')\n",
    "ax.fill_between(steps, naive_tcp_l, naive_tcp_h, color=C_NAIVE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, ips_tcp_m, color=C_IPS, linestyle='-.')\n",
    "ax.fill_between(steps, ips_tcp_l, ips_tcp_h, color=C_IPS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_tcp_m, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_tcp_l, bpac_tcp_h, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "# Full-expert reference at 100 %\n",
    "ax.axhline(y=100, color='black', linestyle='-.', linewidth=1.5, label='Full Expert')\n",
    "if CALIB_NUM > 0:\n",
    "    ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel('TP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "\n",
    "plt.tight_layout()\n",
    "# plt.savefig('comparison_3methods.pdf', bbox_inches='tight') # uncomment to save the figure\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a099a042",
   "metadata": {},
   "source": [
    "#### 不同epsilon画图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f761dd2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import pickle  # 用于保存结果到硬盘，防止跑太久丢失\n",
    "\n",
    "# ================= Configuration =================\n",
    "# Experiment parameters\n",
    "NUM_RUNS = 100        # number of repetitions per epsilon\n",
    "WARM_UP_STEPS = 0     \n",
    "EPSILON_LIST = [0.05, 0.06, 0.07, 0.08, 0.09, 0.10] # <--- epsilon values to sweep over\n",
    "\n",
    "# Fixed parameters (held constant across the sweep)\n",
    "CFG_ALPHA = 0.1       \n",
    "CFG_RHO = 0       \n",
    "BPA_beta = 1\n",
    "CFG_RHO_0 = 0.05\n",
    "CFG_RHO_1 = 0.7\n",
    "CFG_CHANGE = 200\n",
    "C_CLIP = 0.9\n",
    "\n",
    "# Result container: key is the epsilon value, value is a dict of per-metric arrays\n",
    "experiment_results = {} \n",
    "\n",
    "def run_single_epsilon_experiment(target_eps):\n",
    "    \"\"\"\n",
    "    Run NUM_RUNS seeded simulations with ``epsilon=target_eps`` and collect the metrics.\n",
    "\n",
    "    Returns a dict with the keys \"risks\", \"tokens\", \"expert_calls\", \"wealths\" and\n",
    "    \"thresholds\"; each value is an array holding one row per run.\n",
    "    \"\"\"\n",
    "    # Result key -> df_result column it is collected from (insertion order is the output order)\n",
    "    column_of = {\n",
    "        \"risks\": 'avg_risk',\n",
    "        \"tokens\": 'token_ratio',\n",
    "        \"expert_calls\": 'expert_call_ratio',\n",
    "        \"wealths\": 'wealth',\n",
    "        \"thresholds\": 'threshold',\n",
    "    }\n",
    "    collected = {key: [] for key in column_of}\n",
    "\n",
    "    # The progress-bar description shows which epsilon is currently running\n",
    "    for run_seed in tqdm(range(NUM_RUNS), desc=f\"Simulating Eps={target_eps}\", leave=False):\n",
    "        # Seeding by run index gives every epsilon the same sequence of data orderings\n",
    "        order = np.arange(len(data_list))\n",
    "        np.random.default_rng(run_seed).shuffle(order)\n",
    "        run_data = [data_list[k] for k in order]\n",
    "\n",
    "        # Only epsilon varies between sweeps; everything else comes from the fixed constants\n",
    "        run_cfg = BPACConfig(\n",
    "            epsilon=target_eps,\n",
    "            alpha=CFG_ALPHA,\n",
    "            rho=CFG_RHO,\n",
    "            warm_up=WARM_UP_STEPS,\n",
    "            num_thresholds=1001,\n",
    "            beta=BPA_beta,\n",
    "            c_clip=C_CLIP,\n",
    "            rho_0=CFG_RHO_0,\n",
    "            rho_1=CFG_RHO_1,\n",
    "            change_point=CFG_CHANGE,\n",
    "        )\n",
    "\n",
    "        run_df, _ = run_simulation(run_data, run_cfg)\n",
    "\n",
    "        for key, column in column_of.items():\n",
    "            collected[key].append(run_df[column].values)\n",
    "\n",
    "    # Stack each metric's per-run rows into one array\n",
    "    return {key: np.array(rows) for key, rows in collected.items()}\n",
    "\n",
    "# ================= Main loop =================\n",
    "print(f\"Starting Multi-Epsilon Simulation for: {EPSILON_LIST}\")\n",
    "\n",
    "# Run the sweep once per epsilon and store each result bundle under its epsilon key\n",
    "for eps in EPSILON_LIST:\n",
    "    print(f\"\\n>>> Processing Target Epsilon: {eps}\")\n",
    "    result_bundle = run_single_epsilon_experiment(eps)\n",
    "    experiment_results[eps] = result_bundle\n",
    "\n",
    "print(\"\\nAll experiments finished!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce3ed966",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "import matplotlib.cm as cm\n",
    "from matplotlib import colors as mcolors\n",
    "# ==========================================\n",
    "# 1. Statistics and helper functions\n",
    "# ==========================================\n",
    "SHADOW_TYPE = 'std'  # 'std' or 'sem'; the original note recommends 'std' for visibility\n",
    "CALIB_NUM = 0\n",
    "\n",
    "def get_error_bounds(data_array, type='std', scale=1):\n",
    "    \"\"\"Return ``(mean, lower, upper)`` along axis 0, each multiplied by ``scale``.\n",
    "\n",
    "    ``type`` selects the band half-width: 'sem' uses ``scipy.stats.sem``; 'std' and\n",
    "    any other value use ``np.std``. A 1-D input or a single run has no spread, so\n",
    "    the flattened data is returned for all three outputs.\n",
    "    \"\"\"\n",
    "    if data_array.ndim == 1 or data_array.shape[0] <= 1:\n",
    "        mean_val = data_array.flatten() * scale\n",
    "        return mean_val, mean_val, mean_val\n",
    "\n",
    "    mean_val = np.mean(data_array, axis=0) * scale\n",
    "    \n",
    "    if type == 'std':\n",
    "        dev_val = np.std(data_array, axis=0) * scale\n",
    "    elif type == 'sem':\n",
    "        dev_val = stats.sem(data_array, axis=0) * scale\n",
    "    else:\n",
    "        dev_val = np.std(data_array, axis=0) * scale\n",
    "        \n",
    "    lower = mean_val - dev_val\n",
    "    upper = mean_val + dev_val\n",
    "        \n",
    "    return mean_val, lower, upper\n",
    "\n",
    "# ==========================================\n",
    "# 2. Plot settings (ICML style)\n",
    "# ==========================================\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,\n",
    "    'axes.labelsize': 20,\n",
    "    'xtick.labelsize': 16,\n",
    "    'ytick.labelsize': 16,\n",
    "    'legend.fontsize': 16,\n",
    "    'lines.linewidth': 3.0,\n",
    "    'axes.grid': True, \n",
    "    'grid.alpha': 0.3, \n",
    "    'grid.linestyle': '--',\n",
    "    'figure.autolayout': True,\n",
    "    'savefig.dpi': 300\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "# Curves are drawn in ascending epsilon order, one gradient colour per value.\n",
    "# sorted_epsilons = sorted(experiment_results.keys()) # alternative: take the keys actually stored\n",
    "sorted_epsilons = sorted(EPSILON_LIST) # every entry must be a key of experiment_results\n",
    "num_eps = len(sorted_epsilons)\n",
    "\n",
    "\n",
    "# Blue -> green -> red gradient\n",
    "colors_nodes = ['#1f77b4', '#2ca02c', '#d62728'] \n",
    "custom_cmap = mcolors.LinearSegmentedColormap.from_list(\"BlueGreenRed\", colors_nodes)\n",
    "\n",
    "# One colour per epsilon, evenly spaced along the gradient\n",
    "colors = [custom_cmap(x) for x in np.linspace(0, 1, num_eps)]\n",
    "# Time axis taken from the first result; assumes all experiments have the same length\n",
    "first_key = sorted_epsilons[0]\n",
    "steps = np.arange(experiment_results[first_key]['risks'].shape[1])\n",
    "\n",
    "# ==========================================\n",
    "# 3. Plot one curve per epsilon on each axis\n",
    "# ==========================================\n",
    "\n",
    "for idx, eps in enumerate(sorted_epsilons):\n",
    "    bundle = experiment_results[eps]\n",
    "    color = colors[idx]\n",
    "    # Raw f-string keeps the mathtext backslash literal (no invalid '\\e' escape sequence)\n",
    "    label_str = rf'$\\epsilon={eps}$'\n",
    "    \n",
    "    # --- 1. Risk (ER) ---\n",
    "    # 'risks' is the key written by run_single_epsilon_experiment (one row per run)\n",
    "    risks_data = bundle['risks']\n",
    "    \n",
    "    r_mean, r_low, r_high = get_error_bounds(risks_data, SHADOW_TYPE)\n",
    "    \n",
    "    ax = axes[0]\n",
    "    ax.plot(steps, r_mean, color=color, linestyle='-', label=label_str)\n",
    "    ax.fill_between(steps, r_low, r_high, color=color, alpha=0.15) # faint band so overlapping curves stay readable\n",
    "\n",
    "    # --- 2. ECP ---\n",
    "    # scale=100 converts the ratio to percent\n",
    "    ecp_data = bundle['expert_calls'] # key written by run_single_epsilon_experiment\n",
    "    e_mean, e_low, e_high = get_error_bounds(ecp_data, SHADOW_TYPE, scale=100)\n",
    "    \n",
    "    ax = axes[1]\n",
    "    ax.plot(steps, e_mean, color=color, linestyle='-')\n",
    "    ax.fill_between(steps, e_low, e_high, color=color, alpha=0.15)\n",
    "\n",
    "    # --- 3. TCP ---\n",
    "    # scale=100 converts the ratio to percent\n",
    "    tcp_data = bundle['tokens'] # key written by run_single_epsilon_experiment\n",
    "    t_mean, t_low, t_high = get_error_bounds(tcp_data, SHADOW_TYPE, scale=100)\n",
    "    \n",
    "    ax = axes[2]\n",
    "    ax.plot(steps, t_mean, color=color, linestyle='-')\n",
    "    ax.fill_between(steps, t_low, t_high, color=color, alpha=0.15)\n",
    "\n",
    "\n",
    "# ==========================================\n",
    "# 4. Axis labels and limits\n",
    "# ==========================================\n",
    "\n",
    "# --- Ax0: Risk ---\n",
    "ax = axes[0]\n",
    "ax.set_ylabel('ER', fontsize=22)\n",
    "ax.set_xlabel('Time Step')\n",
    "# ax.set_ylim(0, max(sorted_epsilons)*2.0) # alternative: scale with the largest epsilon\n",
    "ax.set_ylim(0, 0.1) # fixed by hand\n",
    "ax.legend(loc='upper right', frameon=True, framealpha=0.95, fontsize=12, ncol=2) # two-column legend\n",
    "ax.set_title(r\"Varying Tolerance ($\\epsilon$)\")\n",
    "\n",
    "# --- Ax1: ECP ---\n",
    "ax = axes[1]\n",
    "ax.set_ylabel('ECP (%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 100)\n",
    "# ax.legend() # the legend on the first axis already covers all curves\n",
    "\n",
    "# --- Ax2: TCP ---\n",
    "ax = axes[2]\n",
    "ax.set_ylabel('TP (%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.axhline(y=100, color='black', linestyle='-.', linewidth=1.5, label='Full Expert', zorder=1)\n",
    "\n",
    "# Save the figure\n",
    "plt.tight_layout()\n",
    "plt.savefig('epsilon_sensitivity_magpie.pdf', bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a76d2c64",
   "metadata": {},
   "source": [
    "#### 不同step画图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63b97fe7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import pickle  # 用于保存结果到硬盘，防止跑太久丢失\n",
    "\n",
    "# ================= Configuration =================\n",
    "# Experiment parameters\n",
    "NUM_RUNS = 100        # number of seeded repetitions per sweep value\n",
    "WARM_UP_STEPS = 0     # passed to BPACConfig as warm_up\n",
    "CFG_CHANGES = [10,50,100,200,300,500] # values swept below; each one is passed to BPACConfig as change_point (not epsilon)\n",
    "\n",
    "# Fixed parameters (identical for every sweep value)\n",
    "CFG_ALPHA = 0.1       # BPACConfig.alpha\n",
    "CFG_RHO = 0       # BPACConfig.rho\n",
    "BPA_beta = 1  # BPACConfig.beta\n",
    "CFG_RHO_0 = 0.05  # BPACConfig.rho_0\n",
    "CFG_RHO_1 = 0.7  # BPACConfig.rho_1\n",
    "CFG_EPSILON = 0.08  # BPACConfig.epsilon, held constant during this sweep\n",
    "# CFG_CHANGE = 200  (single change point; replaced in this cell by the CFG_CHANGES sweep)\n",
    "C_CLIP = 0.9  # BPACConfig.c_clip\n",
    "\n",
    "# Result store: key = swept change_point value, value = dict of per-metric arrays\n",
    "experiment_results = {} \n",
    "\n",
    "def run_single_epsilon_experiment(target_steps):\n",
    "    \"\"\"\n",
    "    Run NUM_RUNS seeded B-PAC simulations with `change_point=target_steps`.\n",
    "\n",
    "    Despite the legacy name, epsilon is fixed at CFG_EPSILON here; the quantity\n",
    "    that varies between calls is the `change_point` field of BPACConfig.\n",
    "\n",
    "    Returns a dict with the keys 'risks', 'tokens', 'expert_calls', 'wealths'\n",
    "    and 'thresholds'. Each value stacks the matching column of the simulation\n",
    "    result into one array with one row per seed.\n",
    "    \"\"\"\n",
    "    # Output key -> column of the DataFrame returned by run_simulation.\n",
    "    column_for = {\n",
    "        \"risks\": 'avg_risk',\n",
    "        \"tokens\": 'token_ratio',\n",
    "        \"expert_calls\": 'expert_call_ratio',\n",
    "        \"wealths\": 'wealth',\n",
    "        \"thresholds\": 'threshold',\n",
    "    }\n",
    "    collected = {key: [] for key in column_for}\n",
    "\n",
    "    # The tqdm description shows which sweep value is currently running.\n",
    "    for seed in tqdm(range(NUM_RUNS), desc=f\"Simulating Steps={target_steps}\", leave=False):\n",
    "        # The permutation depends only on the seed, so every sweep value is\n",
    "        # evaluated on the same sequence of data orderings.\n",
    "        order = np.arange(len(data_list))\n",
    "        np.random.default_rng(seed).shuffle(order)\n",
    "        shuffled_data = [data_list[i] for i in order]\n",
    "\n",
    "        # Only change_point differs from one sweep value to the next.\n",
    "        cfg = BPACConfig(\n",
    "            epsilon=CFG_EPSILON, alpha=CFG_ALPHA,\n",
    "            rho=CFG_RHO, warm_up=WARM_UP_STEPS,\n",
    "            num_thresholds=1001, beta=BPA_beta,\n",
    "            c_clip=C_CLIP, rho_0=CFG_RHO_0,\n",
    "            rho_1=CFG_RHO_1, change_point=target_steps,\n",
    "        )\n",
    "\n",
    "        df_result, _ = run_simulation(shuffled_data, cfg)\n",
    "\n",
    "        for key, column in column_for.items():\n",
    "            collected[key].append(df_result[column].values)\n",
    "\n",
    "    # One array per metric, one row per run.\n",
    "    return {key: np.array(runs) for key, runs in collected.items()}\n",
    "\n",
    "# ================= Main loop =================\n",
    "# The sweep is over change_point values (CFG_CHANGES) while epsilon stays at CFG_EPSILON,\n",
    "# so the progress messages name the quantity that actually varies.\n",
    "print(f\"Starting change-point sweep for: {CFG_CHANGES} (epsilon fixed at {CFG_EPSILON})\")\n",
    "\n",
    "for change_point in CFG_CHANGES:\n",
    "    print(f\"\\n>>> Processing change point: {change_point}\")\n",
    "    result_bundle = run_single_epsilon_experiment(change_point)\n",
    "    experiment_results[change_point] = result_bundle\n",
    "\n",
    "print(\"\\nAll experiments finished!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85b6f151",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "from matplotlib import colors as mcolors\n",
    "# ==========================================\n",
    "# 1. 统计与辅助函数 (保持不变)\n",
    "# ==========================================\n",
    "SHADOW_TYPE = 'std'  # 推荐使用 std 看得更清楚\n",
    "CALIB_NUM = 0\n",
    "\n",
    "def get_error_bounds(data_array, type='std', scale=1):\n",
    "    \"\"\"计算均值和阴影上下界\"\"\"\n",
    "    if data_array.ndim == 1 or data_array.shape[0] <= 1:\n",
    "        mean_val = data_array.flatten() * scale\n",
    "        return mean_val, mean_val, mean_val\n",
    "\n",
    "    mean_val = np.mean(data_array, axis=0) * scale\n",
    "    \n",
    "    if type == 'std':\n",
    "        dev_val = np.std(data_array, axis=0) * scale\n",
    "    elif type == 'sem':\n",
    "        dev_val = stats.sem(data_array, axis=0) * scale\n",
    "    else:\n",
    "        dev_val = np.std(data_array, axis=0) * scale\n",
    "        \n",
    "    lower = mean_val - dev_val\n",
    "    upper = mean_val + dev_val\n",
    "        \n",
    "    return mean_val, lower, upper\n",
    "\n",
    "# ==========================================\n",
    "# 2. Plot settings (ICML style)\n",
    "# ==========================================\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,\n",
    "    'axes.labelsize': 20,\n",
    "    'xtick.labelsize': 16,\n",
    "    'ytick.labelsize': 16,\n",
    "    'legend.fontsize': 16,\n",
    "    'lines.linewidth': 3.0,\n",
    "    'axes.grid': True, \n",
    "    'grid.alpha': 0.3, \n",
    "    'grid.linestyle': '--',\n",
    "    'figure.autolayout': True,\n",
    "    'savefig.dpi': 300\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "sorted_epsilons = sorted(CFG_CHANGES) # despite the name, these are the swept change_point values\n",
    "num_eps = len(sorted_epsilons)\n",
    "\n",
    "\n",
    "# Blue -> green -> red gradient, so each sweep value gets its own colour\n",
    "colors_nodes = ['#1f77b4', '#2ca02c', '#d62728'] \n",
    "custom_cmap = mcolors.LinearSegmentedColormap.from_list(\"BlueGreenRed\", colors_nodes)\n",
    "\n",
    "# Sample num_eps evenly spaced colours from the colormap\n",
    "colors = [custom_cmap(x) for x in np.linspace(0, 1, num_eps)]\n",
    "# Time axis: length is read from the first result (assumes every experiment has the same length)\n",
    "first_key = sorted_epsilons[0]\n",
    "steps = np.arange(experiment_results[first_key]['risks'].shape[1])\n",
    "\n",
    "# ==========================================\n",
    "# 3. Plot one curve per sweep value\n",
    "# ==========================================\n",
    "\n",
    "for idx, eps in enumerate(sorted_epsilons):\n",
    "    bundle = experiment_results[eps]\n",
    "    color = colors[idx]\n",
    "    label_str = f'$T_{{warm}}={eps}$'\n",
    "    \n",
    "\n",
    "    # --- 1. Risk (ER) ---\n",
    "    # 'risks' holds the stacked avg_risk curves stored by run_single_epsilon_experiment\n",
    "    risks_data = bundle['risks']\n",
    "    \n",
    "    r_mean, r_low, r_high = get_error_bounds(risks_data, SHADOW_TYPE)\n",
    "    \n",
    "    ax = axes[0]\n",
    "    ax.plot(steps, r_mean, color=color, linestyle='-', label=label_str)\n",
    "    ax.fill_between(steps, r_low, r_high, color=color, alpha=0.15) # light shading so overlapping bands stay readable\n",
    "\n",
    "    # --- 2. ECP ---\n",
    "    # scale=100 so the curve matches the percent axis label\n",
    "    ecp_data = bundle['expert_calls'] # key written by run_single_epsilon_experiment\n",
    "    e_mean, e_low, e_high = get_error_bounds(ecp_data, SHADOW_TYPE, scale=100)\n",
    "    \n",
    "    ax = axes[1]\n",
    "    ax.plot(steps, e_mean, color=color, linestyle='-')\n",
    "    ax.fill_between(steps, e_low, e_high, color=color, alpha=0.15)\n",
    "\n",
    "    # --- 3. TCP ---\n",
    "    # scale=100 so the curve matches the percent axis label\n",
    "    tcp_data = bundle['tokens'] # key written by run_single_epsilon_experiment\n",
    "    t_mean, t_low, t_high = get_error_bounds(tcp_data, SHADOW_TYPE, scale=100)\n",
    "    \n",
    "    ax = axes[2]\n",
    "    ax.plot(steps, t_mean, color=color, linestyle='-')\n",
    "    ax.fill_between(steps, t_low, t_high, color=color, alpha=0.15)\n",
    "\n",
    "\n",
    "# ==========================================\n",
    "# 4. Finishing touches (axis labels & limits)\n",
    "# ==========================================\n",
    "\n",
    "# --- Ax0: Risk ---\n",
    "ax = axes[0]\n",
    "ax.set_ylabel('ER', fontsize=22)\n",
    "ax.set_xlabel('Time Step')\n",
    "# Tolerance line. Drawn at CFG_EPSILON, the epsilon every run of this sweep was configured\n",
    "# with, instead of TARGET_EPSILON / C_TGT: this experiment never defines those names, so they\n",
    "# would only resolve if an unrelated section of the notebook had been executed beforehand.\n",
    "ax.axhline(y=CFG_EPSILON, color='black', linestyle='--', linewidth=2, label='Tolerance')\n",
    "ax.set_ylim(0, 0.16)  # fixed by hand\n",
    "ax.legend(loc='upper right', frameon=True, framealpha=0.95, fontsize=12, ncol=2)  # two-column legend\n",
    "\n",
    "# --- Ax1: ECP ---\n",
    "ax = axes[1]\n",
    "ax.set_ylabel('ECP (%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 100)\n",
    "# No legend here: the one on the first panel is the only legend of the figure.\n",
    "\n",
    "# --- Ax2: TCP ---\n",
    "ax = axes[2]\n",
    "ax.set_ylabel('TP (%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "# Horizontal reference line at 100, labelled 'Full Expert'.\n",
    "ax.axhline(y=100, color='black', linestyle='-.', linewidth=1.5, label='Full Expert', zorder=1)\n",
    "\n",
    "# Save and display\n",
    "plt.tight_layout()\n",
    "plt.savefig('warm_step_magpie.pdf', bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "72e2c13d",
   "metadata": {},
   "source": [
    "## Qwen2.5-ins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d261a57b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# 1. Load the per-sample result files of the two models.\n",
    "# NOTE(review): absolute, machine-specific paths - consider a configurable data directory.\n",
    "expert  = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs/magpie/qwen3-think.json\")\n",
    "instant = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs/magpie/qwen2.5-7b.json\")\n",
    "\n",
    "# ======================\n",
    "# 2. Merge on the row index\n",
    "# ======================\n",
    "# Only gpt4_score is taken from `instant`; the two score columns end up as\n",
    "# gpt4_score_expert and gpt4_score_instant.\n",
    "df = expert.merge(\n",
    "    instant[[\"gpt4_score\"]],\n",
    "    left_index=True,\n",
    "    right_index=True,\n",
    "    suffixes=(\"_expert\", \"_instant\")\n",
    ")\n",
    "\n",
    "# ======================\n",
    "# 3. Boolean masks\n",
    "# ======================\n",
    "# valid: both scores present\n",
    "valid_mask = (\n",
    "    df[\"gpt4_score_expert\"].notna() &\n",
    "    df[\"gpt4_score_instant\"].notna()\n",
    ")\n",
    "\n",
    "# better: valid rows where the expert score is at least the instant score\n",
    "better_mask = valid_mask & (\n",
    "    df[\"gpt4_score_expert\"] >= df[\"gpt4_score_instant\"]\n",
    ")\n",
    "\n",
    "# ======================\n",
    "# 4. Selection\n",
    "# ======================\n",
    "df_selected = df.loc[better_mask]\n",
    "\n",
    "# Go back to the original frames so that every column of the selected rows is kept\n",
    "expert_data  = expert.loc[df_selected.index]\n",
    "instant_data = instant.loc[df_selected.index]\n",
    "\n",
    "# ======================\n",
    "# 5. Summary statistics\n",
    "# ======================\n",
    "total_valid = valid_mask.sum()\n",
    "num_better  = better_mask.sum()\n",
    "\n",
    "print(f\"Total valid paired samples (after remove NaN): {total_valid}\")\n",
    "print(f\"Cases where Think/Expert score >= Instant score: {num_better}\")\n",
    "# NOTE(review): the two ratios below divide by total_valid without guarding against zero.\n",
    "print(f\"Ratio: {num_better / total_valid:.4f}\")\n",
    "print(f\"Percentage: {num_better / total_valid * 100:.1f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67109744",
   "metadata": {},
   "outputs": [],
   "source": [
    "instant_data.head(1)  # not the last expression of the cell, so nothing is displayed\n",
    "data_list =[]\n",
    "selcet_ids =[]\n",
    "# Build one record per expert row, paired with the instant-model row that shares its session_id.\n",
    "for i,row in expert_data.iterrows():\n",
    "    session_id = row['session_id']\n",
    "    # Skip expert rows that have no counterpart in instant_data.\n",
    "    if session_id not in instant_data['session_id'].values:\n",
    "        print(f\"Instant data missing at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    tmp_dict = {}\n",
    "    # First instant row with this session_id; any further matches are ignored.\n",
    "    instant_row = instant_data[instant_data['session_id'] == session_id].iloc[0]\n",
    "    # NOTE(review): the id is recorded before the question check below, so selcet_ids can\n",
    "    # hold ids of rows that never reach data_list - confirm this is intended.\n",
    "    selcet_ids.append(session_id)\n",
    "    # Skip pairs whose question text differs between the two files.\n",
    "    if instant_row[\"question\"] != row[\"question\"]:\n",
    "        print(f\"Question mismatch at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    # Uncertainty = 1 - first entry of token_probs\n",
    "    # (presumably the probability of the first generated token - TODO confirm).\n",
    "    tmp_dict['uncertainty'] = 1- instant_row['token_probs'][0]\n",
    "    tmp_dict['instant_correct'] = float(instant_row['gpt4_score'])\n",
    "    tmp_dict['expert_correct'] = float(expert_data.loc[i, \"gpt4_score\"])\n",
    "    tmp_dict['instant_token'] = instant_row['gen_token_count']\n",
    "    tmp_dict['expert_token'] = expert_data.loc[i, 'gen_token_count']\n",
    "    data_list.append(tmp_dict)\n",
    "\n",
    "print(f\"Total samples prepared: {len(data_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2a516c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list[14]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "632437bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Score gap (instant minus expert) of every prepared sample.\n",
    "diff = np.array([d[\"instant_correct\"] - d[\"expert_correct\"] for d in data_list])\n",
    "\n",
    "# Tukey fences: quartiles widened by 1.5 times the inter-quartile range.\n",
    "q1, q3 = np.percentile(diff, [25, 75])\n",
    "iqr = q3 - q1\n",
    "lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr\n",
    "\n",
    "# Keep the samples whose gap lies inside the fences (bounds included).\n",
    "filtered_data_list = [\n",
    "    d for d, gap in zip(data_list, diff)\n",
    "    if lower <= gap <= upper\n",
    "]\n",
    "\n",
    "print(\"原始样本数:\", len(data_list))\n",
    "print(\"删除后样本数:\", len(filtered_data_list))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "256ae49e",
   "metadata": {},
   "source": [
    "### compare to pac"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d89ceb2a",
   "metadata": {},
   "source": [
    "#### 3baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f355b0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm \n",
    "\n",
    "\n",
    "# Per-sample columns pulled out of data_list (which must already have been built)\n",
    "y_solved = [item['expert_correct'] for item in data_list]\n",
    "y_hat_solved = [item['instant_correct'] for item in data_list]\n",
    "uncertainty_values = [item['uncertainty'] for item in data_list]\n",
    "y_token_list = [item['expert_token'] for item in data_list]\n",
    "y_hat_token_list = [item['instant_token'] for item in data_list]\n",
    "\n",
    "# Parameters\n",
    "NUM_RUNS = 100         # number of repetitions\n",
    "TARGET_EPSILON = 0.08   # target risk\n",
    "TARGET_ALPHA = 0.1     # passed to the baseline as alpha\n",
    "CALIB_NUM = 500        # calibration-set size\n",
    "TOTAL_NUM = len(y_solved)\n",
    "calib_ratio = CALIB_NUM / TOTAL_NUM  # fraction passed to the baseline as calib_ratio\n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 1. Repeated baseline simulation\n",
    "# -------------------------------------------------------------------------\n",
    "\n",
    "# One entry per run; the curve lists are stacked into (NUM_RUNS, N) arrays below\n",
    "all_risks = []\n",
    "all_token_ratios = []\n",
    "all_expert_ratios = []\n",
    "all_u_hats = []\n",
    "\n",
    "print(f\"Starting {NUM_RUNS} runs simulation...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # A different seed per run (0 .. NUM_RUNS-1); presumably it drives the shuffle inside\n",
    "    # run_baseline_continuous - TODO confirm.\n",
    "    # NOTE(review): `df` rebinds the merged DataFrame created by the data-loading cell.\n",
    "    df, u_hat = run_baseline_continuous(\n",
    "        y_solved, \n",
    "        y_hat_solved, \n",
    "        uncertainty_values, \n",
    "        y_token_list, \n",
    "        y_hat_token_list,\n",
    "        calib_ratio=calib_ratio, \n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=TARGET_ALPHA,\n",
    "        seed=seed\n",
    "    )\n",
    "    \n",
    "    # Keep the whole curve of each metric, plus the threshold returned for this run\n",
    "    all_risks.append(df['avg_risk'].values)\n",
    "    all_token_ratios.append(df['token_ratio'].values)\n",
    "    all_expert_ratios.append(df['expert_call_ratio'].values)\n",
    "    all_u_hats.append(u_hat)\n",
    "\n",
    "# Stack into NumPy arrays (one row per run) for mean / spread computations\n",
    "arr_risks = np.array(all_risks)\n",
    "arr_token_ratios = np.array(all_token_ratios)\n",
    "arr_expert_ratios = np.array(all_expert_ratios)\n",
    "arr_thresholds = np.array(all_u_hats)\n",
    "\n",
    "print(\"\\nSimulation Finished!\")\n",
    "print(f\"Average u_hat across runs: {np.mean(all_u_hats):.4f} (std: {np.std(all_u_hats):.4f})\")\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 2. B-PAC and its two ablations (O-Naive, IPS)\n",
    "# -------------------------------------------------------------------------\n",
    "# The three experiments follow one protocol, so the protocol lives in a single helper\n",
    "# instead of three copy-pasted loops. Result names and values are unchanged.\n",
    "\n",
    "# Settings shared by the three methods\n",
    "NUM_RUNS = 100         # repetitions per method\n",
    "WARM_UP_STEPS = 0      # passed to every config as warm_up\n",
    "TARGET_EPSILON = 0.08  # target risk\n",
    "CFG_ALPHA = 0.1\n",
    "# Settings used by the full B-PAC config only\n",
    "BPA_beta = 1\n",
    "CFG_RHO_0 = 0.05\n",
    "CFG_RHO_1 = 0.7\n",
    "CFG_CHANGE = 200\n",
    "C_CLIP = 0.9\n",
    "\n",
    "# Columns of the simulation DataFrame that are collected for every run\n",
    "RESULT_COLUMNS = ('avg_risk', 'token_ratio', 'expert_call_ratio', 'wealth', 'threshold')\n",
    "\n",
    "\n",
    "def _simulate_runs(simulate_fn, make_cfg):\n",
    "    \"\"\"Run `simulate_fn` once per seed in range(NUM_RUNS) on a shuffled copy of data_list.\n",
    "\n",
    "    simulate_fn: callable(shuffled_data, cfg) returning a (DataFrame, extra) pair;\n",
    "        only the DataFrame is used.\n",
    "    make_cfg: zero-argument callable that builds a fresh config for each run.\n",
    "\n",
    "    Returns a dict mapping every name in RESULT_COLUMNS to a list that holds the\n",
    "    values of that column for each run, in seed order.\n",
    "    \"\"\"\n",
    "    per_run = {column: [] for column in RESULT_COLUMNS}\n",
    "    for seed in tqdm(range(NUM_RUNS)):\n",
    "        # The permutation depends only on the seed, so run k of every method\n",
    "        # sees the samples in the same order.\n",
    "        rng = np.random.default_rng(seed)\n",
    "        indices = np.arange(len(data_list))\n",
    "        rng.shuffle(indices)\n",
    "        # data_list is a list of dicts, so it is reordered with a comprehension.\n",
    "        shuffled_data = [data_list[i] for i in indices]\n",
    "\n",
    "        df_result, _ = simulate_fn(shuffled_data, make_cfg())\n",
    "        for column in RESULT_COLUMNS:\n",
    "            per_run[column].append(df_result[column].values)\n",
    "    return per_run\n",
    "\n",
    "\n",
    "# ---- B-PAC (ours): rho = 0 ----\n",
    "CFG_RHO = 0\n",
    "print(f\"Starting BPAC Simulation ({NUM_RUNS} runs)...\")\n",
    "_bpac_runs = _simulate_runs(\n",
    "    run_simulation,\n",
    "    lambda: BPACConfig(\n",
    "        epsilon=TARGET_EPSILON, alpha=CFG_ALPHA, rho=CFG_RHO, warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, beta=BPA_beta, c_clip=C_CLIP,\n",
    "        rho_0=CFG_RHO_0, rho_1=CFG_RHO_1, change_point=CFG_CHANGE,\n",
    "    ),\n",
    ")\n",
    "bpac_risks = _bpac_runs['avg_risk']\n",
    "bpac_token_ratios = _bpac_runs['token_ratio']\n",
    "bpac_expert_ratios = _bpac_runs['expert_call_ratio']\n",
    "bpac_wealths = _bpac_runs['wealth']\n",
    "bpac_thresholds = _bpac_runs['threshold']\n",
    "\n",
    "# Stack into arrays with one row per run\n",
    "bpac_risks_arr = np.array(bpac_risks)\n",
    "bpac_token_ratios_arr = np.array(bpac_token_ratios)\n",
    "bpac_expert_ratios_arr = np.array(bpac_expert_ratios)\n",
    "bpac_wealths_arr = np.array(bpac_wealths)\n",
    "bpac_thresholds_arr = np.array(bpac_thresholds)\n",
    "print(\"BPAC Simulation Finished!\")\n",
    "\n",
    "# ---- Ablations: both use rho = 0.05 and the reduced config ----\n",
    "CFG_RHO = 0.05\n",
    "\n",
    "print(f\"Starting BPAC-Naive Simulation ({NUM_RUNS} runs)...\")\n",
    "_naive_runs = _simulate_runs(\n",
    "    run_simulation_onaive,\n",
    "    lambda: BPACConfignaive(\n",
    "        epsilon=TARGET_EPSILON, alpha=CFG_ALPHA, rho=CFG_RHO,\n",
    "        warm_up=WARM_UP_STEPS, num_thresholds=1001,\n",
    "    ),\n",
    ")\n",
    "bpaconaive_risks = _naive_runs['avg_risk']\n",
    "bpaconaive_token_ratios = _naive_runs['token_ratio']\n",
    "bpaconaive_expert_ratios = _naive_runs['expert_call_ratio']\n",
    "bpaconaive_wealths = _naive_runs['wealth']\n",
    "bpaconaive_thresholds = _naive_runs['threshold']\n",
    "\n",
    "bpaconaive_risks_arr = np.array(bpaconaive_risks)\n",
    "bpaconaive_token_ratios_arr = np.array(bpaconaive_token_ratios)\n",
    "bpaconaive_expert_ratios_arr = np.array(bpaconaive_expert_ratios)\n",
    "bpaconaive_wealths_arr = np.array(bpaconaive_wealths)\n",
    "bpaconaive_thresholds_arr = np.array(bpaconaive_thresholds)\n",
    "print(\"BPAC-Naive Simulation Finished!\")\n",
    "\n",
    "print(f\"Starting BPAC-IPS Simulation ({NUM_RUNS} runs)...\")\n",
    "_ips_runs = _simulate_runs(\n",
    "    run_simulation_ips,\n",
    "    lambda: BPACConfigIPS(\n",
    "        epsilon=TARGET_EPSILON, alpha=CFG_ALPHA, rho=CFG_RHO,\n",
    "        warm_up=WARM_UP_STEPS, num_thresholds=1001,\n",
    "    ),\n",
    ")\n",
    "bpacips_risks = _ips_runs['avg_risk']\n",
    "bpacips_token_ratios = _ips_runs['token_ratio']\n",
    "bpacips_expert_ratios = _ips_runs['expert_call_ratio']\n",
    "bpacips_wealths = _ips_runs['wealth']\n",
    "bpacips_thresholds = _ips_runs['threshold']\n",
    "\n",
    "bpacips_risks_arr = np.array(bpacips_risks)\n",
    "bpacips_token_ratios_arr = np.array(bpacips_token_ratios)\n",
    "bpacips_expert_ratios_arr = np.array(bpacips_expert_ratios)\n",
    "bpacips_wealths_arr = np.array(bpacips_wealths)\n",
    "bpacips_thresholds_arr = np.array(bpacips_thresholds)\n",
    "print(\"BPAC-IPS Simulation Finished!\")\n",
    "# Last expression of the cell: displayed as the cell output\n",
    "bpacips_risks_arr.shape, bpacips_token_ratios_arr.shape, bpacips_expert_ratios_arr.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "663801bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "\n",
    "# ==========================================\n",
    "# 1. 统计与辅助函数\n",
    "# ==========================================\n",
    "SHADOW_TYPE = 'std'  # 可选: 'std', 'sem', 'ci', 'percentile'\n",
    "CALIB_NUM =0\n",
    "def get_error_bounds(data_array, type='sem',scale=1):\n",
    "    \"\"\"计算均值和阴影上下界\"\"\"\n",
    "    mean_val = np.mean(data_array, axis=0)\n",
    "    sem_val = np.std(data_array, axis=0)\n",
    "    lower = mean_val - sem_val\n",
    "    upper = mean_val + sem_val\n",
    "        \n",
    "    return mean_val*scale, lower*scale, upper*scale\n",
    "\n",
    "def clip_bounds(low, high, min_v=0, max_v=None):\n",
    "    \"\"\"物理截断 (例如比率不能小于0)\"\"\"\n",
    "    low = np.maximum(low, min_v)\n",
    "    if max_v is not None:\n",
    "        high = np.minimum(high, max_v)\n",
    "    return low, high\n",
    "\n",
    "# ==========================================\n",
    "# 2. Compute the curves to plot\n",
    "# ==========================================\n",
    "# Time axis: one point per column of the B-PAC risk array (assumes all arrays share that length)\n",
    "steps = np.arange(bpac_risks_arr.shape[1])\n",
    "\n",
    "# --- Risk (ER) ---\n",
    "naive_risk_m, naive_risk_l, naive_risk_h = get_error_bounds(bpaconaive_risks_arr, SHADOW_TYPE)\n",
    "ips_risk_m, ips_risk_l, ips_risk_h = get_error_bounds(bpacips_risks_arr, SHADOW_TYPE)\n",
    "bpac_risk_m, bpac_risk_l, bpac_risk_h = get_error_bounds(bpac_risks_arr, SHADOW_TYPE)\n",
    "\n",
    "# --- ECP (Expert Call %) ---\n",
    "naive_ecp_m, naive_ecp_l, naive_ecp_h = get_error_bounds(bpaconaive_expert_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "ips_ecp_m, ips_ecp_l, ips_ecp_h = get_error_bounds(bpacips_expert_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "bpac_ecp_m, bpac_ecp_l, bpac_ecp_h = get_error_bounds(bpac_expert_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "\n",
    "# --- TCP (Token Cost %) ---\n",
    "naive_tcp_m, naive_tcp_l, naive_tcp_h = get_error_bounds(bpaconaive_token_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "ips_tcp_m, ips_tcp_l, ips_tcp_h = get_error_bounds(bpacips_token_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "bpac_tcp_m, bpac_tcp_l, bpac_tcp_h = get_error_bounds(bpac_token_ratios_arr, SHADOW_TYPE,scale=100)\n",
    "\n",
    "# The bands are plotted unclipped; clip_bounds is currently not applied to any of them.\n",
    "# NOTE(review): the clipping calls that used to be drafted here capped the ECP bands at 1.0,\n",
    "# but those bands are computed with scale=100; use a cap of 100 if clipping is re-enabled.\n",
    "\n",
    "# ==========================================\n",
    "# 3. Plot (ICML style - 3 methods)\n",
    "# ==========================================\n",
    "plt.rcParams.update({\n",
    "    'font.size': 14,\n",
    "    'axes.labelsize': 20,\n",
    "    'xtick.labelsize': 16,\n",
    "    'ytick.labelsize': 16,\n",
    "    'legend.fontsize': 16,\n",
    "    'lines.linewidth': 3.0,\n",
    "    'axes.grid': True, \n",
    "    'grid.alpha': 0.3, \n",
    "    'grid.linestyle': '--',\n",
    "    'figure.autolayout': True,\n",
    "    'savefig.dpi': 300\n",
    "})\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharex=True)\n",
    "\n",
    "# Colours\n",
    "C_NAIVE = '#1f77b4'  # Blue\n",
    "C_IPS = '#2ca02c'    # Green\n",
    "C_OURS = '#d62728'   # Red\n",
    "C_TGT = 'black'      # Tolerance Line\n",
    "C_WARM = 'gray'      # Warmup Line\n",
    "\n",
    "# --- (a) Risk ---\n",
    "ax = axes[0]\n",
    "# 1. Naive\n",
    "ax.plot(steps, naive_risk_m, color=C_NAIVE, linestyle='--', label='O-Naive')\n",
    "ax.fill_between(steps, naive_risk_l, naive_risk_h, color=C_NAIVE, alpha=0.3)\n",
    "# 2. IPS\n",
    "ax.plot(steps, ips_risk_m, color=C_IPS, linestyle='-.', label='IPS+Hoeff')\n",
    "ax.fill_between(steps, ips_risk_l, ips_risk_h, color=C_IPS, alpha=0.3)\n",
    "# 3. Ours\n",
    "ax.plot(steps, bpac_risk_m, color=C_OURS, linestyle='-', label='B-PAC (Ours)')\n",
    "ax.fill_between(steps, bpac_risk_l, bpac_risk_h, color=C_OURS, alpha=0.3)\n",
    "\n",
    "# Horizontal line at the target risk\n",
    "ax.axhline(y=TARGET_EPSILON, color=C_TGT, linestyle='--', linewidth=2, label='Tolerance')\n",
    "# (disabled) vertical marker at the end of calibration: ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel('ER', fontsize=22)\n",
    "ax.set_xlabel('Time Step')\n",
    "# NOTE(review): max_risk_show is computed but never used; the y-limit below is fixed at 0.32.\n",
    "max_risk_show = max(TARGET_EPSILON * 1.5, np.max(naive_risk_m[-50:]), np.max(bpac_risk_m[-50:])) * 1.1\n",
    "ax.set_ylim(0, 0.32)\n",
    "ax.yaxis.set_major_locator(MultipleLocator(0.04))\n",
    "\n",
    "# Legend\n",
    "ax.legend(loc='best', frameon=True, framealpha=0.95, fontsize=14)\n",
    "\n",
    "# --- (b) ECP ---\n",
    "ax = axes[1]\n",
    "ax.plot(steps, naive_ecp_m, color=C_NAIVE, linestyle='--')\n",
    "ax.fill_between(steps, naive_ecp_l, naive_ecp_h, color=C_NAIVE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, ips_ecp_m, color=C_IPS, linestyle='-.')\n",
    "ax.fill_between(steps, ips_ecp_l, ips_ecp_h, color=C_IPS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_ecp_m, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_ecp_l, bpac_ecp_h, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "# (disabled) ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "ax.set_ylabel('ECP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "ax.set_ylim(0, 100)\n",
    "\n",
    "# --- (c) TCP ---\n",
    "ax = axes[2]\n",
    "ax.plot(steps, naive_tcp_m, color=C_NAIVE, linestyle='--')\n",
    "ax.fill_between(steps, naive_tcp_l, naive_tcp_h, color=C_NAIVE, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, ips_tcp_m, color=C_IPS, linestyle='-.')\n",
    "ax.fill_between(steps, ips_tcp_l, ips_tcp_h, color=C_IPS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "ax.plot(steps, bpac_tcp_m, color=C_OURS, linestyle='-')\n",
    "ax.fill_between(steps, bpac_tcp_l, bpac_tcp_h, color=C_OURS, alpha=0.15, edgecolor='none')\n",
    "\n",
    "# Horizontal reference line at 100, labelled 'Full Expert' (this panel draws no legend)\n",
    "ax.axhline(y=100, color='black', linestyle='-.', linewidth=1.5, label='Full Expert')\n",
    "# (disabled) ax.axvline(x=CALIB_NUM, color=C_WARM, linestyle=':', linewidth=2)\n",
    "\n",
    "ax.set_ylabel('TP(%)')\n",
    "ax.set_xlabel('Time Step')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('qwen2.5ins_magpie.pdf', bbox_inches='tight') # writes the PDF on every run of this cell\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c47c511c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# ---- Baseline: mean / std across runs (axis 0) at every time step ----\n",
    "base_mean_risk, base_std_risk = np.mean(arr_risks, axis=0), np.std(arr_risks, axis=0)\n",
    "base_mean_token, base_std_token = np.mean(arr_token_ratios, axis=0), np.std(arr_token_ratios, axis=0)\n",
    "base_mean_expert, base_std_expert = np.mean(arr_expert_ratios, axis=0), np.std(arr_expert_ratios, axis=0)\n",
    "\n",
    "# Number of time steps, read from the second axis of the BPAC risk matrix\n",
    "N_STEPS = bpac_risks_arr.shape[1]\n",
    "\n",
    "# The baseline keeps a single final threshold (u_hat) per run, so a per-step\n",
    "# sequence is rebuilt here: 0 for the first CALIB_NUM steps, u_hat afterwards.\n",
    "# The per-run values are broadcast across the post-calibration columns.\n",
    "base_threshold_seqs = np.zeros((NUM_RUNS, N_STEPS))\n",
    "base_threshold_seqs[:, CALIB_NUM:] = np.reshape(arr_thresholds[:NUM_RUNS], (-1, 1))\n",
    "\n",
    "base_mean_threshold_seq, base_std_threshold_seq = (\n",
    "    np.mean(base_threshold_seqs, axis=0),\n",
    "    np.std(base_threshold_seqs, axis=0),\n",
    ")\n",
    "\n",
    "# ---- BPAC: the same statistics for every recorded series ----\n",
    "bpac_mean_risk, bpac_std_risk = np.mean(bpac_risks_arr, axis=0), np.std(bpac_risks_arr, axis=0)\n",
    "bpac_mean_token, bpac_std_token = np.mean(bpac_token_ratios_arr, axis=0), np.std(bpac_token_ratios_arr, axis=0)\n",
    "bpac_mean_expert, bpac_std_expert = np.mean(bpac_expert_ratios_arr, axis=0), np.std(bpac_expert_ratios_arr, axis=0)\n",
    "bpac_mean_threshold, bpac_std_threshold = np.mean(bpac_thresholds_arr, axis=0), np.std(bpac_thresholds_arr, axis=0)\n",
    "bpac_mean_wealth, bpac_std_wealth = np.mean(bpac_wealths_arr, axis=0), np.std(bpac_wealths_arr, axis=0)\n",
    "\n",
    "\n",
    "fig, axes = plt.subplots(5, 1, figsize=(12, 24), sharex=True)\n",
    "(ax1, ax2, ax3, ax4, ax5) = axes\n",
    "\n",
    "# Shared x axis: one point per time step\n",
    "steps = np.arange(N_STEPS)\n",
    "\n",
    "# === Panel 1: average risk (risk control) ===\n",
    "ax1.plot(steps, base_mean_risk, label='Baseline', color='blue', linestyle='--')\n",
    "ax1.fill_between(steps, base_mean_risk - base_std_risk, base_mean_risk + base_std_risk, color='blue', alpha=0.1)\n",
    "\n",
    "ax1.plot(steps, bpac_mean_risk, label='BPAC (Ours)', color='red', linewidth=2)\n",
    "ax1.fill_between(steps, bpac_mean_risk - bpac_std_risk, bpac_mean_risk + bpac_std_risk, color='red', alpha=0.1)\n",
    "\n",
    "# Raw f-strings: the LaTeX backslash has to reach matplotlib untouched instead of\n",
    "# forming an invalid Python escape sequence.\n",
    "ax1.axhline(y=TARGET_EPSILON, color='green', linestyle='-', linewidth=2, label=rf'Target $\\epsilon={TARGET_EPSILON}$')\n",
    "ax1.axvline(x=CALIB_NUM, color='gray', linestyle=':', label='Calibration End')\n",
    "\n",
    "ax1.set_ylabel('Avg Risk')\n",
    "ax1.set_title(rf'1. Risk Control (Target $\\epsilon < {TARGET_EPSILON}$)')\n",
    "ax1.legend(loc='upper right')\n",
    "ax1.grid(True, alpha=0.3)\n",
    "ax1.set_ylim(0, TARGET_EPSILON * 3.0)  # zoom in on the region around epsilon\n",
    "\n",
    "# === Panel 2: token ratio (cost saving) ===\n",
    "ax2.plot(steps, base_mean_token, label='Baseline', color='blue', linestyle='--')\n",
    "ax2.fill_between(steps, base_mean_token - base_std_token, base_mean_token + base_std_token, color='blue', alpha=0.1)\n",
    "\n",
    "ax2.plot(steps, bpac_mean_token, label='BPAC', color='red', linewidth=2)\n",
    "ax2.fill_between(steps, bpac_mean_token - bpac_std_token, bpac_mean_token + bpac_std_token, color='red', alpha=0.1)\n",
    "\n",
    "ax2.axhline(y=1.0, color='black', linestyle=':')\n",
    "ax2.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax2.set_ylabel('Token Ratio')\n",
    "ax2.set_title('2. Cost Efficiency (Lower is Better)')\n",
    "ax2.grid(True, alpha=0.3)\n",
    "\n",
    "# === Panel 3: expert call ratio ===\n",
    "ax3.plot(steps, base_mean_expert, label='Baseline', color='blue', linestyle='--')\n",
    "ax3.fill_between(steps, base_mean_expert - base_std_expert, base_mean_expert + base_std_expert, color='blue', alpha=0.1)\n",
    "\n",
    "ax3.plot(steps, bpac_mean_expert, label='BPAC', color='red', linewidth=2)\n",
    "ax3.fill_between(steps, bpac_mean_expert - bpac_std_expert, bpac_mean_expert + bpac_std_expert, color='red', alpha=0.1)\n",
    "\n",
    "ax3.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax3.set_ylabel('Expert Call Ratio')\n",
    "ax3.set_title('3. Expert Usage Behavior')\n",
    "ax3.grid(True, alpha=0.3)\n",
    "ax3.set_ylim(0, 1.1)\n",
    "\n",
    "# === Panel 4: threshold evolution ===\n",
    "# Baseline: 0 during calibration, then a fixed value (raw string for the LaTeX hat)\n",
    "ax4.plot(steps, base_mean_threshold_seq, label=r'Baseline $\\hat{u}$', color='blue', linestyle='--')\n",
    "ax4.fill_between(steps, \n",
    "                 base_mean_threshold_seq - base_std_threshold_seq, \n",
    "                 base_mean_threshold_seq + base_std_threshold_seq, \n",
    "                 color='blue', alpha=0.1)\n",
    "\n",
    "# BPAC: threshold changes over time\n",
    "ax4.plot(steps, bpac_mean_threshold, label='BPAC $u_t$', color='red', linewidth=1.5)\n",
    "ax4.fill_between(steps, \n",
    "                 bpac_mean_threshold - bpac_std_threshold, \n",
    "                 bpac_mean_threshold + bpac_std_threshold, \n",
    "                 color='red', alpha=0.1)\n",
    "\n",
    "ax4.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax4.set_ylabel('Uncertainty Threshold')\n",
    "ax4.set_title('4. Threshold Adaptation (Dynamic vs Fixed)')\n",
    "ax4.legend(loc='upper right')\n",
    "ax4.grid(True, alpha=0.3)\n",
    "ax4.set_ylim(0, 1.0) \n",
    "\n",
    "# === Panel 5: wealth evolution ===\n",
    "# Only the BPAC runs record a wealth series\n",
    "ax5.plot(steps, bpac_mean_wealth, label='BPAC Wealth', color='purple', linewidth=1.5)\n",
    "ax5.fill_between(steps, \n",
    "                 bpac_mean_wealth - bpac_std_wealth, \n",
    "                 bpac_mean_wealth + bpac_std_wealth, \n",
    "                 color='purple', alpha=0.1)\n",
    "\n",
    "ax5.axhline(y=1.0, color='black', linestyle='--', label='Initial Wealth (1.0)')\n",
    "ax5.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "# If wealth grows very quickly, a log scale is easier to read\n",
    "# ax5.set_yscale('log') \n",
    "\n",
    "ax5.set_ylabel('Wealth')\n",
    "ax5.set_xlabel('Time Step')\n",
    "ax5.set_title('5. BPAC Wealth Accumulation')\n",
    "ax5.legend(loc='upper left')\n",
    "ax5.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1060dba4",
   "metadata": {},
   "source": [
    "## gemma3-ins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebf1e781",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Per-sample evaluation results of the two models\n",
    "expert  = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs/magpie/qwen3-think.json\")\n",
    "instant = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs/magpie/gemma3-ins.json\")\n",
    "\n",
    "# Pair rows by DataFrame index (not by session_id); the shared gpt4_score column\n",
    "# comes out as gpt4_score_expert / gpt4_score_instant.\n",
    "df = pd.merge(\n",
    "    expert,\n",
    "    instant[[\"gpt4_score\"]],\n",
    "    left_index=True,\n",
    "    right_index=True,\n",
    "    suffixes=(\"_expert\", \"_instant\"),\n",
    ")\n",
    "\n",
    "# Rows where both models have a score\n",
    "valid_mask = df[[\"gpt4_score_expert\", \"gpt4_score_instant\"]].notna().all(axis=1)\n",
    "\n",
    "# Valid rows where the expert scored at least as high as the instant model\n",
    "better_mask = valid_mask & df[\"gpt4_score_expert\"].ge(df[\"gpt4_score_instant\"])\n",
    "\n",
    "df_selected = df[better_mask]\n",
    "\n",
    "# Go back to the original frames so every column of the selected rows is kept\n",
    "selected_index = df_selected.index\n",
    "expert_data  = expert.loc[selected_index]\n",
    "instant_data = instant.loc[selected_index]\n",
    "\n",
    "# Summary counts\n",
    "total_valid = valid_mask.sum()\n",
    "num_better  = better_mask.sum()\n",
    "\n",
    "print(f\"Total valid paired samples (after remove NaN): {total_valid}\")\n",
    "print(f\"Cases where Think/Expert score >= Instant score: {num_better}\")\n",
    "print(f\"Ratio: {num_better / total_valid:.4f}\")\n",
    "print(f\"Percentage: {num_better / total_valid * 100:.1f}%\")\n",
    "instant_data.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e051e82e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: number of distinct session_id values in the instant-model results\n",
    "a = set(instant[\"session_id\"])\n",
    "len(a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37b72247",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair every expert row with the instant-model row that has the same session_id\n",
    "# and collect the per-sample fields used by the later cells.\n",
    "\n",
    "# Position of the first instant row for each session_id, built once so the loop\n",
    "# does not rescan the whole session_id column for every expert row.\n",
    "instant_pos_by_session = {}\n",
    "for pos, sid in enumerate(instant_data['session_id']):\n",
    "    instant_pos_by_session.setdefault(sid, pos)\n",
    "\n",
    "data_list = []\n",
    "selcet_ids = []  # session_ids that actually ended up in data_list (original variable name kept)\n",
    "for i, row in expert_data.iterrows():\n",
    "    session_id = row['session_id']\n",
    "    pos = instant_pos_by_session.get(session_id)\n",
    "    if pos is None:\n",
    "        print(f\"Instant data missing at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    instant_row = instant_data.iloc[pos]\n",
    "    if instant_row[\"question\"] != row[\"question\"]:\n",
    "        print(f\"Question mismatch at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    # Record the id only after both checks pass, so skipped sessions are not listed.\n",
    "    selcet_ids.append(session_id)\n",
    "    data_list.append({\n",
    "        # Uncertainty is one minus the first entry of the instant model's token_probs.\n",
    "        'uncertainty': 1 - instant_row['token_probs'][0],\n",
    "        'instant_correct': float(instant_row['gpt4_score']),\n",
    "        'expert_correct': float(expert_data.loc[i, \"gpt4_score\"]),\n",
    "        'instant_token': instant_row['gen_token_count'],\n",
    "        'expert_token': expert_data.loc[i, 'gen_token_count'],\n",
    "    })\n",
    "\n",
    "print(f\"Total samples prepared: {len(data_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "935b8cad",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Score gap (instant minus expert) for every paired sample\n",
    "diff = np.array([d[\"instant_correct\"] - d[\"expert_correct\"] for d in data_list])\n",
    "\n",
    "# Tukey fences: samples more than 1.5 * IQR outside the quartiles are dropped\n",
    "q1, q3 = np.percentile(diff, [25, 75])\n",
    "iqr = q3 - q1\n",
    "lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr\n",
    "\n",
    "inside_fences = (diff >= lower) & (diff <= upper)\n",
    "filtered_data_list = [d for d, keep in zip(data_list, inside_fences) if keep]\n",
    "\n",
    "print(\"原始样本数:\", len(data_list))\n",
    "print(\"删除后样本数:\", len(filtered_data_list))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4a1e5d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm \n",
    "\n",
    "# Simulation input: the IQR-filtered samples. They get their own name instead of\n",
    "# rebinding `data_list`, so re-running the filtering cell always starts from the\n",
    "# unfiltered pairs rather than filtering an already-filtered list.\n",
    "sim_data = filtered_data_list\n",
    "y_solved = [item['expert_correct'] for item in sim_data]\n",
    "y_hat_solved = [item['instant_correct'] for item in sim_data]\n",
    "uncertainty_values = [item['uncertainty'] for item in sim_data]\n",
    "y_token_list = [item['expert_token'] for item in sim_data]\n",
    "y_hat_token_list = [item['instant_token'] for item in sim_data]\n",
    "\n",
    "# Settings shared by the baseline and the BPAC runs\n",
    "NUM_RUNS = 100         # number of repetitions, one seed per run\n",
    "TARGET_EPSILON = 0.1   # target risk\n",
    "TARGET_ALPHA = 0.1     # alpha passed to the baseline\n",
    "CALIB_NUM = 500        # baseline calibration-set size\n",
    "TOTAL_NUM = len(y_solved)\n",
    "# Fail early with a clear message instead of a ZeroDivisionError or a ratio above 1.\n",
    "if not 0 < CALIB_NUM <= TOTAL_NUM:\n",
    "    raise ValueError(\n",
    "        f\"CALIB_NUM={CALIB_NUM} must be between 1 and the number of samples ({TOTAL_NUM})\"\n",
    "    )\n",
    "calib_ratio = CALIB_NUM / TOTAL_NUM\n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 1. Baseline: NUM_RUNS repetitions\n",
    "# -------------------------------------------------------------------------\n",
    "\n",
    "# One full per-step series is stored per run\n",
    "all_risks = []\n",
    "all_token_ratios = []\n",
    "all_expert_ratios = []\n",
    "all_u_hats = []\n",
    "\n",
    "print(f\"Starting {NUM_RUNS} runs simulation...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # A different seed (0 .. NUM_RUNS-1) is passed on every run\n",
    "    df, u_hat = run_baseline_continuous(\n",
    "        y_solved, \n",
    "        y_hat_solved, \n",
    "        uncertainty_values, \n",
    "        y_token_list, \n",
    "        y_hat_token_list,\n",
    "        calib_ratio=calib_ratio, \n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=TARGET_ALPHA,\n",
    "        seed=seed\n",
    "    )\n",
    "    \n",
    "    # Keep the whole curve of each metric plus the run's threshold\n",
    "    all_risks.append(df['avg_risk'].values)\n",
    "    all_token_ratios.append(df['token_ratio'].values)\n",
    "    all_expert_ratios.append(df['expert_call_ratio'].values)\n",
    "    all_u_hats.append(u_hat)\n",
    "\n",
    "# Stack the runs into arrays (one row per run) for the mean / std computations\n",
    "arr_risks = np.array(all_risks)\n",
    "arr_token_ratios = np.array(all_token_ratios)\n",
    "arr_expert_ratios = np.array(all_expert_ratios)\n",
    "arr_thresholds = np.array(all_u_hats)\n",
    "\n",
    "print(\"\\nSimulation Finished!\")\n",
    "print(f\"Average u_hat across runs: {np.mean(all_u_hats):.4f} (std: {np.std(all_u_hats):.4f})\")\n",
    "\n",
    "# -------------------------------------------------------------------------\n",
    "# 2. BPAC: NUM_RUNS repetitions on the same samples\n",
    "# -------------------------------------------------------------------------\n",
    "WARM_UP_STEPS = 0      # BPAC warm-up steps (0 here, whereas the baseline uses CALIB_NUM calibration samples)\n",
    "CFG_ALPHA= 0.1         # BPAC alpha\n",
    "CFG_RHO = 0.05         # BPAC rho\n",
    "BPA_beta = 1\n",
    "RHO0= 0.6\n",
    "RHO1 = 0.1\n",
    "begin = 200\n",
    "\n",
    "bpac_risks = []\n",
    "bpac_token_ratios = []\n",
    "bpac_expert_ratios = []\n",
    "bpac_wealths = []\n",
    "bpac_thresholds = []\n",
    "\n",
    "print(f\"Starting BPAC Simulation ({NUM_RUNS} runs)...\")\n",
    "\n",
    "for seed in tqdm(range(NUM_RUNS)):\n",
    "    # Shuffle the samples with a generator seeded by the run index.\n",
    "    # NOTE(review): this is meant to reproduce the baseline's ordering; confirm that\n",
    "    # run_baseline_continuous shuffles the same way for the same seed.\n",
    "    rng = np.random.default_rng(seed)\n",
    "    n = len(sim_data)\n",
    "    indices = np.arange(n)\n",
    "    rng.shuffle(indices)\n",
    "    \n",
    "    # sim_data is a list of dicts, so it is reordered with a comprehension\n",
    "    shuffled_data = [sim_data[i] for i in indices]\n",
    "    \n",
    "    # NOTE(review): the BPACConfig dataclass at the top of the notebook declares\n",
    "    # rho_0 / rho_1 / change_point, not rho0 / rho1 / step. Unless the class is\n",
    "    # redefined before this cell, this call raises TypeError on a fresh kernel.\n",
    "    cfg = BPACConfig(\n",
    "        epsilon=TARGET_EPSILON, \n",
    "        alpha=CFG_ALPHA, \n",
    "        rho=CFG_RHO, \n",
    "        warm_up=WARM_UP_STEPS,\n",
    "        num_thresholds=1001, \n",
    "        beta=BPA_beta, \n",
    "        c_clip=0.9,\n",
    "        rho0=RHO0,\n",
    "        rho1=RHO1,\n",
    "        step = begin\n",
    "    )\n",
    "    \n",
    "    # Run BPAC on the shuffled samples\n",
    "    df_result, _ = run_simulation(shuffled_data, cfg)\n",
    "    \n",
    "    bpac_risks.append(df_result['avg_risk'].values)\n",
    "    bpac_token_ratios.append(df_result['token_ratio'].values)\n",
    "    bpac_expert_ratios.append(df_result['expert_call_ratio'].values)\n",
    "    bpac_wealths.append(df_result['wealth'].values)\n",
    "    bpac_thresholds.append(df_result['threshold'].values)\n",
    "\n",
    "\n",
    "# Stack the runs into arrays (one row per run)\n",
    "bpac_risks_arr = np.array(bpac_risks)\n",
    "bpac_token_ratios_arr = np.array(bpac_token_ratios)\n",
    "bpac_expert_ratios_arr = np.array(bpac_expert_ratios)\n",
    "bpac_wealths_arr = np.array(bpac_wealths)\n",
    "bpac_thresholds_arr = np.array(bpac_thresholds)\n",
    "print(\"BPAC Simulation Finished!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e33d9ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# ---- Baseline: mean / std across runs (axis 0) at every time step ----\n",
    "base_mean_risk, base_std_risk = np.mean(arr_risks, axis=0), np.std(arr_risks, axis=0)\n",
    "base_mean_token, base_std_token = np.mean(arr_token_ratios, axis=0), np.std(arr_token_ratios, axis=0)\n",
    "base_mean_expert, base_std_expert = np.mean(arr_expert_ratios, axis=0), np.std(arr_expert_ratios, axis=0)\n",
    "\n",
    "# Number of time steps, read from the second axis of the BPAC risk matrix\n",
    "N_STEPS = bpac_risks_arr.shape[1]\n",
    "\n",
    "# The baseline keeps a single final threshold (u_hat) per run, so a per-step\n",
    "# sequence is rebuilt here: 0 for the first CALIB_NUM steps, u_hat afterwards.\n",
    "# The per-run values are broadcast across the post-calibration columns.\n",
    "base_threshold_seqs = np.zeros((NUM_RUNS, N_STEPS))\n",
    "base_threshold_seqs[:, CALIB_NUM:] = np.reshape(arr_thresholds[:NUM_RUNS], (-1, 1))\n",
    "\n",
    "base_mean_threshold_seq, base_std_threshold_seq = (\n",
    "    np.mean(base_threshold_seqs, axis=0),\n",
    "    np.std(base_threshold_seqs, axis=0),\n",
    ")\n",
    "\n",
    "# ---- BPAC: the same statistics for every recorded series ----\n",
    "bpac_mean_risk, bpac_std_risk = np.mean(bpac_risks_arr, axis=0), np.std(bpac_risks_arr, axis=0)\n",
    "bpac_mean_token, bpac_std_token = np.mean(bpac_token_ratios_arr, axis=0), np.std(bpac_token_ratios_arr, axis=0)\n",
    "bpac_mean_expert, bpac_std_expert = np.mean(bpac_expert_ratios_arr, axis=0), np.std(bpac_expert_ratios_arr, axis=0)\n",
    "bpac_mean_threshold, bpac_std_threshold = np.mean(bpac_thresholds_arr, axis=0), np.std(bpac_thresholds_arr, axis=0)\n",
    "bpac_mean_wealth, bpac_std_wealth = np.mean(bpac_wealths_arr, axis=0), np.std(bpac_wealths_arr, axis=0)\n",
    "\n",
    "\n",
    "fig, axes = plt.subplots(5, 1, figsize=(12, 24), sharex=True)\n",
    "(ax1, ax2, ax3, ax4, ax5) = axes\n",
    "\n",
    "# Shared x axis: one point per time step\n",
    "steps = np.arange(N_STEPS)\n",
    "\n",
    "# === Panel 1: average risk (risk control) ===\n",
    "ax1.plot(steps, base_mean_risk, label='Baseline', color='blue', linestyle='--')\n",
    "ax1.fill_between(steps, base_mean_risk - base_std_risk, base_mean_risk + base_std_risk, color='blue', alpha=0.1)\n",
    "\n",
    "ax1.plot(steps, bpac_mean_risk, label='BPAC (Ours)', color='red', linewidth=2)\n",
    "ax1.fill_between(steps, bpac_mean_risk - bpac_std_risk, bpac_mean_risk + bpac_std_risk, color='red', alpha=0.1)\n",
    "\n",
    "# Raw f-strings: the LaTeX backslash has to reach matplotlib untouched instead of\n",
    "# forming an invalid Python escape sequence.\n",
    "ax1.axhline(y=TARGET_EPSILON, color='green', linestyle='-', linewidth=2, label=rf'Target $\\epsilon={TARGET_EPSILON}$')\n",
    "ax1.axvline(x=CALIB_NUM, color='gray', linestyle=':', label='Calibration End')\n",
    "\n",
    "ax1.set_ylabel('Avg Risk')\n",
    "ax1.set_title(rf'1. Risk Control (Target $\\epsilon < {TARGET_EPSILON}$)')\n",
    "ax1.legend(loc='upper right')\n",
    "ax1.grid(True, alpha=0.3)\n",
    "ax1.set_ylim(0, TARGET_EPSILON * 3.0)  # zoom in on the region around epsilon\n",
    "\n",
    "# === Panel 2: token ratio (cost saving) ===\n",
    "ax2.plot(steps, base_mean_token, label='Baseline', color='blue', linestyle='--')\n",
    "ax2.fill_between(steps, base_mean_token - base_std_token, base_mean_token + base_std_token, color='blue', alpha=0.1)\n",
    "\n",
    "ax2.plot(steps, bpac_mean_token, label='BPAC', color='red', linewidth=2)\n",
    "ax2.fill_between(steps, bpac_mean_token - bpac_std_token, bpac_mean_token + bpac_std_token, color='red', alpha=0.1)\n",
    "\n",
    "ax2.axhline(y=1.0, color='black', linestyle=':')\n",
    "ax2.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax2.set_ylabel('Token Ratio')\n",
    "ax2.set_title('2. Cost Efficiency (Lower is Better)')\n",
    "ax2.grid(True, alpha=0.3)\n",
    "\n",
    "# === Panel 3: expert call ratio ===\n",
    "ax3.plot(steps, base_mean_expert, label='Baseline', color='blue', linestyle='--')\n",
    "ax3.fill_between(steps, base_mean_expert - base_std_expert, base_mean_expert + base_std_expert, color='blue', alpha=0.1)\n",
    "\n",
    "ax3.plot(steps, bpac_mean_expert, label='BPAC', color='red', linewidth=2)\n",
    "ax3.fill_between(steps, bpac_mean_expert - bpac_std_expert, bpac_mean_expert + bpac_std_expert, color='red', alpha=0.1)\n",
    "\n",
    "ax3.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax3.set_ylabel('Expert Call Ratio')\n",
    "ax3.set_title('3. Expert Usage Behavior')\n",
    "ax3.grid(True, alpha=0.3)\n",
    "ax3.set_ylim(0, 1.1)\n",
    "\n",
    "# === Panel 4: threshold evolution ===\n",
    "# Baseline: 0 during calibration, then a fixed value (raw string for the LaTeX hat)\n",
    "ax4.plot(steps, base_mean_threshold_seq, label=r'Baseline $\\hat{u}$', color='blue', linestyle='--')\n",
    "ax4.fill_between(steps, \n",
    "                 base_mean_threshold_seq - base_std_threshold_seq, \n",
    "                 base_mean_threshold_seq + base_std_threshold_seq, \n",
    "                 color='blue', alpha=0.1)\n",
    "\n",
    "# BPAC: threshold changes over time\n",
    "ax4.plot(steps, bpac_mean_threshold, label='BPAC $u_t$', color='red', linewidth=1.5)\n",
    "ax4.fill_between(steps, \n",
    "                 bpac_mean_threshold - bpac_std_threshold, \n",
    "                 bpac_mean_threshold + bpac_std_threshold, \n",
    "                 color='red', alpha=0.1)\n",
    "\n",
    "ax4.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "ax4.set_ylabel('Uncertainty Threshold')\n",
    "ax4.set_title('4. Threshold Adaptation (Dynamic vs Fixed)')\n",
    "ax4.legend(loc='upper right')\n",
    "ax4.grid(True, alpha=0.3)\n",
    "ax4.set_ylim(0, 1.0) \n",
    "\n",
    "# === Panel 5: wealth evolution ===\n",
    "# Only the BPAC runs record a wealth series\n",
    "ax5.plot(steps, bpac_mean_wealth, label='BPAC Wealth', color='purple', linewidth=1.5)\n",
    "ax5.fill_between(steps, \n",
    "                 bpac_mean_wealth - bpac_std_wealth, \n",
    "                 bpac_mean_wealth + bpac_std_wealth, \n",
    "                 color='purple', alpha=0.1)\n",
    "\n",
    "ax5.axhline(y=1.0, color='black', linestyle='--', label='Initial Wealth (1.0)')\n",
    "ax5.axvline(x=CALIB_NUM, color='gray', linestyle=':')\n",
    "\n",
    "# If wealth grows very quickly, a log scale is easier to read\n",
    "# ax5.set_yscale('log') \n",
    "\n",
    "ax5.set_ylabel('Wealth')\n",
    "ax5.set_xlabel('Time Step')\n",
    "ax5.set_title('5. BPAC Wealth Accumulation')\n",
    "ax5.legend(loc='upper left')\n",
    "ax5.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d89fa4d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "vllm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
