{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "67bd11fb",
   "metadata": {},
   "source": [
    "NoThinking 和 CoD 两种方法"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f428479e",
   "metadata": {},
   "source": [
    "# CoD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92821cd8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CoD system prompt (math variant): terse per-step drafts, answer placed after the #### separator.\n",
    "math_sys = \"\"\"\n",
    "Think step by step, but only keep minimum draft for each thinking step, with 5 words at most. \n",
    "Return the answer at the end of the response after a separator ####.\n",
    "\"\"\"\n",
    "\n",
    "# User-message template; {question} is the placeholder to fill in.\n",
    "math_user = \"\"\"\n",
    "Q: {question}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "174b7e1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "data = pd.read_parquet(\"/home/-/datasets/MATH/train-00000-of-00001-7320a6f3aba8ebd2_5000.parquet\")\n",
    "# Attach a positional identifier of the form \"math#<row number>\" to every row.\n",
    "data[\"id\"] = [f\"math#{row_no}\" for row_no in range(len(data))]\n",
    "data.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c74d0b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CoD system prompt (MMLU-Pro variant): terse per-step drafts, chosen letter placed after ####.\n",
    "mmlupro_sys = \"\"\"\n",
    "Think step by step, but only keep minimum draft for each thinking step, with 5 words at most. \n",
    "Return the choice at the end of the response after a separator ####, e.g., #### A.\n",
    "\"\"\"\n",
    "\n",
    "# User-message template; {question} is the placeholder to fill in.\n",
    "mmlupro_user = \"\"\"\n",
    "Q: {question}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f244fae",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "data1 = pd.read_parquet(\"/home/-/datasets/mmlupro/test-00000-of-00001_5000.parquet\")\n",
    "# Attach a positional identifier of the form \"mmlupro#<row number>\" to every row.\n",
    "data1[\"id\"] = [f\"mmlupro#{row_no}\" for row_no in range(len(data1))]\n",
    "data1.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22d180d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the CoD MMLU-Pro result file (JSON Lines) and show its first row.\n",
    "da1 = pd.read_json(\"/home/-/-/pac/a-all_bench/a-CoD/mmlupro_result.jsonl\", lines=True)\n",
    "da1.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "afe8a0c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the value at row 0, column position 9 of da1.\n",
    "da1.iloc[0,9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d9671cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_prompt(row):\n",
    "    \"\"\"Format a multiple-choice question with lettered options.\n",
    "\n",
    "    The option list is read from row[\"options\"] when that key is present,\n",
    "    otherwise from row[\"choices\"]. Options are labelled A, B, C, ... in\n",
    "    order and placed one per line beneath the question text.\n",
    "    \"\"\"\n",
    "    question_text = row[\"question\"]\n",
    "    if \"options\" in row:\n",
    "        candidates = row[\"options\"]\n",
    "    else:\n",
    "        candidates = row[\"choices\"]\n",
    "\n",
    "    labelled = []\n",
    "    for position, candidate in enumerate(candidates):\n",
    "        letter = chr(ord(\"A\") + position)\n",
    "        labelled.append(f\"{letter}. {candidate}\")\n",
    "\n",
    "    return f\"{question_text}\\n\" + \"\\n\".join(labelled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92879b32",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Render the prompt for the first data1 row as a quick check of build_prompt.\n",
    "build_prompt(data1.iloc[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b743817",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CoD system prompt (BBH variant): terse per-step drafts, answer placed after the #### separator.\n",
    "bbh_sys = \"\"\"\n",
    "Think step by step, but only keep minimum draft for each thinking step, with 5 words at most. \n",
    "Return the answer at the end of the response after a separator ####.\n",
    "\"\"\"\n",
    "\n",
    "# User-message template; {question} is the placeholder to fill in.\n",
    "bbh_user = \"\"\"\n",
    "Q: {question}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "663d2c13",
   "metadata": {},
   "outputs": [],
   "source": [
    "data3 = pd.read_parquet(\"/home/-/datasets/bbh/bbh_all.parquet\")\n",
    "# Attach a positional identifier of the form \"bbh#<row number>\" to every row.\n",
    "data3[\"id\"] = [f\"bbh#{row_no}\" for row_no in range(len(data3))]\n",
    "data3.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0907efdf",
   "metadata": {},
   "source": [
    "# NoThinking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0941a259",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NoThinking prompt: instruction, answer-format rule and the {question} placeholder in one template.\n",
    "Promt_template = \"\"\"\n",
    "Think step by step, and answer the following question. \n",
    "Return the answer at the end of the response after a separator ####, e.g., ####A.\n",
    "Q: {question}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e89c77bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# Read the CoD MMLU-Pro result file (JSON Lines) and show its first row.\n",
    "data4 = pd.read_json(\"/home/-/-/pac/a-all_bench/a-CoD/mmlupro_result.jsonl\", lines=True)\n",
    "data4.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f4878dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the value at row 0, column position 9 of data4.\n",
    "data4.iloc[0,9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "71384384",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the value at row 0, column position 6 of data4.\n",
    "data4.iloc[0,6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3cc8eb66",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Select the GPU before importing the CUDA-backed libraries so the choice is\n",
    "# already in the environment when they initialise.\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"5\"\n",
    "\n",
    "from transformers import AutoTokenizer\n",
    "from vllm import LLM, SamplingParams\n",
    "\n",
    "# Single constant for the checkpoint so tokenizer and engine cannot drift apart.\n",
    "MODEL_PATH = \"/mnt/sharedata/ssd_large/users/-/MODEL/Qwen/Qwen3-4B-Thinking-2507\"\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)\n",
    "llm = LLM(\n",
    "    model=MODEL_PATH,\n",
    "    disable_log_stats=True,\n",
    "    # NOTE(review): the sampling cell asks for max_tokens=8192, more than this\n",
    "    # 2000-token context limit; confirm which of the two is intended.\n",
    "    max_model_len=2000,\n",
    "    trust_remote_code=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a06c63b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from vllm import SamplingParams\n",
    "\n",
    "# Decoding configuration later handed to llm.generate.\n",
    "sampling_params = SamplingParams(temperature=0.7, top_p=0.8, top_k=20, max_tokens=8192)\n",
    "\n",
    "# Reload the MMLU-Pro parquet into data1.\n",
    "data1 = pd.read_parquet(\"/home/-/datasets/mmlupro/test-00000-of-00001_5000.parquet\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf0beeac",
   "metadata": {},
   "outputs": [],
   "source": [
    "UNIFIED_PROMPT_TEMPLATE = \"\"\"\n",
    "Think step by step, and answer the following question. \n",
    "Return the answer at the end of the response after a separator ####, e.g., ####A.\n",
    "Q: {question}\n",
    "\"\"\"\n",
    "\n",
    "# Include the lettered options: the template asks for a letter answer, which\n",
    "# the bare question text alone cannot support.\n",
    "ques = build_prompt(data1.iloc[0])\n",
    "user_content = UNIFIED_PROMPT_TEMPLATE.format(question=ques)\n",
    "\n",
    "messages = [\n",
    "    {\"role\": \"user\", \"content\": user_content}\n",
    "]\n",
    "a = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking = False)\n",
    "prompt = a.rstrip()  # drop trailing whitespace so the appended text keeps the format tidy\n",
    "\n",
    "# Append a stub thought and the closing </think> tag to the rendered prompt.\n",
    "prompt += (\n",
    "    \"\\n\"\n",
    "    \"Okay, I think I have finished thinking.\\n\"\n",
    "    \"</think>\\n\"\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb567577",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the fully assembled prompt string.\n",
    "print(prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b33aefe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate a completion for the single prompt using sampling_params.\n",
    "llm.generate([prompt], sampling_params=sampling_params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5d33771b",
   "metadata": {},
   "source": [
    "# Evaluation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f40d262c",
   "metadata": {},
   "source": [
    "## mmlu pro"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "081232eb",
   "metadata": {},
   "source": [
    "### Thinking 数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10f9f8c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Main purpose: examine how many tokens are saved.\n",
    "import pandas as pd\n",
    "\n",
    "expert_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/mmlupro/qwen3-think.json\")\n",
    "expert_data2 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/mmlupro/qwen3-think2224.2224--1.json\")\n",
    "# Stack both files, skipping the first row of the second one, and renumber the index.\n",
    "expert_data3 = pd.concat([expert_data1, expert_data2.iloc[1:]], ignore_index=True)\n",
    "# Keep only the rows whose \"matched\" value equals True.\n",
    "expert_data = expert_data3.loc[expert_data3[\"matched\"] == True]\n",
    "session_ids = list(expert_data[\"session_id\"])\n",
    "\n",
    "instant_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/mmlupro/qwen3-ins1.json\")\n",
    "# Restrict the qwen3-ins1 results to those same sessions.\n",
    "instant_data = instant_data1.loc[instant_data1[\"session_id\"].isin(session_ids)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f7b7c02",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the first row of instant_data.\n",
    "instant_data.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d6fd00e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map each session_id to the position of its first row in instant_data, so the\n",
    "# loop does one dictionary lookup per expert row instead of rescanning the frame.\n",
    "first_instant_pos = {}\n",
    "for pos, sid in enumerate(instant_data[\"session_id\"].values):\n",
    "    first_instant_pos.setdefault(sid, pos)\n",
    "\n",
    "data_list = []\n",
    "selcet_ids = []\n",
    "for i, row in expert_data.iterrows():\n",
    "    session_id = row[\"session_id\"]\n",
    "    if session_id not in first_instant_pos:\n",
    "        print(f\"Instant data missing at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    instant_row = instant_data.iloc[first_instant_pos[session_id]]\n",
    "    selcet_ids.append(session_id)\n",
    "    # Skip pairs whose question text disagrees between the two runs.\n",
    "    if instant_row[\"question\"] != row[\"question\"]:\n",
    "        print(f\"Question mismatch at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    data_list.append({\n",
    "        # One minus the first entry of token_probs.\n",
    "        \"uncertainty\": 1 - instant_row[\"token_probs\"][0],\n",
    "        # The string \"No answer extracted\" counts as incorrect.\n",
    "        \"instant_correct\": int(instant_row[\"matched\"]) if instant_row[\"matched\"] != \"No answer extracted\" else 0,\n",
    "        \"expert_correct\": int(expert_data.loc[i, \"matched\"]),\n",
    "        \"instant_token\": instant_row[\"gen_token_count\"],\n",
    "        \"expert_token\": expert_data.loc[i, \"gen_token_count\"],\n",
    "        \"ques_id\": expert_data.loc[i, \"question_id\"],\n",
    "    })\n",
    "\n",
    "print(f\"Total samples prepared: {len(data_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3968cccf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the ques_id of every prepared sample, in data_list order.\n",
    "need_ques_ids = [sample[\"ques_id\"] for sample in data_list]\n",
    "len(need_ques_ids)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4bf49b46",
   "metadata": {},
   "source": [
    "### NoThinking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "034373e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# Read the NoThinking MMLU-Pro result file (JSON Lines) and show its first row.\n",
    "data_mmlu = pd.read_json(\"/home/-/-/pac/a-all_bench/a-NoThinking/mmlupro_result.jsonl\", lines=True)\n",
    "data_mmlu.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3017eabb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_answer(output):\n",
    "    \"\"\"Return the stripped text after the last '####' separator, or None.\n",
    "\n",
    "    None is returned when the separator is absent, and also when output is\n",
    "    not a string (for example None or NaN), so such rows are treated as\n",
    "    unanswered instead of raising a TypeError.\n",
    "    \"\"\"\n",
    "    if not isinstance(output, str) or \"####\" not in output:\n",
    "        return None\n",
    "    return output.split(\"####\")[-1].strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3fce20d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score every row at once instead of writing cell by cell inside iterrows:\n",
    "# take the text after the last #### and compare it with the gold answer.\n",
    "data_mmlu2 = data_mmlu.copy()\n",
    "data_mmlu2[\"extracted_answer\"] = data_mmlu2[\"output\"].map(extract_answer)\n",
    "# Rows without an extracted answer compare unequal and are marked False.\n",
    "data_mmlu2[\"matched\"] = data_mmlu2[\"extracted_answer\"] == data_mmlu2[\"answer\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0b6822a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count how many rows are marked matched versus not matched.\n",
    "data_mmlu2[\"matched\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5493fa5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose question_id appears in need_ques_ids.\n",
    "data_mmlu2_filtered = data_mmlu2.loc[data_mmlu2[\"question_id\"].isin(need_ques_ids)]\n",
    "data_mmlu2_filtered.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fdbaa40",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the first row of the filtered frame.\n",
    "data_mmlu2_filtered.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e803153f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report every position where the filtered frame's question_id differs from need_ques_ids.\n",
    "for i, wanted_id in enumerate(need_ques_ids):\n",
    "    found_id = data_mmlu2_filtered.iloc[i][\"question_id\"]\n",
    "    if wanted_id != found_id:\n",
    "        print(f\"Mismatch at index {i}: {wanted_id} vs {found_id}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86dad8ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the first prepared sample.\n",
    "data_list[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6e9ad66",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token counts of the NoThinking run on the selected questions.\n",
    "thining_tokens = data_mmlu2_filtered[\"token_count\"].tolist()\n",
    "\n",
    "# Token counts of the thinking run for the prepared samples.\n",
    "ori_tokens = [item['expert_token'] for item in data_list]\n",
    "assert len(thining_tokens) == len(ori_tokens) \n",
    "# Tokens used relative to the thinking run, so a smaller value means more saved.\n",
    "save_ratio = sum(thining_tokens) / sum(ori_tokens)\n",
    "print(f\"Token saving ratio: {save_ratio:.4f}\")\n",
    "\n",
    "# loss is one minus the share of selected rows marked matched, so it is\n",
    "# labelled as a loss rather than as an accuracy.\n",
    "loss = 1 - sum(data_mmlu2_filtered[\"matched\"].tolist()) / len(data_mmlu2_filtered)\n",
    "print(f\"Loss after thinning: {loss:.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "830fc5ad",
   "metadata": {},
   "source": [
    "### CoD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "428e22a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the CoD MMLU-Pro result file (JSON Lines).\n",
    "data2 = pd.read_json(\"/home/-/-/pac/a-all_bench/a-CoD/mmlupro_result.jsonl\", lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1efd12d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_answer(output):\n",
    "    \"\"\"Return the stripped text after the last '####' separator, or None.\n",
    "\n",
    "    None is returned when the separator is absent, and also when output is\n",
    "    not a string (for example None or NaN), so such rows are treated as\n",
    "    unanswered instead of raising a TypeError.\n",
    "    \"\"\"\n",
    "    if not isinstance(output, str) or \"####\" not in output:\n",
    "        return None\n",
    "    return output.split(\"####\")[-1].strip()\n",
    "\n",
    "\n",
    "# Score every row at once instead of writing cell by cell inside iterrows.\n",
    "data2_2 = data2.copy()\n",
    "data2_2[\"extracted_answer\"] = data2_2[\"output\"].map(extract_answer)\n",
    "# Rows without an extracted answer compare unequal and are marked False.\n",
    "data2_2[\"matched\"] = data2_2[\"extracted_answer\"] == data2_2[\"answer\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c08baf85",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count how many rows are marked matched versus not matched.\n",
    "data2_2[\"matched\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2aaeac8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose question_id appears in need_ques_ids.\n",
    "data2_2_filtered = data2_2.loc[data2_2[\"question_id\"].isin(need_ques_ids)]\n",
    "data2_2_filtered.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00cffbb8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token counts of the CoD run on the selected questions versus the thinking run.\n",
    "cod_tokens = list(data2_2_filtered[\"token_count\"])\n",
    "ori_tokens = [sample[\"expert_token\"] for sample in data_list]\n",
    "assert len(cod_tokens) == len(ori_tokens)\n",
    "save_ratio = sum(cod_tokens) / sum(ori_tokens)\n",
    "print(f\"Token saving ratio: {save_ratio:.4f}\")\n",
    "# loss is one minus the share of selected rows marked matched.\n",
    "loss = 1 - sum(data2_2_filtered[\"matched\"]) / len(data2_2_filtered)\n",
    "print(f\"loss after thinning: {loss:.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0299483e",
   "metadata": {},
   "source": [
    "## math"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f6e0942",
   "metadata": {},
   "source": [
    "### Thinking 数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b932c74b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "expert_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/math/qwen3-think.json\")\n",
    "expert_data2 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/math/qwen3-think2648.2648--1.json\")\n",
    "# Stack both files, skipping the first row of the second one, and renumber the index.\n",
    "expert_data3 = pd.concat([expert_data1, expert_data2.iloc[1:]], ignore_index=True)\n",
    "# Keep only the rows whose \"matched\" value equals True.\n",
    "expert_data = expert_data3.loc[expert_data3[\"matched\"] == True]\n",
    "session_ids = list(expert_data[\"session_id\"])\n",
    "\n",
    "instant_data1 = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs_parsed/math/qwen3-ins1.json\")\n",
    "# Restrict the qwen3-ins1 results to those same sessions.\n",
    "instant_data = instant_data1.loc[instant_data1[\"session_id\"].isin(session_ids)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09fa2ce4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the first row of instant_data.\n",
    "instant_data.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9dc403b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List the column labels of instant_data.\n",
    "instant_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9305d1c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map each session_id to the position of its first row in instant_data, so the\n",
    "# loop does one dictionary lookup per expert row instead of rescanning the frame.\n",
    "first_instant_pos = {}\n",
    "for pos, sid in enumerate(instant_data[\"session_id\"].values):\n",
    "    first_instant_pos.setdefault(sid, pos)\n",
    "\n",
    "data_list = []\n",
    "selcet_ids = []\n",
    "for i, row in expert_data.iterrows():\n",
    "    session_id = row[\"session_id\"]\n",
    "    if session_id not in first_instant_pos:\n",
    "        print(f\"Instant data missing at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    instant_row = instant_data.iloc[first_instant_pos[session_id]]\n",
    "    selcet_ids.append(session_id)\n",
    "    # Skip pairs whose problem text disagrees between the two runs.\n",
    "    if instant_row[\"problem\"] != row[\"problem\"]:\n",
    "        print(f\"Question mismatch at index:{i}, session_id:{session_id}\")\n",
    "        continue\n",
    "    data_list.append({\n",
    "        # One minus the first entry of token_probs.\n",
    "        \"uncertainty\": 1 - instant_row[\"token_probs\"][0],\n",
    "        # The string \"No answer extracted\" counts as incorrect.\n",
    "        \"instant_correct\": int(instant_row[\"matched\"]) if instant_row[\"matched\"] != \"No answer extracted\" else 0,\n",
    "        \"expert_correct\": int(expert_data.loc[i, \"matched\"]),\n",
    "        \"instant_token\": instant_row[\"gen_token_count\"],\n",
    "        \"expert_token\": expert_data.loc[i, \"gen_token_count\"],\n",
    "        # For this dataset the session_id doubles as the question id.\n",
    "        \"ques_id\": expert_data.loc[i, \"session_id\"],\n",
    "    })\n",
    "\n",
    "print(f\"Total samples prepared: {len(data_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7767b5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the ques_id of every prepared sample, in data_list order.\n",
    "need_ques_ids = [sample[\"ques_id\"] for sample in data_list]\n",
    "len(need_ques_ids)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f4fd71ab",
   "metadata": {},
   "source": [
    "### NoThinking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95b84a95",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# Read the NoThinking MATH result file (JSON Lines) and show its first row.\n",
    "data6 = pd.read_json(\"/home/-/-/pac/a-all_bench/a-NoThinking/math_result.jsonl\", lines=True)\n",
    "data6.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5bb94bad",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_boxed_answer(text):\n",
    "    \"\"\"Return the contents of the last boxed{...} LaTeX group in text.\n",
    "\n",
    "    Nested braces inside the group are kept. When text contains no such\n",
    "    group it is returned unchanged (left for later extraction steps); when\n",
    "    the group is never closed, everything after its opening brace is returned.\n",
    "    \"\"\"\n",
    "    marker = \"\\\\boxed{\"\n",
    "    # Use the last occurrence: reasoning steps may be boxed too, the answer usually comes last.\n",
    "    start = text.rfind(marker)\n",
    "    if start == -1:\n",
    "        return text\n",
    "\n",
    "    body_start = start + len(marker)\n",
    "    depth = 0  # number of currently open nested braces\n",
    "    for offset, char in enumerate(text[body_start:]):\n",
    "        if char == \"{\":\n",
    "            depth += 1\n",
    "        elif char == \"}\":\n",
    "            if depth == 0:\n",
    "                # This brace closes the boxed group itself.\n",
    "                return text[body_start:body_start + offset]\n",
    "            depth -= 1\n",
    "\n",
    "    # Unbalanced input: return whatever followed the opening brace.\n",
    "    return text[body_start:]\n",
    "\n",
    "\n",
    "def extract_answer(output):\n",
    "    \"\"\"Return the stripped text after the last '####' separator, or None.\n",
    "\n",
    "    None is returned when the separator is absent, and also when output is\n",
    "    not a string (for example None or NaN), so such rows are treated as\n",
    "    unanswered instead of raising a TypeError.\n",
    "    \"\"\"\n",
    "    if not isinstance(output, str) or \"####\" not in output:\n",
    "        return None\n",
    "    return output.split(\"####\")[-1].strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4dbc41a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from math_verify import verify,parse\n",
    "\n",
    "# Grade each row: wrap the prediction and the gold answer in \\boxed{...},\n",
    "# parse both and record whether verify accepts the pair.\n",
    "for i, row in data6.iterrows():\n",
    "    predicted = extract_answer(row[\"output\"])\n",
    "    data6.at[i, \"extracted_answer\"] = predicted\n",
    "    reference = extract_boxed_answer(row[\"solution\"])\n",
    "    parsed_pred = parse(\"\\\\boxed{\" + str(predicted) + \"}\")\n",
    "    parsed_gold = parse(\"\\\\boxed{\" + str(reference) + \"}\")\n",
    "    data6.at[i, \"matched\"] = bool(verify(parsed_gold, parsed_pred))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5c9ae4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count how many rows are marked matched versus not matched.\n",
    "data6[\"matched\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a3f1857",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the first row of data6, now including the grading columns.\n",
    "data6.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c489fd05",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose id appears in need_ques_ids.\n",
    "data6_filtered = data6.loc[data6[\"id\"].isin(need_ques_ids)]\n",
    "data6_filtered.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "435f1a45",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token counts of the NoThinking run on the selected problems versus the thinking run.\n",
    "thining_tokens = list(data6_filtered[\"token_count\"])\n",
    "ori_tokens = [sample[\"expert_token\"] for sample in data_list]\n",
    "assert len(thining_tokens) == len(ori_tokens)\n",
    "save_ratio = sum(thining_tokens) / sum(ori_tokens)\n",
    "print(f\"Token saving ratio: {save_ratio:.4f}\")\n",
    "# loss is one minus the share of selected rows marked matched.\n",
    "loss = 1 - sum(data6_filtered[\"matched\"]) / len(data6_filtered)\n",
    "print(f\"Loss after thinning: {loss:.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31d2c586",
   "metadata": {},
   "source": [
    "### CoD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43f1ad62",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the CoD MATH result file (JSON Lines).\n",
    "data_cod = pd.read_json(\"/home/-/-/pac/a-all_bench/a-CoD/math_result.jsonl\",lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9338aafa",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_boxed_answer(text):\n",
    "    \"\"\"Return the contents of the last boxed{...} LaTeX group in text.\n",
    "\n",
    "    Nested braces inside the group are kept. When text contains no such\n",
    "    group it is returned unchanged (left for later extraction steps); when\n",
    "    the group is never closed, everything after its opening brace is returned.\n",
    "    \"\"\"\n",
    "    marker = \"\\\\boxed{\"\n",
    "    # Use the last occurrence: reasoning steps may be boxed too, the answer usually comes last.\n",
    "    start = text.rfind(marker)\n",
    "    if start == -1:\n",
    "        return text\n",
    "\n",
    "    body_start = start + len(marker)\n",
    "    depth = 0  # number of currently open nested braces\n",
    "    for offset, char in enumerate(text[body_start:]):\n",
    "        if char == \"{\":\n",
    "            depth += 1\n",
    "        elif char == \"}\":\n",
    "            if depth == 0:\n",
    "                # This brace closes the boxed group itself.\n",
    "                return text[body_start:body_start + offset]\n",
    "            depth -= 1\n",
    "\n",
    "    # Unbalanced input: return whatever followed the opening brace.\n",
    "    return text[body_start:]\n",
    "\n",
    "\n",
    "def extract_answer(output):\n",
    "    \"\"\"Return the stripped text after the last '####' separator, or None.\n",
    "\n",
    "    None is returned when the separator is absent, and also when output is\n",
    "    not a string (for example None or NaN), so such rows are treated as\n",
    "    unanswered instead of raising a TypeError.\n",
    "    \"\"\"\n",
    "    if not isinstance(output, str) or \"####\" not in output:\n",
    "        return None\n",
    "    return output.split(\"####\")[-1].strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ea1b748",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Grade each row: wrap the prediction and the gold answer in \\boxed{...},\n",
    "# parse both and record whether verify accepts the pair.\n",
    "for i, row in data_cod.iterrows():\n",
    "    predicted = extract_answer(row[\"output\"])\n",
    "    data_cod.at[i, \"extracted_answer\"] = predicted\n",
    "    reference = extract_boxed_answer(row[\"solution\"])\n",
    "    parsed_pred = parse(\"\\\\boxed{\" + str(predicted) + \"}\")\n",
    "    parsed_gold = parse(\"\\\\boxed{\" + str(reference) + \"}\")\n",
    "    data_cod.at[i, \"matched\"] = bool(verify(parsed_gold, parsed_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "752d427c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count how many rows are marked matched versus not matched.\n",
    "data_cod[\"matched\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5031dbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose id appears in need_ques_ids.\n",
    "data_cod_filtered = data_cod.loc[data_cod[\"id\"].isin(need_ques_ids)]\n",
    "data_cod_filtered.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28d991a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token counts of the CoD run on the selected problems versus the thinking run.\n",
    "cod_tokens = list(data_cod_filtered[\"token_count\"])\n",
    "ori_tokens = [sample[\"expert_token\"] for sample in data_list]\n",
    "assert len(cod_tokens) == len(ori_tokens)\n",
    "save_ratio = sum(cod_tokens) / sum(ori_tokens)\n",
    "print(f\"Token saving ratio: {save_ratio:.4f}\")\n",
    "# loss is one minus the share of selected rows marked matched.\n",
    "loss = 1 - sum(data_cod_filtered[\"matched\"]) / len(data_cod_filtered)\n",
    "print(f\"Loss after thinning: {loss:.4f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "vllm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
