{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "eb80b7ca",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Grade not found in 'content'. Using 'reasoning_content' to extract grade = B.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>correct</th>\n",
       "      <th>incorrect</th>\n",
       "      <th>refusal</th>\n",
       "      <th>total_input_tokens</th>\n",
       "      <th>total_output_tokens</th>\n",
       "      <th>normalized_reward</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>gpt-4o-mini-2024-07-18</th>\n",
       "      <td>107</td>\n",
       "      <td>843</td>\n",
       "      <td>50</td>\n",
       "      <td>57478</td>\n",
       "      <td>174334</td>\n",
       "      <td>-3.265</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gpt-4o-2024-08-06</th>\n",
       "      <td>339</td>\n",
       "      <td>584</td>\n",
       "      <td>77</td>\n",
       "      <td>57478</td>\n",
       "      <td>235465</td>\n",
       "      <td>-1.997</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>claude-3-5-haiku-20241022</th>\n",
       "      <td>65</td>\n",
       "      <td>202</td>\n",
       "      <td>733</td>\n",
       "      <td>65285</td>\n",
       "      <td>127239</td>\n",
       "      <td>-0.743</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>claude-sonnet-4-5-20250929 (no thinking)</th>\n",
       "      <td>284</td>\n",
       "      <td>468</td>\n",
       "      <td>248</td>\n",
       "      <td>65285</td>\n",
       "      <td>263960</td>\n",
       "      <td>-1.588</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gemini-2.5-flash (no thinking)</th>\n",
       "      <td>299</td>\n",
       "      <td>673</td>\n",
       "      <td>28</td>\n",
       "      <td>55214</td>\n",
       "      <td>366763</td>\n",
       "      <td>-2.393</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gemini-2.5-flash (reasoning_effort='medium')</th>\n",
       "      <td>307</td>\n",
       "      <td>684</td>\n",
       "      <td>9</td>\n",
       "      <td>55214</td>\n",
       "      <td>741656</td>\n",
       "      <td>-2.429</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gemini-2.5-pro (reasoning_effort='medium')</th>\n",
       "      <td>557</td>\n",
       "      <td>442</td>\n",
       "      <td>1</td>\n",
       "      <td>55214</td>\n",
       "      <td>1309309</td>\n",
       "      <td>-1.211</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>claude-sonnet-4-5-20250929 (thinking_budget=8192)</th>\n",
       "      <td>301</td>\n",
       "      <td>454</td>\n",
       "      <td>245</td>\n",
       "      <td>94285</td>\n",
       "      <td>579993</td>\n",
       "      <td>-1.515</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   correct  incorrect  \\\n",
       "gpt-4o-mini-2024-07-18                                 107        843   \n",
       "gpt-4o-2024-08-06                                      339        584   \n",
       "claude-3-5-haiku-20241022                               65        202   \n",
       "claude-sonnet-4-5-20250929 (no thinking)               284        468   \n",
       "gemini-2.5-flash (no thinking)                         299        673   \n",
       "gemini-2.5-flash (reasoning_effort='medium')           307        684   \n",
       "gemini-2.5-pro (reasoning_effort='medium')             557        442   \n",
       "claude-sonnet-4-5-20250929 (thinking_budget=8192)      301        454   \n",
       "\n",
       "                                                   refusal  \\\n",
       "gpt-4o-mini-2024-07-18                                  50   \n",
       "gpt-4o-2024-08-06                                       77   \n",
       "claude-3-5-haiku-20241022                              733   \n",
       "claude-sonnet-4-5-20250929 (no thinking)               248   \n",
       "gemini-2.5-flash (no thinking)                          28   \n",
       "gemini-2.5-flash (reasoning_effort='medium')             9   \n",
       "gemini-2.5-pro (reasoning_effort='medium')               1   \n",
       "claude-sonnet-4-5-20250929 (thinking_budget=8192)      245   \n",
       "\n",
       "                                                   total_input_tokens  \\\n",
       "gpt-4o-mini-2024-07-18                                          57478   \n",
       "gpt-4o-2024-08-06                                               57478   \n",
       "claude-3-5-haiku-20241022                                       65285   \n",
       "claude-sonnet-4-5-20250929 (no thinking)                        65285   \n",
       "gemini-2.5-flash (no thinking)                                  55214   \n",
       "gemini-2.5-flash (reasoning_effort='medium')                    55214   \n",
       "gemini-2.5-pro (reasoning_effort='medium')                      55214   \n",
       "claude-sonnet-4-5-20250929 (thinking_budget=8192)               94285   \n",
       "\n",
       "                                                   total_output_tokens  \\\n",
       "gpt-4o-mini-2024-07-18                                          174334   \n",
       "gpt-4o-2024-08-06                                               235465   \n",
       "claude-3-5-haiku-20241022                                       127239   \n",
       "claude-sonnet-4-5-20250929 (no thinking)                        263960   \n",
       "gemini-2.5-flash (no thinking)                                  366763   \n",
       "gemini-2.5-flash (reasoning_effort='medium')                    741656   \n",
       "gemini-2.5-pro (reasoning_effort='medium')                     1309309   \n",
       "claude-sonnet-4-5-20250929 (thinking_budget=8192)               579993   \n",
       "\n",
       "                                                   normalized_reward  \n",
       "gpt-4o-mini-2024-07-18                                        -3.265  \n",
       "gpt-4o-2024-08-06                                             -1.997  \n",
       "claude-3-5-haiku-20241022                                     -0.743  \n",
       "claude-sonnet-4-5-20250929 (no thinking)                      -1.588  \n",
       "gemini-2.5-flash (no thinking)                                -2.393  \n",
       "gemini-2.5-flash (reasoning_effort='medium')                  -2.429  \n",
       "gemini-2.5-pro (reasoning_effort='medium')                    -1.211  \n",
       "claude-sonnet-4-5-20250929 (thinking_budget=8192)             -1.515  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "from typing import Any\n",
    "from collections import Counter\n",
    "from utils import get_logging_prefix\n",
    "from run_freeform_qa import read_grades\n",
    "from evaluation_metrics import calc_normalized_reward\n",
    "\n",
    "def read_stats(\n",
    "    dataset: str,\n",
    "    model: str,\n",
    "    prompt_type: str,\n",
    "    reasoning_effort: str | None,\n",
    "    thinking_budget: int | None,\n",
    "    r_cor: str,\n",
    "    r_inc: str,\n",
    "    r_ref: str,\n",
    "    grader_model: str\n",
    ") -> dict[str, Any]:\n",
    "    \"\"\"Collect grade counts, token usage and the normalized reward for one run.\n",
    "\n",
    "    Args:\n",
    "        dataset: Dataset name, forwarded to ``get_logging_prefix``.\n",
    "        model: Model identifier, forwarded to ``get_logging_prefix``.\n",
    "        prompt_type: Prompt variant, forwarded to ``get_logging_prefix``.\n",
    "        reasoning_effort: Reasoning-effort setting of the run; may be ``None``.\n",
    "        thinking_budget: Thinking budget of the run; may be ``None``.\n",
    "        r_cor: Reward for a correct answer, as a string. Forwarded unchanged to\n",
    "            ``get_logging_prefix`` and converted to ``float`` for the reward.\n",
    "        r_inc: Reward for an incorrect answer, handled like ``r_cor``.\n",
    "        r_ref: Reward for a refusal, handled like ``r_cor``.\n",
    "        grader_model: Grader identifier; only the part after the last ``/`` is\n",
    "            used in the evaluation file name.\n",
    "\n",
    "    Returns:\n",
    "        Dict with the keys ``correct``, ``incorrect``, ``refusal``,\n",
    "        ``total_input_tokens``, ``total_output_tokens`` and ``normalized_reward``.\n",
    "    \"\"\"\n",
    "    logging_prefix = get_logging_prefix(\n",
    "        dataset=dataset,\n",
    "        model=model,\n",
    "        prompt_type=prompt_type,\n",
    "        r_cor=r_cor,\n",
    "        r_inc=r_inc,\n",
    "        r_ref=r_ref,\n",
    "        reasoning_effort=reasoning_effort,\n",
    "        thinking_budget=thinking_budget,\n",
    "    )\n",
    "    stats = dict()\n",
    "    # Evaluation stats: grades \"A\" / \"B\" / \"C\" are counted as correct / incorrect / refusal.\n",
    "    # Any other grade value is not counted in these three buckets.\n",
    "    eval_output_jsonl = f\"{logging_prefix}__eval-results-by-{grader_model.split('/')[-1]}.jsonl\"\n",
    "    grades = read_grades(eval_output_jsonl)\n",
    "    grades_counter = Counter(grades)\n",
    "    stats[\"correct\"] = grades_counter[\"A\"]\n",
    "    stats[\"incorrect\"] = grades_counter[\"B\"]\n",
    "    stats[\"refusal\"] = grades_counter[\"C\"]\n",
    "    # Generation stats\n",
    "    raw_output_json = f\"{logging_prefix}__raw-output.json\"\n",
    "    # Read as UTF-8 explicitly instead of relying on the platform default encoding.\n",
    "    with open(raw_output_json, \"r\", encoding=\"utf-8\") as f:\n",
    "        data = json.load(f)\n",
    "    stats[\"total_input_tokens\"] = data[\"total_prompt_tokens\"]\n",
    "    # Output tokens are derived as total tokens minus prompt tokens.\n",
    "    stats[\"total_output_tokens\"] = data[\"total_total_tokens\"] - data[\"total_prompt_tokens\"]\n",
    "    # Final metric\n",
    "    stats[\"normalized_reward\"] = calc_normalized_reward(\n",
    "        n_cor=stats[\"correct\"],\n",
    "        n_inc=stats[\"incorrect\"],\n",
    "        n_ref=stats[\"refusal\"],\n",
    "        r_cor=float(r_cor),\n",
    "        r_inc=float(r_inc),\n",
    "        r_ref=float(r_ref),\n",
    "    )\n",
    "    return stats\n",
    "\n",
    "# Settings shared by every run summarised in this table.\n",
    "grader_model = \"openai/gpt-oss-20b\"\n",
    "dataset = \"simple_qa_verified\"\n",
    "prompt_type = \"no_risk\"\n",
    "r_cor, r_inc, r_ref = \"1\", \"-4\", \"0\"\n",
    "\n",
    "# Display name -> reasoning settings passed on to read_stats.\n",
    "name2config = {\n",
    "    # Non-reasoning models or modes\n",
    "    \"gpt-4o-mini-2024-07-18\": dict(reasoning_effort=None, thinking_budget=None),\n",
    "    \"gpt-4o-2024-08-06\": dict(reasoning_effort=None, thinking_budget=None),\n",
    "    \"claude-3-5-haiku-20241022\": dict(reasoning_effort=None, thinking_budget=None),\n",
    "    \"claude-sonnet-4-5-20250929 (no thinking)\": dict(reasoning_effort=None, thinking_budget=None),\n",
    "    \"gemini-2.5-flash (no thinking)\": dict(reasoning_effort=\"none\", thinking_budget=None),\n",
    "    # Reasoning models or modes\n",
    "    \"gemini-2.5-flash (reasoning_effort='medium')\": dict(reasoning_effort=\"medium\", thinking_budget=None),\n",
    "    \"gemini-2.5-pro (reasoning_effort='medium')\": dict(reasoning_effort=\"medium\", thinking_budget=None),\n",
    "    \"claude-sonnet-4-5-20250929 (thinking_budget=8192)\": dict(reasoning_effort=None, thinking_budget=8192),\n",
    "}\n",
    "\n",
    "# The model id handed to read_stats is the display name up to the first \"(\", stripped.\n",
    "name2stats = {\n",
    "    name: read_stats(\n",
    "        dataset,\n",
    "        name.split(\"(\")[0].strip(),\n",
    "        prompt_type,\n",
    "        r_cor=r_cor,\n",
    "        r_inc=r_inc,\n",
    "        r_ref=r_ref,\n",
    "        grader_model=grader_model,\n",
    "        **config,\n",
    "    )\n",
    "    for name, config in name2config.items()\n",
    "}\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# One row per display name; cast the count/token columns to int and the reward to float.\n",
    "df = (\n",
    "    pd.DataFrame(name2stats)\n",
    "    .transpose()\n",
    "    .astype(\n",
    "        {\n",
    "            \"correct\": int,\n",
    "            \"incorrect\": int,\n",
    "            \"refusal\": int,\n",
    "            \"total_input_tokens\": int,\n",
    "            \"total_output_tokens\": int,\n",
    "            \"normalized_reward\": float,\n",
    "        }\n",
    "    )\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e5458358",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gpt-4o-mini-2024-07-18</th>\n",
       "      <th>gpt-4o-2024-08-06</th>\n",
       "      <th>claude-3-5-haiku-20241022</th>\n",
       "      <th>claude-sonnet-4-5-20250929 (no thinking)</th>\n",
       "      <th>gemini-2.5-flash (no thinking)</th>\n",
       "      <th>gemini-2.5-flash (reasoning_effort='medium')</th>\n",
       "      <th>gemini-2.5-pro (reasoning_effort='medium')</th>\n",
       "      <th>claude-sonnet-4-5-20250929 (thinking_budget=8192)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>correct</th>\n",
       "      <td>107.000</td>\n",
       "      <td>339.000</td>\n",
       "      <td>65.000</td>\n",
       "      <td>284.000</td>\n",
       "      <td>299.000</td>\n",
       "      <td>307.000</td>\n",
       "      <td>557.000</td>\n",
       "      <td>301.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>incorrect</th>\n",
       "      <td>843.000</td>\n",
       "      <td>584.000</td>\n",
       "      <td>202.000</td>\n",
       "      <td>468.000</td>\n",
       "      <td>673.000</td>\n",
       "      <td>684.000</td>\n",
       "      <td>442.000</td>\n",
       "      <td>454.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>refusal</th>\n",
       "      <td>50.000</td>\n",
       "      <td>77.000</td>\n",
       "      <td>733.000</td>\n",
       "      <td>248.000</td>\n",
       "      <td>28.000</td>\n",
       "      <td>9.000</td>\n",
       "      <td>1.000</td>\n",
       "      <td>245.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>total_input_tokens</th>\n",
       "      <td>57478.000</td>\n",
       "      <td>57478.000</td>\n",
       "      <td>65285.000</td>\n",
       "      <td>65285.000</td>\n",
       "      <td>55214.000</td>\n",
       "      <td>55214.000</td>\n",
       "      <td>55214.000</td>\n",
       "      <td>94285.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>total_output_tokens</th>\n",
       "      <td>174334.000</td>\n",
       "      <td>235465.000</td>\n",
       "      <td>127239.000</td>\n",
       "      <td>263960.000</td>\n",
       "      <td>366763.000</td>\n",
       "      <td>741656.000</td>\n",
       "      <td>1309309.000</td>\n",
       "      <td>579993.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>normalized_reward</th>\n",
       "      <td>-3.265</td>\n",
       "      <td>-1.997</td>\n",
       "      <td>-0.743</td>\n",
       "      <td>-1.588</td>\n",
       "      <td>-2.393</td>\n",
       "      <td>-2.429</td>\n",
       "      <td>-1.211</td>\n",
       "      <td>-1.515</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     gpt-4o-mini-2024-07-18  gpt-4o-2024-08-06  \\\n",
       "correct                             107.000            339.000   \n",
       "incorrect                           843.000            584.000   \n",
       "refusal                              50.000             77.000   \n",
       "total_input_tokens                57478.000          57478.000   \n",
       "total_output_tokens              174334.000         235465.000   \n",
       "normalized_reward                    -3.265             -1.997   \n",
       "\n",
       "                     claude-3-5-haiku-20241022  \\\n",
       "correct                                 65.000   \n",
       "incorrect                              202.000   \n",
       "refusal                                733.000   \n",
       "total_input_tokens                   65285.000   \n",
       "total_output_tokens                 127239.000   \n",
       "normalized_reward                       -0.743   \n",
       "\n",
       "                     claude-sonnet-4-5-20250929 (no thinking)  \\\n",
       "correct                                               284.000   \n",
       "incorrect                                             468.000   \n",
       "refusal                                               248.000   \n",
       "total_input_tokens                                  65285.000   \n",
       "total_output_tokens                                263960.000   \n",
       "normalized_reward                                      -1.588   \n",
       "\n",
       "                     gemini-2.5-flash (no thinking)  \\\n",
       "correct                                     299.000   \n",
       "incorrect                                   673.000   \n",
       "refusal                                      28.000   \n",
       "total_input_tokens                        55214.000   \n",
       "total_output_tokens                      366763.000   \n",
       "normalized_reward                            -2.393   \n",
       "\n",
       "                     gemini-2.5-flash (reasoning_effort='medium')  \\\n",
       "correct                                                   307.000   \n",
       "incorrect                                                 684.000   \n",
       "refusal                                                     9.000   \n",
       "total_input_tokens                                      55214.000   \n",
       "total_output_tokens                                    741656.000   \n",
       "normalized_reward                                          -2.429   \n",
       "\n",
       "                     gemini-2.5-pro (reasoning_effort='medium')  \\\n",
       "correct                                                 557.000   \n",
       "incorrect                                               442.000   \n",
       "refusal                                                   1.000   \n",
       "total_input_tokens                                    55214.000   \n",
       "total_output_tokens                                 1309309.000   \n",
       "normalized_reward                                        -1.211   \n",
       "\n",
       "                     claude-sonnet-4-5-20250929 (thinking_budget=8192)  \n",
       "correct                                                        301.000  \n",
       "incorrect                                                      454.000  \n",
       "refusal                                                        245.000  \n",
       "total_input_tokens                                           94285.000  \n",
       "total_output_tokens                                         579993.000  \n",
       "normalized_reward                                               -1.515  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Transposed view of the summary table: one column per model, one row per metric.\n",
    "df.transpose()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a46cc35",
   "metadata": {},
   "source": [
    "## Calibration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b9777add",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Averaged confidence score: 0.3329624311642108\n",
      "Averaged confidence score: 0.6353289130857952\n",
      "Averaged confidence score: 0.8345664839955307\n",
      "Averaged confidence score: 0.88185037961687\n",
      "Averaged confidence score: 0.9491866558737267\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>accuracy</th>\n",
       "      <th>avg_confidence</th>\n",
       "      <th>ece</th>\n",
       "      <th>prr</th>\n",
       "      <th>total_input_tokens</th>\n",
       "      <th>total_output_tokens</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>gpt-4o-mini-2024-07-18</th>\n",
       "      <td>0.107</td>\n",
       "      <td>0.332962</td>\n",
       "      <td>0.313866</td>\n",
       "      <td>0.194779</td>\n",
       "      <td>65498.0</td>\n",
       "      <td>1000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gpt-4o-2024-08-06</th>\n",
       "      <td>0.339</td>\n",
       "      <td>0.635329</td>\n",
       "      <td>0.403581</td>\n",
       "      <td>0.272013</td>\n",
       "      <td>71100.0</td>\n",
       "      <td>1000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gemini-2.5-flash (no thinking)</th>\n",
       "      <td>0.299</td>\n",
       "      <td>0.834566</td>\n",
       "      <td>0.595562</td>\n",
       "      <td>0.151847</td>\n",
       "      <td>147613.0</td>\n",
       "      <td>343260.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gemini-2.5-flash (reasoning_effort='medium')</th>\n",
       "      <td>0.307</td>\n",
       "      <td>0.881850</td>\n",
       "      <td>0.620836</td>\n",
       "      <td>0.104867</td>\n",
       "      <td>139256.0</td>\n",
       "      <td>329243.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gemini-2.5-pro (reasoning_effort='medium')</th>\n",
       "      <td>0.557</td>\n",
       "      <td>0.949187</td>\n",
       "      <td>0.427702</td>\n",
       "      <td>0.244427</td>\n",
       "      <td>232935.0</td>\n",
       "      <td>678200.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              accuracy  avg_confidence  \\\n",
       "gpt-4o-mini-2024-07-18                           0.107        0.332962   \n",
       "gpt-4o-2024-08-06                                0.339        0.635329   \n",
       "gemini-2.5-flash (no thinking)                   0.299        0.834566   \n",
       "gemini-2.5-flash (reasoning_effort='medium')     0.307        0.881850   \n",
       "gemini-2.5-pro (reasoning_effort='medium')       0.557        0.949187   \n",
       "\n",
       "                                                   ece       prr  \\\n",
       "gpt-4o-mini-2024-07-18                        0.313866  0.194779   \n",
       "gpt-4o-2024-08-06                             0.403581  0.272013   \n",
       "gemini-2.5-flash (no thinking)                0.595562  0.151847   \n",
       "gemini-2.5-flash (reasoning_effort='medium')  0.620836  0.104867   \n",
       "gemini-2.5-pro (reasoning_effort='medium')    0.427702  0.244427   \n",
       "\n",
       "                                              total_input_tokens  \\\n",
       "gpt-4o-mini-2024-07-18                                   65498.0   \n",
       "gpt-4o-2024-08-06                                        71100.0   \n",
       "gemini-2.5-flash (no thinking)                          147613.0   \n",
       "gemini-2.5-flash (reasoning_effort='medium')            139256.0   \n",
       "gemini-2.5-pro (reasoning_effort='medium')              232935.0   \n",
       "\n",
       "                                              total_output_tokens  \n",
       "gpt-4o-mini-2024-07-18                                     1000.0  \n",
       "gpt-4o-2024-08-06                                          1000.0  \n",
       "gemini-2.5-flash (no thinking)                           343260.0  \n",
       "gemini-2.5-flash (reasoning_effort='medium')             329243.0  \n",
       "gemini-2.5-pro (reasoning_effort='medium')               678200.0  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "from typing import Any\n",
    "from collections import Counter\n",
    "from utils import get_logging_prefix\n",
    "from run_confidence_estimation import read_confs, read_correctness, calc_ece, calc_prr\n",
    "\n",
    "def read_stats(\n",
    "    dataset: str,\n",
    "    model: str,\n",
    "    prompt_type: str,\n",
    "    reasoning_effort: str | None,\n",
    "    thinking_budget: int | None,\n",
    "    grader_model: str,\n",
    "    calibration_method: str\n",
    ") -> dict[str, Any]:\n",
    "    \"\"\"Collect accuracy, calibration metrics and token usage for one run.\n",
    "\n",
    "    NOTE: this cell redefines ``read_stats``. It takes different arguments and\n",
    "    returns different keys than the ``read_stats`` defined in the first code cell.\n",
    "\n",
    "    Args:\n",
    "        dataset: Dataset name, forwarded to ``get_logging_prefix`` and\n",
    "            ``read_correctness``.\n",
    "        model: Model identifier, forwarded to ``get_logging_prefix``.\n",
    "        prompt_type: Prompt variant, forwarded to ``get_logging_prefix``.\n",
    "        reasoning_effort: Reasoning-effort setting of the run; may be ``None``.\n",
    "        thinking_budget: Thinking budget of the run; may be ``None``.\n",
    "        grader_model: Grader identifier; only the part after the last ``/`` is\n",
    "            used in the evaluation file name.\n",
    "        calibration_method: Confidence-estimation method; used in the confidence\n",
    "            file names and forwarded to ``read_confs``.\n",
    "\n",
    "    Returns:\n",
    "        Dict with the keys ``accuracy``, ``avg_confidence``, ``ece``, ``prr``,\n",
    "        ``total_input_tokens`` and ``total_output_tokens``.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the evaluation file or the confidence file yields no entries.\n",
    "    \"\"\"\n",
    "    logging_prefix = get_logging_prefix(\n",
    "        dataset=dataset,\n",
    "        model=model,\n",
    "        prompt_type=prompt_type,\n",
    "        reasoning_effort=reasoning_effort,\n",
    "        thinking_budget=thinking_budget,\n",
    "    )\n",
    "    stats = dict()\n",
    "    # Performance stats\n",
    "    eval_output_jsonl = f\"{logging_prefix}__eval-results-by-{grader_model.split('/')[-1]}.jsonl\"\n",
    "    correctness = read_correctness(dataset, eval_output_jsonl)\n",
    "    # Fail with a message naming the file instead of a bare ZeroDivisionError.\n",
    "    if len(correctness) == 0:\n",
    "        raise ValueError(f\"No correctness entries found in {eval_output_jsonl}\")\n",
    "    stats[\"accuracy\"] = sum(correctness) / len(correctness)\n",
    "    confidence_stem = f\"{logging_prefix}__confidence-estimation-by-{calibration_method}\"\n",
    "    confidence_output_jsonl = f\"{confidence_stem}.jsonl\"\n",
    "    confs = read_confs(confidence_output_jsonl, calibration_method)\n",
    "    if len(confs) == 0:\n",
    "        raise ValueError(f\"No confidence entries found in {confidence_output_jsonl}\")\n",
    "    stats[\"avg_confidence\"] = sum(confs) / len(confs)\n",
    "    # Only the first element of each helper's result is kept.\n",
    "    stats[\"ece\"] = calc_ece(confs, correctness)[0]\n",
    "    stats[\"prr\"] = calc_prr(confs, correctness)[0]\n",
    "    # Cost stats\n",
    "    # Built from the shared stem: str.replace(\".jsonl\", \".json\") would also rewrite\n",
    "    # any earlier \".jsonl\" occurrence inside the path.\n",
    "    confidence_output_json = f\"{confidence_stem}.json\"\n",
    "    # Read as UTF-8 explicitly instead of relying on the platform default encoding.\n",
    "    with open(confidence_output_json, \"r\", encoding=\"utf-8\") as f:\n",
    "        data = json.load(f)\n",
    "    stats[\"total_input_tokens\"] = data[\"total_prompt_tokens\"]\n",
    "    # Output tokens are derived as total tokens minus prompt tokens.\n",
    "    stats[\"total_output_tokens\"] = data[\"total_total_tokens\"] - data[\"total_prompt_tokens\"]\n",
    "    return stats\n",
    "\n",
    "# Settings shared by every calibration run summarised in this table.\n",
    "grader_model = \"openai/gpt-oss-20b\"\n",
    "dataset = \"simple_qa_verified\"\n",
    "prompt_type = \"no_risk\"\n",
    "calibration_method = \"ptrue\"\n",
    "\n",
    "# Display name -> reasoning settings passed on to read_stats.\n",
    "# The Claude entries are commented out, so they are skipped in this table.\n",
    "name2config = {\n",
    "    # Non-reasoning models or modes\n",
    "    \"gpt-4o-mini-2024-07-18\": dict(reasoning_effort=None, thinking_budget=None),\n",
    "    \"gpt-4o-2024-08-06\": dict(reasoning_effort=None, thinking_budget=None),\n",
    "    # \"claude-3-5-haiku-20241022\": dict(reasoning_effort=None, thinking_budget=None),\n",
    "    # \"claude-sonnet-4-5-20250929 (no thinking)\": dict(reasoning_effort=None, thinking_budget=None),\n",
    "    \"gemini-2.5-flash (no thinking)\": dict(reasoning_effort=\"none\", thinking_budget=None),\n",
    "    # Reasoning models or modes\n",
    "    \"gemini-2.5-flash (reasoning_effort='medium')\": dict(reasoning_effort=\"medium\", thinking_budget=None),\n",
    "    \"gemini-2.5-pro (reasoning_effort='medium')\": dict(reasoning_effort=\"medium\", thinking_budget=None),\n",
    "    # \"claude-sonnet-4-5-20250929 (thinking_budget=8192)\": dict(reasoning_effort=None, thinking_budget=8192),\n",
    "}\n",
    "\n",
    "# The model id handed to read_stats is the display name up to the first \"(\", stripped.\n",
    "name2stats = {\n",
    "    name: read_stats(\n",
    "        dataset,\n",
    "        name.split(\"(\")[0].strip(),\n",
    "        prompt_type,\n",
    "        grader_model=grader_model,\n",
    "        calibration_method=calibration_method,\n",
    "        **config,\n",
    "    )\n",
    "    for name, config in name2config.items()\n",
    "}\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# One row per display name, one column per metric.\n",
    "df = pd.DataFrame(name2stats).transpose()\n",
    "df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
