{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4850772c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>model</th>\n",
       "      <th>prompt_index</th>\n",
       "      <th>n</th>\n",
       "      <th>c</th>\n",
       "      <th>c_ci_lower</th>\n",
       "      <th>c_ci_upper</th>\n",
       "      <th>c_bootstrap_std</th>\n",
       "      <th>r</th>\n",
       "      <th>r_ci_lower</th>\n",
       "      <th>...</th>\n",
       "      <th>correct_attempted_ci_upper</th>\n",
       "      <th>correct_attempted_bootstrap_std</th>\n",
       "      <th>fscore</th>\n",
       "      <th>fscore_ci_lower</th>\n",
       "      <th>fscore_ci_upper</th>\n",
       "      <th>fscore_bootstrap_std</th>\n",
       "      <th>weighted</th>\n",
       "      <th>weighted_ci_lower</th>\n",
       "      <th>weighted_ci_upper</th>\n",
       "      <th>weighted_bootstrap_std</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gemma3_12b</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0565</td>\n",
       "      <td>0.047000</td>\n",
       "      <td>0.066000</td>\n",
       "      <td>0.004949</td>\n",
       "      <td>0.0310</td>\n",
       "      <td>0.023000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.067973</td>\n",
       "      <td>0.005089</td>\n",
       "      <td>0.057390</td>\n",
       "      <td>0.047764</td>\n",
       "      <td>0.067010</td>\n",
       "      <td>0.005016</td>\n",
       "      <td>-0.9125</td>\n",
       "      <td>-0.925000</td>\n",
       "      <td>-0.900000</td>\n",
       "      <td>0.006064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gemma3_12b</td>\n",
       "      <td>2</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0525</td>\n",
       "      <td>0.043000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.005043</td>\n",
       "      <td>0.1560</td>\n",
       "      <td>0.141500</td>\n",
       "      <td>...</td>\n",
       "      <td>0.073919</td>\n",
       "      <td>0.005944</td>\n",
       "      <td>0.056941</td>\n",
       "      <td>0.046812</td>\n",
       "      <td>0.067451</td>\n",
       "      <td>0.005450</td>\n",
       "      <td>-0.7915</td>\n",
       "      <td>-0.807512</td>\n",
       "      <td>-0.773000</td>\n",
       "      <td>0.009130</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gemma3_12b</td>\n",
       "      <td>3</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0270</td>\n",
       "      <td>0.020000</td>\n",
       "      <td>0.034500</td>\n",
       "      <td>0.003714</td>\n",
       "      <td>0.7015</td>\n",
       "      <td>0.683000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.113822</td>\n",
       "      <td>0.011958</td>\n",
       "      <td>0.041586</td>\n",
       "      <td>0.030769</td>\n",
       "      <td>0.052719</td>\n",
       "      <td>0.005639</td>\n",
       "      <td>-0.2715</td>\n",
       "      <td>-0.290000</td>\n",
       "      <td>-0.252500</td>\n",
       "      <td>0.009549</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gemma3_12b</td>\n",
       "      <td>4</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0145</td>\n",
       "      <td>0.009500</td>\n",
       "      <td>0.020000</td>\n",
       "      <td>0.002616</td>\n",
       "      <td>0.9115</td>\n",
       "      <td>0.899500</td>\n",
       "      <td>...</td>\n",
       "      <td>0.219111</td>\n",
       "      <td>0.027461</td>\n",
       "      <td>0.026642</td>\n",
       "      <td>0.017511</td>\n",
       "      <td>0.036480</td>\n",
       "      <td>0.004751</td>\n",
       "      <td>-0.0740</td>\n",
       "      <td>-0.084513</td>\n",
       "      <td>-0.063000</td>\n",
       "      <td>0.005607</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>mistral-123b</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.2155</td>\n",
       "      <td>0.196000</td>\n",
       "      <td>0.232513</td>\n",
       "      <td>0.009293</td>\n",
       "      <td>0.2625</td>\n",
       "      <td>0.243000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.313794</td>\n",
       "      <td>0.012008</td>\n",
       "      <td>0.248058</td>\n",
       "      <td>0.226757</td>\n",
       "      <td>0.267283</td>\n",
       "      <td>0.010358</td>\n",
       "      <td>-0.5220</td>\n",
       "      <td>-0.545000</td>\n",
       "      <td>-0.501988</td>\n",
       "      <td>0.011291</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>mistral-123b</td>\n",
       "      <td>2</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.2055</td>\n",
       "      <td>0.189000</td>\n",
       "      <td>0.223500</td>\n",
       "      <td>0.008956</td>\n",
       "      <td>0.3325</td>\n",
       "      <td>0.311987</td>\n",
       "      <td>...</td>\n",
       "      <td>0.332106</td>\n",
       "      <td>0.012468</td>\n",
       "      <td>0.246477</td>\n",
       "      <td>0.227494</td>\n",
       "      <td>0.266674</td>\n",
       "      <td>0.010265</td>\n",
       "      <td>-0.4620</td>\n",
       "      <td>-0.484000</td>\n",
       "      <td>-0.440500</td>\n",
       "      <td>0.011037</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>mistral-123b</td>\n",
       "      <td>3</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.1790</td>\n",
       "      <td>0.162988</td>\n",
       "      <td>0.195500</td>\n",
       "      <td>0.008420</td>\n",
       "      <td>0.4510</td>\n",
       "      <td>0.428988</td>\n",
       "      <td>...</td>\n",
       "      <td>0.354116</td>\n",
       "      <td>0.013924</td>\n",
       "      <td>0.231117</td>\n",
       "      <td>0.211334</td>\n",
       "      <td>0.251284</td>\n",
       "      <td>0.010279</td>\n",
       "      <td>-0.3700</td>\n",
       "      <td>-0.392000</td>\n",
       "      <td>-0.348488</td>\n",
       "      <td>0.010844</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>mistral-123b</td>\n",
       "      <td>4</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.1410</td>\n",
       "      <td>0.126500</td>\n",
       "      <td>0.156512</td>\n",
       "      <td>0.007873</td>\n",
       "      <td>0.6600</td>\n",
       "      <td>0.639487</td>\n",
       "      <td>...</td>\n",
       "      <td>0.450727</td>\n",
       "      <td>0.019473</td>\n",
       "      <td>0.210448</td>\n",
       "      <td>0.189752</td>\n",
       "      <td>0.231422</td>\n",
       "      <td>0.010937</td>\n",
       "      <td>-0.1990</td>\n",
       "      <td>-0.216512</td>\n",
       "      <td>-0.181500</td>\n",
       "      <td>0.009123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen235b</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.4075</td>\n",
       "      <td>0.387500</td>\n",
       "      <td>0.429500</td>\n",
       "      <td>0.010821</td>\n",
       "      <td>0.3080</td>\n",
       "      <td>0.288000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.614504</td>\n",
       "      <td>0.012920</td>\n",
       "      <td>0.481678</td>\n",
       "      <td>0.461305</td>\n",
       "      <td>0.504603</td>\n",
       "      <td>0.011435</td>\n",
       "      <td>-0.2845</td>\n",
       "      <td>-0.303000</td>\n",
       "      <td>-0.265500</td>\n",
       "      <td>0.009726</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen235b</td>\n",
       "      <td>2</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.4375</td>\n",
       "      <td>0.418488</td>\n",
       "      <td>0.461000</td>\n",
       "      <td>0.010744</td>\n",
       "      <td>0.2325</td>\n",
       "      <td>0.214487</td>\n",
       "      <td>...</td>\n",
       "      <td>0.594703</td>\n",
       "      <td>0.012015</td>\n",
       "      <td>0.495050</td>\n",
       "      <td>0.475893</td>\n",
       "      <td>0.519102</td>\n",
       "      <td>0.011055</td>\n",
       "      <td>-0.3300</td>\n",
       "      <td>-0.348500</td>\n",
       "      <td>-0.310500</td>\n",
       "      <td>0.009941</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen235b</td>\n",
       "      <td>3</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.4245</td>\n",
       "      <td>0.402487</td>\n",
       "      <td>0.447000</td>\n",
       "      <td>0.011041</td>\n",
       "      <td>0.2445</td>\n",
       "      <td>0.225500</td>\n",
       "      <td>...</td>\n",
       "      <td>0.588480</td>\n",
       "      <td>0.012388</td>\n",
       "      <td>0.483623</td>\n",
       "      <td>0.461270</td>\n",
       "      <td>0.507237</td>\n",
       "      <td>0.011389</td>\n",
       "      <td>-0.3310</td>\n",
       "      <td>-0.349500</td>\n",
       "      <td>-0.310000</td>\n",
       "      <td>0.010069</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen235b</td>\n",
       "      <td>4</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.2325</td>\n",
       "      <td>0.214000</td>\n",
       "      <td>0.250500</td>\n",
       "      <td>0.009518</td>\n",
       "      <td>0.6470</td>\n",
       "      <td>0.625500</td>\n",
       "      <td>...</td>\n",
       "      <td>0.690343</td>\n",
       "      <td>0.017484</td>\n",
       "      <td>0.343681</td>\n",
       "      <td>0.320059</td>\n",
       "      <td>0.366694</td>\n",
       "      <td>0.012079</td>\n",
       "      <td>-0.1205</td>\n",
       "      <td>-0.135000</td>\n",
       "      <td>-0.107000</td>\n",
       "      <td>0.007237</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen25-72b</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0690</td>\n",
       "      <td>0.058500</td>\n",
       "      <td>0.080000</td>\n",
       "      <td>0.005573</td>\n",
       "      <td>0.5710</td>\n",
       "      <td>0.546988</td>\n",
       "      <td>...</td>\n",
       "      <td>0.184841</td>\n",
       "      <td>0.012151</td>\n",
       "      <td>0.096571</td>\n",
       "      <td>0.081491</td>\n",
       "      <td>0.111658</td>\n",
       "      <td>0.007554</td>\n",
       "      <td>-0.3600</td>\n",
       "      <td>-0.380500</td>\n",
       "      <td>-0.338987</td>\n",
       "      <td>0.011039</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen25-72b</td>\n",
       "      <td>2</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0590</td>\n",
       "      <td>0.049000</td>\n",
       "      <td>0.069000</td>\n",
       "      <td>0.005021</td>\n",
       "      <td>0.6690</td>\n",
       "      <td>0.649000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.206480</td>\n",
       "      <td>0.014318</td>\n",
       "      <td>0.088655</td>\n",
       "      <td>0.074411</td>\n",
       "      <td>0.102815</td>\n",
       "      <td>0.007345</td>\n",
       "      <td>-0.2720</td>\n",
       "      <td>-0.289513</td>\n",
       "      <td>-0.253000</td>\n",
       "      <td>0.009778</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen25-72b</td>\n",
       "      <td>3</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0340</td>\n",
       "      <td>0.026000</td>\n",
       "      <td>0.042012</td>\n",
       "      <td>0.004076</td>\n",
       "      <td>0.8375</td>\n",
       "      <td>0.820500</td>\n",
       "      <td>...</td>\n",
       "      <td>0.253186</td>\n",
       "      <td>0.022759</td>\n",
       "      <td>0.058495</td>\n",
       "      <td>0.045260</td>\n",
       "      <td>0.072262</td>\n",
       "      <td>0.006839</td>\n",
       "      <td>-0.1285</td>\n",
       "      <td>-0.143512</td>\n",
       "      <td>-0.113500</td>\n",
       "      <td>0.007755</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen25-72b</td>\n",
       "      <td>4</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0290</td>\n",
       "      <td>0.021988</td>\n",
       "      <td>0.036500</td>\n",
       "      <td>0.003833</td>\n",
       "      <td>0.8890</td>\n",
       "      <td>0.875487</td>\n",
       "      <td>...</td>\n",
       "      <td>0.322591</td>\n",
       "      <td>0.031134</td>\n",
       "      <td>0.052205</td>\n",
       "      <td>0.039630</td>\n",
       "      <td>0.065680</td>\n",
       "      <td>0.006761</td>\n",
       "      <td>-0.0820</td>\n",
       "      <td>-0.094000</td>\n",
       "      <td>-0.070000</td>\n",
       "      <td>0.006177</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0430</td>\n",
       "      <td>0.034487</td>\n",
       "      <td>0.052500</td>\n",
       "      <td>0.004544</td>\n",
       "      <td>0.4390</td>\n",
       "      <td>0.418500</td>\n",
       "      <td>...</td>\n",
       "      <td>0.093645</td>\n",
       "      <td>0.007916</td>\n",
       "      <td>0.055093</td>\n",
       "      <td>0.044032</td>\n",
       "      <td>0.067266</td>\n",
       "      <td>0.005752</td>\n",
       "      <td>-0.5180</td>\n",
       "      <td>-0.538500</td>\n",
       "      <td>-0.496000</td>\n",
       "      <td>0.010790</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b</td>\n",
       "      <td>2</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0390</td>\n",
       "      <td>0.031000</td>\n",
       "      <td>0.048012</td>\n",
       "      <td>0.004381</td>\n",
       "      <td>0.6020</td>\n",
       "      <td>0.580000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.119114</td>\n",
       "      <td>0.010668</td>\n",
       "      <td>0.055794</td>\n",
       "      <td>0.044081</td>\n",
       "      <td>0.068475</td>\n",
       "      <td>0.006174</td>\n",
       "      <td>-0.3590</td>\n",
       "      <td>-0.381000</td>\n",
       "      <td>-0.338987</td>\n",
       "      <td>0.010727</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b</td>\n",
       "      <td>3</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0290</td>\n",
       "      <td>0.021500</td>\n",
       "      <td>0.036500</td>\n",
       "      <td>0.003805</td>\n",
       "      <td>0.7515</td>\n",
       "      <td>0.731500</td>\n",
       "      <td>...</td>\n",
       "      <td>0.146498</td>\n",
       "      <td>0.014723</td>\n",
       "      <td>0.046456</td>\n",
       "      <td>0.034801</td>\n",
       "      <td>0.058075</td>\n",
       "      <td>0.006003</td>\n",
       "      <td>-0.2195</td>\n",
       "      <td>-0.238500</td>\n",
       "      <td>-0.201487</td>\n",
       "      <td>0.009529</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b</td>\n",
       "      <td>4</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0120</td>\n",
       "      <td>0.007500</td>\n",
       "      <td>0.017500</td>\n",
       "      <td>0.002438</td>\n",
       "      <td>0.9420</td>\n",
       "      <td>0.931488</td>\n",
       "      <td>...</td>\n",
       "      <td>0.284722</td>\n",
       "      <td>0.037540</td>\n",
       "      <td>0.022684</td>\n",
       "      <td>0.014245</td>\n",
       "      <td>0.032668</td>\n",
       "      <td>0.004555</td>\n",
       "      <td>-0.0460</td>\n",
       "      <td>-0.055000</td>\n",
       "      <td>-0.037000</td>\n",
       "      <td>0.004721</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b_think</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0530</td>\n",
       "      <td>0.043000</td>\n",
       "      <td>0.062000</td>\n",
       "      <td>0.004989</td>\n",
       "      <td>0.5160</td>\n",
       "      <td>0.493000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.128501</td>\n",
       "      <td>0.010124</td>\n",
       "      <td>0.071429</td>\n",
       "      <td>0.058231</td>\n",
       "      <td>0.083672</td>\n",
       "      <td>0.006643</td>\n",
       "      <td>-0.4310</td>\n",
       "      <td>-0.454012</td>\n",
       "      <td>-0.411000</td>\n",
       "      <td>0.011070</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b_think</td>\n",
       "      <td>2</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0445</td>\n",
       "      <td>0.036500</td>\n",
       "      <td>0.054000</td>\n",
       "      <td>0.004467</td>\n",
       "      <td>0.6920</td>\n",
       "      <td>0.670500</td>\n",
       "      <td>...</td>\n",
       "      <td>0.173916</td>\n",
       "      <td>0.013702</td>\n",
       "      <td>0.068043</td>\n",
       "      <td>0.055949</td>\n",
       "      <td>0.082413</td>\n",
       "      <td>0.006678</td>\n",
       "      <td>-0.2635</td>\n",
       "      <td>-0.283500</td>\n",
       "      <td>-0.245487</td>\n",
       "      <td>0.009549</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b_think</td>\n",
       "      <td>3</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0360</td>\n",
       "      <td>0.028000</td>\n",
       "      <td>0.044500</td>\n",
       "      <td>0.004106</td>\n",
       "      <td>0.7435</td>\n",
       "      <td>0.725000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.170022</td>\n",
       "      <td>0.014889</td>\n",
       "      <td>0.057302</td>\n",
       "      <td>0.044997</td>\n",
       "      <td>0.070125</td>\n",
       "      <td>0.006377</td>\n",
       "      <td>-0.2205</td>\n",
       "      <td>-0.237500</td>\n",
       "      <td>-0.202500</td>\n",
       "      <td>0.009323</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b_think</td>\n",
       "      <td>4</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0170</td>\n",
       "      <td>0.012000</td>\n",
       "      <td>0.023000</td>\n",
       "      <td>0.002870</td>\n",
       "      <td>0.9050</td>\n",
       "      <td>0.892000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.235001</td>\n",
       "      <td>0.027836</td>\n",
       "      <td>0.031050</td>\n",
       "      <td>0.021877</td>\n",
       "      <td>0.041538</td>\n",
       "      <td>0.005171</td>\n",
       "      <td>-0.0780</td>\n",
       "      <td>-0.089500</td>\n",
       "      <td>-0.066487</td>\n",
       "      <td>0.005962</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>llama_70b</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0330</td>\n",
       "      <td>0.026000</td>\n",
       "      <td>0.041000</td>\n",
       "      <td>0.003884</td>\n",
       "      <td>0.8425</td>\n",
       "      <td>0.826488</td>\n",
       "      <td>...</td>\n",
       "      <td>0.255257</td>\n",
       "      <td>0.022227</td>\n",
       "      <td>0.057019</td>\n",
       "      <td>0.044923</td>\n",
       "      <td>0.070389</td>\n",
       "      <td>0.006543</td>\n",
       "      <td>-0.1245</td>\n",
       "      <td>-0.138513</td>\n",
       "      <td>-0.109500</td>\n",
       "      <td>0.007309</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gpt-4.1</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.3445</td>\n",
       "      <td>0.323500</td>\n",
       "      <td>0.366000</td>\n",
       "      <td>0.010448</td>\n",
       "      <td>0.0555</td>\n",
       "      <td>0.045000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.386365</td>\n",
       "      <td>0.010932</td>\n",
       "      <td>0.354333</td>\n",
       "      <td>0.332991</td>\n",
       "      <td>0.375548</td>\n",
       "      <td>0.010642</td>\n",
       "      <td>-0.6000</td>\n",
       "      <td>-0.622000</td>\n",
       "      <td>-0.578000</td>\n",
       "      <td>0.010906</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gpt-4.1-mini</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.1330</td>\n",
       "      <td>0.118000</td>\n",
       "      <td>0.150000</td>\n",
       "      <td>0.007997</td>\n",
       "      <td>0.3075</td>\n",
       "      <td>0.288500</td>\n",
       "      <td>...</td>\n",
       "      <td>0.215604</td>\n",
       "      <td>0.011225</td>\n",
       "      <td>0.157164</td>\n",
       "      <td>0.140045</td>\n",
       "      <td>0.177137</td>\n",
       "      <td>0.009272</td>\n",
       "      <td>-0.5595</td>\n",
       "      <td>-0.581000</td>\n",
       "      <td>-0.536488</td>\n",
       "      <td>0.011325</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>claude-sonnet-4</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0875</td>\n",
       "      <td>0.075987</td>\n",
       "      <td>0.099512</td>\n",
       "      <td>0.006371</td>\n",
       "      <td>0.8490</td>\n",
       "      <td>0.834000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.634390</td>\n",
       "      <td>0.028243</td>\n",
       "      <td>0.152042</td>\n",
       "      <td>0.132856</td>\n",
       "      <td>0.171515</td>\n",
       "      <td>0.010303</td>\n",
       "      <td>-0.0635</td>\n",
       "      <td>-0.074500</td>\n",
       "      <td>-0.053000</td>\n",
       "      <td>0.005411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>claude-3.5-haiku</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0245</td>\n",
       "      <td>0.018000</td>\n",
       "      <td>0.031500</td>\n",
       "      <td>0.003491</td>\n",
       "      <td>0.9335</td>\n",
       "      <td>0.922987</td>\n",
       "      <td>...</td>\n",
       "      <td>0.453237</td>\n",
       "      <td>0.042665</td>\n",
       "      <td>0.045945</td>\n",
       "      <td>0.033897</td>\n",
       "      <td>0.058797</td>\n",
       "      <td>0.006408</td>\n",
       "      <td>-0.0420</td>\n",
       "      <td>-0.050513</td>\n",
       "      <td>-0.033500</td>\n",
       "      <td>0.004488</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gemini-2.5-flash</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.1875</td>\n",
       "      <td>0.171000</td>\n",
       "      <td>0.206012</td>\n",
       "      <td>0.008963</td>\n",
       "      <td>0.4170</td>\n",
       "      <td>0.395500</td>\n",
       "      <td>...</td>\n",
       "      <td>0.349524</td>\n",
       "      <td>0.013773</td>\n",
       "      <td>0.236892</td>\n",
       "      <td>0.216864</td>\n",
       "      <td>0.258784</td>\n",
       "      <td>0.010687</td>\n",
       "      <td>-0.3955</td>\n",
       "      <td>-0.416000</td>\n",
       "      <td>-0.374500</td>\n",
       "      <td>0.010642</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gemini-2.5-flash-lite</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0820</td>\n",
       "      <td>0.070000</td>\n",
       "      <td>0.093512</td>\n",
       "      <td>0.006028</td>\n",
       "      <td>0.4060</td>\n",
       "      <td>0.384987</td>\n",
       "      <td>...</td>\n",
       "      <td>0.157899</td>\n",
       "      <td>0.009930</td>\n",
       "      <td>0.102886</td>\n",
       "      <td>0.087935</td>\n",
       "      <td>0.117538</td>\n",
       "      <td>0.007446</td>\n",
       "      <td>-0.5120</td>\n",
       "      <td>-0.534000</td>\n",
       "      <td>-0.489500</td>\n",
       "      <td>0.011432</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>deepseek-chat-v3-0324</td>\n",
       "      <td>1</td>\n",
       "      <td>1000</td>\n",
       "      <td>0.1960</td>\n",
       "      <td>0.173000</td>\n",
       "      <td>0.221000</td>\n",
       "      <td>0.012546</td>\n",
       "      <td>0.2940</td>\n",
       "      <td>0.266000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.311622</td>\n",
       "      <td>0.016822</td>\n",
       "      <td>0.229777</td>\n",
       "      <td>0.203168</td>\n",
       "      <td>0.258292</td>\n",
       "      <td>0.014195</td>\n",
       "      <td>-0.5100</td>\n",
       "      <td>-0.540000</td>\n",
       "      <td>-0.479975</td>\n",
       "      <td>0.015913</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>deepseek-chat-v3-0324</td>\n",
       "      <td>2</td>\n",
       "      <td>1000</td>\n",
       "      <td>0.1550</td>\n",
       "      <td>0.134000</td>\n",
       "      <td>0.179000</td>\n",
       "      <td>0.011386</td>\n",
       "      <td>0.5210</td>\n",
       "      <td>0.490000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.366821</td>\n",
       "      <td>0.021329</td>\n",
       "      <td>0.209601</td>\n",
       "      <td>0.181944</td>\n",
       "      <td>0.239684</td>\n",
       "      <td>0.014539</td>\n",
       "      <td>-0.3240</td>\n",
       "      <td>-0.353000</td>\n",
       "      <td>-0.294000</td>\n",
       "      <td>0.014839</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>deepseek-chat-v3-0324</td>\n",
       "      <td>3</td>\n",
       "      <td>1000</td>\n",
       "      <td>0.1180</td>\n",
       "      <td>0.099000</td>\n",
       "      <td>0.139025</td>\n",
       "      <td>0.010424</td>\n",
       "      <td>0.6800</td>\n",
       "      <td>0.652000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.420898</td>\n",
       "      <td>0.026857</td>\n",
       "      <td>0.178788</td>\n",
       "      <td>0.151373</td>\n",
       "      <td>0.209202</td>\n",
       "      <td>0.014742</td>\n",
       "      <td>-0.2020</td>\n",
       "      <td>-0.225000</td>\n",
       "      <td>-0.179000</td>\n",
       "      <td>0.012298</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>deepseek-chat-v3-0324</td>\n",
       "      <td>4</td>\n",
       "      <td>1000</td>\n",
       "      <td>0.0640</td>\n",
       "      <td>0.049000</td>\n",
       "      <td>0.081000</td>\n",
       "      <td>0.007942</td>\n",
       "      <td>0.8470</td>\n",
       "      <td>0.824000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.040661</td>\n",
       "      <td>0.111015</td>\n",
       "      <td>0.085664</td>\n",
       "      <td>0.138699</td>\n",
       "      <td>0.013125</td>\n",
       "      <td>-0.0890</td>\n",
       "      <td>-0.107000</td>\n",
       "      <td>-0.072000</td>\n",
       "      <td>0.008814</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>glm-4.5</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0645</td>\n",
       "      <td>0.053000</td>\n",
       "      <td>0.075500</td>\n",
       "      <td>0.005550</td>\n",
       "      <td>0.7895</td>\n",
       "      <td>0.771000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.346961</td>\n",
       "      <td>0.022675</td>\n",
       "      <td>0.106568</td>\n",
       "      <td>0.088963</td>\n",
       "      <td>0.123498</td>\n",
       "      <td>0.008779</td>\n",
       "      <td>-0.1460</td>\n",
       "      <td>-0.161500</td>\n",
       "      <td>-0.131000</td>\n",
       "      <td>0.007891</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>glm-4.5-air</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>0.0485</td>\n",
       "      <td>0.039000</td>\n",
       "      <td>0.058500</td>\n",
       "      <td>0.004936</td>\n",
       "      <td>0.7070</td>\n",
       "      <td>0.687000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.196279</td>\n",
       "      <td>0.015722</td>\n",
       "      <td>0.075019</td>\n",
       "      <td>0.060700</td>\n",
       "      <td>0.090107</td>\n",
       "      <td>0.007440</td>\n",
       "      <td>-0.2445</td>\n",
       "      <td>-0.263012</td>\n",
       "      <td>-0.226500</td>\n",
       "      <td>0.009547</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>37 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     dataset                  model  prompt_index     n       c  c_ci_lower  \\\n",
       "0   simpleqa             gemma3_12b             1  2000  0.0565    0.047000   \n",
       "1   simpleqa             gemma3_12b             2  2000  0.0525    0.043000   \n",
       "2   simpleqa             gemma3_12b             3  2000  0.0270    0.020000   \n",
       "3   simpleqa             gemma3_12b             4  2000  0.0145    0.009500   \n",
       "4   simpleqa           mistral-123b             1  2000  0.2155    0.196000   \n",
       "5   simpleqa           mistral-123b             2  2000  0.2055    0.189000   \n",
       "6   simpleqa           mistral-123b             3  2000  0.1790    0.162988   \n",
       "7   simpleqa           mistral-123b             4  2000  0.1410    0.126500   \n",
       "8   simpleqa               qwen235b             1  2000  0.4075    0.387500   \n",
       "9   simpleqa               qwen235b             2  2000  0.4375    0.418488   \n",
       "10  simpleqa               qwen235b             3  2000  0.4245    0.402487   \n",
       "11  simpleqa               qwen235b             4  2000  0.2325    0.214000   \n",
       "12  simpleqa             qwen25-72b             1  2000  0.0690    0.058500   \n",
       "13  simpleqa             qwen25-72b             2  2000  0.0590    0.049000   \n",
       "14  simpleqa             qwen25-72b             3  2000  0.0340    0.026000   \n",
       "15  simpleqa             qwen25-72b             4  2000  0.0290    0.021988   \n",
       "16  simpleqa                qwen32b             1  2000  0.0430    0.034487   \n",
       "17  simpleqa                qwen32b             2  2000  0.0390    0.031000   \n",
       "18  simpleqa                qwen32b             3  2000  0.0290    0.021500   \n",
       "19  simpleqa                qwen32b             4  2000  0.0120    0.007500   \n",
       "20  simpleqa          qwen32b_think             1  2000  0.0530    0.043000   \n",
       "21  simpleqa          qwen32b_think             2  2000  0.0445    0.036500   \n",
       "22  simpleqa          qwen32b_think             3  2000  0.0360    0.028000   \n",
       "23  simpleqa          qwen32b_think             4  2000  0.0170    0.012000   \n",
       "24  simpleqa              llama_70b             1  2000  0.0330    0.026000   \n",
       "25  simpleqa                gpt-4.1             1  2000  0.3445    0.323500   \n",
       "26  simpleqa           gpt-4.1-mini             1  2000  0.1330    0.118000   \n",
       "27  simpleqa        claude-sonnet-4             1  2000  0.0875    0.075987   \n",
       "28  simpleqa       claude-3.5-haiku             1  2000  0.0245    0.018000   \n",
       "29  simpleqa       gemini-2.5-flash             1  2000  0.1875    0.171000   \n",
       "30  simpleqa  gemini-2.5-flash-lite             1  2000  0.0820    0.070000   \n",
       "31  simpleqa  deepseek-chat-v3-0324             1  1000  0.1960    0.173000   \n",
       "32  simpleqa  deepseek-chat-v3-0324             2  1000  0.1550    0.134000   \n",
       "33  simpleqa  deepseek-chat-v3-0324             3  1000  0.1180    0.099000   \n",
       "34  simpleqa  deepseek-chat-v3-0324             4  1000  0.0640    0.049000   \n",
       "35  simpleqa                glm-4.5             1  2000  0.0645    0.053000   \n",
       "36  simpleqa            glm-4.5-air             1  2000  0.0485    0.039000   \n",
       "\n",
       "    c_ci_upper  c_bootstrap_std       r  r_ci_lower  ...  \\\n",
       "0     0.066000         0.004949  0.0310    0.023000  ...   \n",
       "1     0.062500         0.005043  0.1560    0.141500  ...   \n",
       "2     0.034500         0.003714  0.7015    0.683000  ...   \n",
       "3     0.020000         0.002616  0.9115    0.899500  ...   \n",
       "4     0.232513         0.009293  0.2625    0.243000  ...   \n",
       "5     0.223500         0.008956  0.3325    0.311987  ...   \n",
       "6     0.195500         0.008420  0.4510    0.428988  ...   \n",
       "7     0.156512         0.007873  0.6600    0.639487  ...   \n",
       "8     0.429500         0.010821  0.3080    0.288000  ...   \n",
       "9     0.461000         0.010744  0.2325    0.214487  ...   \n",
       "10    0.447000         0.011041  0.2445    0.225500  ...   \n",
       "11    0.250500         0.009518  0.6470    0.625500  ...   \n",
       "12    0.080000         0.005573  0.5710    0.546988  ...   \n",
       "13    0.069000         0.005021  0.6690    0.649000  ...   \n",
       "14    0.042012         0.004076  0.8375    0.820500  ...   \n",
       "15    0.036500         0.003833  0.8890    0.875487  ...   \n",
       "16    0.052500         0.004544  0.4390    0.418500  ...   \n",
       "17    0.048012         0.004381  0.6020    0.580000  ...   \n",
       "18    0.036500         0.003805  0.7515    0.731500  ...   \n",
       "19    0.017500         0.002438  0.9420    0.931488  ...   \n",
       "20    0.062000         0.004989  0.5160    0.493000  ...   \n",
       "21    0.054000         0.004467  0.6920    0.670500  ...   \n",
       "22    0.044500         0.004106  0.7435    0.725000  ...   \n",
       "23    0.023000         0.002870  0.9050    0.892000  ...   \n",
       "24    0.041000         0.003884  0.8425    0.826488  ...   \n",
       "25    0.366000         0.010448  0.0555    0.045000  ...   \n",
       "26    0.150000         0.007997  0.3075    0.288500  ...   \n",
       "27    0.099512         0.006371  0.8490    0.834000  ...   \n",
       "28    0.031500         0.003491  0.9335    0.922987  ...   \n",
       "29    0.206012         0.008963  0.4170    0.395500  ...   \n",
       "30    0.093512         0.006028  0.4060    0.384987  ...   \n",
       "31    0.221000         0.012546  0.2940    0.266000  ...   \n",
       "32    0.179000         0.011386  0.5210    0.490000  ...   \n",
       "33    0.139025         0.010424  0.6800    0.652000  ...   \n",
       "34    0.081000         0.007942  0.8470    0.824000  ...   \n",
       "35    0.075500         0.005550  0.7895    0.771000  ...   \n",
       "36    0.058500         0.004936  0.7070    0.687000  ...   \n",
       "\n",
       "    correct_attempted_ci_upper  correct_attempted_bootstrap_std    fscore  \\\n",
       "0                     0.067973                         0.005089  0.057390   \n",
       "1                     0.073919                         0.005944  0.056941   \n",
       "2                     0.113822                         0.011958  0.041586   \n",
       "3                     0.219111                         0.027461  0.026642   \n",
       "4                     0.313794                         0.012008  0.248058   \n",
       "5                     0.332106                         0.012468  0.246477   \n",
       "6                     0.354116                         0.013924  0.231117   \n",
       "7                     0.450727                         0.019473  0.210448   \n",
       "8                     0.614504                         0.012920  0.481678   \n",
       "9                     0.594703                         0.012015  0.495050   \n",
       "10                    0.588480                         0.012388  0.483623   \n",
       "11                    0.690343                         0.017484  0.343681   \n",
       "12                    0.184841                         0.012151  0.096571   \n",
       "13                    0.206480                         0.014318  0.088655   \n",
       "14                    0.253186                         0.022759  0.058495   \n",
       "15                    0.322591                         0.031134  0.052205   \n",
       "16                    0.093645                         0.007916  0.055093   \n",
       "17                    0.119114                         0.010668  0.055794   \n",
       "18                    0.146498                         0.014723  0.046456   \n",
       "19                    0.284722                         0.037540  0.022684   \n",
       "20                    0.128501                         0.010124  0.071429   \n",
       "21                    0.173916                         0.013702  0.068043   \n",
       "22                    0.170022                         0.014889  0.057302   \n",
       "23                    0.235001                         0.027836  0.031050   \n",
       "24                    0.255257                         0.022227  0.057019   \n",
       "25                    0.386365                         0.010932  0.354333   \n",
       "26                    0.215604                         0.011225  0.157164   \n",
       "27                    0.634390                         0.028243  0.152042   \n",
       "28                    0.453237                         0.042665  0.045945   \n",
       "29                    0.349524                         0.013773  0.236892   \n",
       "30                    0.157899                         0.009930  0.102886   \n",
       "31                    0.311622                         0.016822  0.229777   \n",
       "32                    0.366821                         0.021329  0.209601   \n",
       "33                    0.420898                         0.026857  0.178788   \n",
       "34                    0.500000                         0.040661  0.111015   \n",
       "35                    0.346961                         0.022675  0.106568   \n",
       "36                    0.196279                         0.015722  0.075019   \n",
       "\n",
       "    fscore_ci_lower  fscore_ci_upper  fscore_bootstrap_std  weighted  \\\n",
       "0          0.047764         0.067010              0.005016   -0.9125   \n",
       "1          0.046812         0.067451              0.005450   -0.7915   \n",
       "2          0.030769         0.052719              0.005639   -0.2715   \n",
       "3          0.017511         0.036480              0.004751   -0.0740   \n",
       "4          0.226757         0.267283              0.010358   -0.5220   \n",
       "5          0.227494         0.266674              0.010265   -0.4620   \n",
       "6          0.211334         0.251284              0.010279   -0.3700   \n",
       "7          0.189752         0.231422              0.010937   -0.1990   \n",
       "8          0.461305         0.504603              0.011435   -0.2845   \n",
       "9          0.475893         0.519102              0.011055   -0.3300   \n",
       "10         0.461270         0.507237              0.011389   -0.3310   \n",
       "11         0.320059         0.366694              0.012079   -0.1205   \n",
       "12         0.081491         0.111658              0.007554   -0.3600   \n",
       "13         0.074411         0.102815              0.007345   -0.2720   \n",
       "14         0.045260         0.072262              0.006839   -0.1285   \n",
       "15         0.039630         0.065680              0.006761   -0.0820   \n",
       "16         0.044032         0.067266              0.005752   -0.5180   \n",
       "17         0.044081         0.068475              0.006174   -0.3590   \n",
       "18         0.034801         0.058075              0.006003   -0.2195   \n",
       "19         0.014245         0.032668              0.004555   -0.0460   \n",
       "20         0.058231         0.083672              0.006643   -0.4310   \n",
       "21         0.055949         0.082413              0.006678   -0.2635   \n",
       "22         0.044997         0.070125              0.006377   -0.2205   \n",
       "23         0.021877         0.041538              0.005171   -0.0780   \n",
       "24         0.044923         0.070389              0.006543   -0.1245   \n",
       "25         0.332991         0.375548              0.010642   -0.6000   \n",
       "26         0.140045         0.177137              0.009272   -0.5595   \n",
       "27         0.132856         0.171515              0.010303   -0.0635   \n",
       "28         0.033897         0.058797              0.006408   -0.0420   \n",
       "29         0.216864         0.258784              0.010687   -0.3955   \n",
       "30         0.087935         0.117538              0.007446   -0.5120   \n",
       "31         0.203168         0.258292              0.014195   -0.5100   \n",
       "32         0.181944         0.239684              0.014539   -0.3240   \n",
       "33         0.151373         0.209202              0.014742   -0.2020   \n",
       "34         0.085664         0.138699              0.013125   -0.0890   \n",
       "35         0.088963         0.123498              0.008779   -0.1460   \n",
       "36         0.060700         0.090107              0.007440   -0.2445   \n",
       "\n",
       "    weighted_ci_lower  weighted_ci_upper  weighted_bootstrap_std  \n",
       "0           -0.925000          -0.900000                0.006064  \n",
       "1           -0.807512          -0.773000                0.009130  \n",
       "2           -0.290000          -0.252500                0.009549  \n",
       "3           -0.084513          -0.063000                0.005607  \n",
       "4           -0.545000          -0.501988                0.011291  \n",
       "5           -0.484000          -0.440500                0.011037  \n",
       "6           -0.392000          -0.348488                0.010844  \n",
       "7           -0.216512          -0.181500                0.009123  \n",
       "8           -0.303000          -0.265500                0.009726  \n",
       "9           -0.348500          -0.310500                0.009941  \n",
       "10          -0.349500          -0.310000                0.010069  \n",
       "11          -0.135000          -0.107000                0.007237  \n",
       "12          -0.380500          -0.338987                0.011039  \n",
       "13          -0.289513          -0.253000                0.009778  \n",
       "14          -0.143512          -0.113500                0.007755  \n",
       "15          -0.094000          -0.070000                0.006177  \n",
       "16          -0.538500          -0.496000                0.010790  \n",
       "17          -0.381000          -0.338987                0.010727  \n",
       "18          -0.238500          -0.201487                0.009529  \n",
       "19          -0.055000          -0.037000                0.004721  \n",
       "20          -0.454012          -0.411000                0.011070  \n",
       "21          -0.283500          -0.245487                0.009549  \n",
       "22          -0.237500          -0.202500                0.009323  \n",
       "23          -0.089500          -0.066487                0.005962  \n",
       "24          -0.138513          -0.109500                0.007309  \n",
       "25          -0.622000          -0.578000                0.010906  \n",
       "26          -0.581000          -0.536488                0.011325  \n",
       "27          -0.074500          -0.053000                0.005411  \n",
       "28          -0.050513          -0.033500                0.004488  \n",
       "29          -0.416000          -0.374500                0.010642  \n",
       "30          -0.534000          -0.489500                0.011432  \n",
       "31          -0.540000          -0.479975                0.015913  \n",
       "32          -0.353000          -0.294000                0.014839  \n",
       "33          -0.225000          -0.179000                0.012298  \n",
       "34          -0.107000          -0.072000                0.008814  \n",
       "35          -0.161500          -0.131000                0.007891  \n",
       "36          -0.263012          -0.226500                0.009547  \n",
       "\n",
       "[37 rows x 24 columns]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the simple-metrics results table from CSV into `sm_df` and display it.\n",
    "# The saved output shows one row per (dataset, model, prompt_index) with point\n",
    "# estimates plus *_ci_lower / *_ci_upper / *_bootstrap_std columns.\n",
    "import pandas as pd\n",
    "\n",
    "# NOTE(review): this looks like a placeholder path - point it at the real\n",
    "# results file before running (confirm expected location with the project).\n",
    "simple_metrics_path = \"path/to/results/simple_metrics_with_ci.csv\"\n",
    "\n",
    "sm_df = pd.read_csv(simple_metrics_path)\n",
    "\n",
    "# Bare last expression so the notebook renders the frame as the cell output.\n",
    "sm_df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "90427f28",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>model</th>\n",
       "      <th>prompt_index</th>\n",
       "      <th>refusal_index</th>\n",
       "      <th>ci_lower</th>\n",
       "      <th>ci_upper</th>\n",
       "      <th>ci_width</th>\n",
       "      <th>bootstrap_std</th>\n",
       "      <th>refusal_rate</th>\n",
       "      <th>mu</th>\n",
       "      <th>mu_a</th>\n",
       "      <th>mu_r</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gemma3_12b</td>\n",
       "      <td>1</td>\n",
       "      <td>0.207471</td>\n",
       "      <td>-0.153486</td>\n",
       "      <td>0.901116</td>\n",
       "      <td>1.054602</td>\n",
       "      <td>0.227385</td>\n",
       "      <td>0.0310</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.944788</td>\n",
       "      <td>0.951613</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gemma3_12b</td>\n",
       "      <td>2</td>\n",
       "      <td>0.251995</td>\n",
       "      <td>-0.073331</td>\n",
       "      <td>0.192668</td>\n",
       "      <td>0.265999</td>\n",
       "      <td>0.070451</td>\n",
       "      <td>0.1560</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.943720</td>\n",
       "      <td>0.951923</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gemma3_12b</td>\n",
       "      <td>3</td>\n",
       "      <td>0.171961</td>\n",
       "      <td>0.055522</td>\n",
       "      <td>0.270219</td>\n",
       "      <td>0.214697</td>\n",
       "      <td>0.054858</td>\n",
       "      <td>0.7015</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.921273</td>\n",
       "      <td>0.955096</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gemma3_12b</td>\n",
       "      <td>4</td>\n",
       "      <td>0.329575</td>\n",
       "      <td>0.167230</td>\n",
       "      <td>0.414636</td>\n",
       "      <td>0.247406</td>\n",
       "      <td>0.066915</td>\n",
       "      <td>0.9115</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.853107</td>\n",
       "      <td>0.953922</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>mistral-123b</td>\n",
       "      <td>1</td>\n",
       "      <td>0.386068</td>\n",
       "      <td>0.315948</td>\n",
       "      <td>0.452849</td>\n",
       "      <td>0.136901</td>\n",
       "      <td>0.033958</td>\n",
       "      <td>0.2625</td>\n",
       "      <td>0.7495</td>\n",
       "      <td>0.699661</td>\n",
       "      <td>0.889524</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>mistral-123b</td>\n",
       "      <td>2</td>\n",
       "      <td>0.395116</td>\n",
       "      <td>0.331371</td>\n",
       "      <td>0.460671</td>\n",
       "      <td>0.129300</td>\n",
       "      <td>0.038004</td>\n",
       "      <td>0.3325</td>\n",
       "      <td>0.7495</td>\n",
       "      <td>0.684644</td>\n",
       "      <td>0.879699</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>mistral-123b</td>\n",
       "      <td>3</td>\n",
       "      <td>0.366625</td>\n",
       "      <td>0.299547</td>\n",
       "      <td>0.421378</td>\n",
       "      <td>0.121832</td>\n",
       "      <td>0.035892</td>\n",
       "      <td>0.4510</td>\n",
       "      <td>0.7495</td>\n",
       "      <td>0.665756</td>\n",
       "      <td>0.851441</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>mistral-123b</td>\n",
       "      <td>4</td>\n",
       "      <td>0.409233</td>\n",
       "      <td>0.350027</td>\n",
       "      <td>0.475931</td>\n",
       "      <td>0.125904</td>\n",
       "      <td>0.033378</td>\n",
       "      <td>0.6600</td>\n",
       "      <td>0.7495</td>\n",
       "      <td>0.598529</td>\n",
       "      <td>0.827273</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen235b</td>\n",
       "      <td>1</td>\n",
       "      <td>0.361352</td>\n",
       "      <td>0.294596</td>\n",
       "      <td>0.435840</td>\n",
       "      <td>0.141244</td>\n",
       "      <td>0.037152</td>\n",
       "      <td>0.3080</td>\n",
       "      <td>0.4930</td>\n",
       "      <td>0.418353</td>\n",
       "      <td>0.660714</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen235b</td>\n",
       "      <td>2</td>\n",
       "      <td>0.359678</td>\n",
       "      <td>0.290251</td>\n",
       "      <td>0.426455</td>\n",
       "      <td>0.136204</td>\n",
       "      <td>0.035944</td>\n",
       "      <td>0.2325</td>\n",
       "      <td>0.4930</td>\n",
       "      <td>0.435179</td>\n",
       "      <td>0.683871</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen235b</td>\n",
       "      <td>3</td>\n",
       "      <td>0.311762</td>\n",
       "      <td>0.253115</td>\n",
       "      <td>0.398039</td>\n",
       "      <td>0.144925</td>\n",
       "      <td>0.037029</td>\n",
       "      <td>0.2445</td>\n",
       "      <td>0.4930</td>\n",
       "      <td>0.440768</td>\n",
       "      <td>0.654397</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen235b</td>\n",
       "      <td>4</td>\n",
       "      <td>0.296903</td>\n",
       "      <td>0.236356</td>\n",
       "      <td>0.369441</td>\n",
       "      <td>0.133085</td>\n",
       "      <td>0.035599</td>\n",
       "      <td>0.6470</td>\n",
       "      <td>0.4930</td>\n",
       "      <td>0.366856</td>\n",
       "      <td>0.561824</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen25-72b</td>\n",
       "      <td>1</td>\n",
       "      <td>0.517655</td>\n",
       "      <td>0.448471</td>\n",
       "      <td>0.589370</td>\n",
       "      <td>0.140899</td>\n",
       "      <td>0.037748</td>\n",
       "      <td>0.5710</td>\n",
       "      <td>0.8945</td>\n",
       "      <td>0.805361</td>\n",
       "      <td>0.961471</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen25-72b</td>\n",
       "      <td>2</td>\n",
       "      <td>0.492207</td>\n",
       "      <td>0.414341</td>\n",
       "      <td>0.567238</td>\n",
       "      <td>0.152897</td>\n",
       "      <td>0.041016</td>\n",
       "      <td>0.6690</td>\n",
       "      <td>0.8945</td>\n",
       "      <td>0.783988</td>\n",
       "      <td>0.949178</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen25-72b</td>\n",
       "      <td>3</td>\n",
       "      <td>0.473703</td>\n",
       "      <td>0.382206</td>\n",
       "      <td>0.551031</td>\n",
       "      <td>0.168825</td>\n",
       "      <td>0.044393</td>\n",
       "      <td>0.8375</td>\n",
       "      <td>0.8945</td>\n",
       "      <td>0.723077</td>\n",
       "      <td>0.927761</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen25-72b</td>\n",
       "      <td>4</td>\n",
       "      <td>0.481125</td>\n",
       "      <td>0.393016</td>\n",
       "      <td>0.558190</td>\n",
       "      <td>0.165174</td>\n",
       "      <td>0.043507</td>\n",
       "      <td>0.8890</td>\n",
       "      <td>0.8945</td>\n",
       "      <td>0.684685</td>\n",
       "      <td>0.920697</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b</td>\n",
       "      <td>1</td>\n",
       "      <td>0.291442</td>\n",
       "      <td>0.203971</td>\n",
       "      <td>0.403346</td>\n",
       "      <td>0.199375</td>\n",
       "      <td>0.049537</td>\n",
       "      <td>0.4390</td>\n",
       "      <td>0.9435</td>\n",
       "      <td>0.921569</td>\n",
       "      <td>0.971526</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b</td>\n",
       "      <td>2</td>\n",
       "      <td>0.296473</td>\n",
       "      <td>0.181612</td>\n",
       "      <td>0.389921</td>\n",
       "      <td>0.208309</td>\n",
       "      <td>0.055054</td>\n",
       "      <td>0.6020</td>\n",
       "      <td>0.9435</td>\n",
       "      <td>0.909548</td>\n",
       "      <td>0.965947</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b</td>\n",
       "      <td>3</td>\n",
       "      <td>0.322171</td>\n",
       "      <td>0.219749</td>\n",
       "      <td>0.414285</td>\n",
       "      <td>0.194536</td>\n",
       "      <td>0.054425</td>\n",
       "      <td>0.7515</td>\n",
       "      <td>0.9435</td>\n",
       "      <td>0.889336</td>\n",
       "      <td>0.961411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b</td>\n",
       "      <td>4</td>\n",
       "      <td>0.447783</td>\n",
       "      <td>0.317144</td>\n",
       "      <td>0.559869</td>\n",
       "      <td>0.242724</td>\n",
       "      <td>0.066591</td>\n",
       "      <td>0.9420</td>\n",
       "      <td>0.9435</td>\n",
       "      <td>0.775862</td>\n",
       "      <td>0.953822</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b_think</td>\n",
       "      <td>1</td>\n",
       "      <td>0.297557</td>\n",
       "      <td>0.186169</td>\n",
       "      <td>0.397690</td>\n",
       "      <td>0.211521</td>\n",
       "      <td>0.057326</td>\n",
       "      <td>0.5160</td>\n",
       "      <td>0.9340</td>\n",
       "      <td>0.902893</td>\n",
       "      <td>0.963178</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b_think</td>\n",
       "      <td>2</td>\n",
       "      <td>0.342114</td>\n",
       "      <td>0.242874</td>\n",
       "      <td>0.425747</td>\n",
       "      <td>0.182873</td>\n",
       "      <td>0.048472</td>\n",
       "      <td>0.6920</td>\n",
       "      <td>0.9340</td>\n",
       "      <td>0.878247</td>\n",
       "      <td>0.958815</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b_think</td>\n",
       "      <td>3</td>\n",
       "      <td>0.364501</td>\n",
       "      <td>0.243550</td>\n",
       "      <td>0.443941</td>\n",
       "      <td>0.200391</td>\n",
       "      <td>0.047240</td>\n",
       "      <td>0.7435</td>\n",
       "      <td>0.9340</td>\n",
       "      <td>0.865497</td>\n",
       "      <td>0.957633</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>qwen32b_think</td>\n",
       "      <td>4</td>\n",
       "      <td>0.345511</td>\n",
       "      <td>0.240086</td>\n",
       "      <td>0.461165</td>\n",
       "      <td>0.221078</td>\n",
       "      <td>0.058596</td>\n",
       "      <td>0.9050</td>\n",
       "      <td>0.9340</td>\n",
       "      <td>0.826316</td>\n",
       "      <td>0.945304</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>llama_70b</td>\n",
       "      <td>1</td>\n",
       "      <td>0.375121</td>\n",
       "      <td>0.282770</td>\n",
       "      <td>0.462760</td>\n",
       "      <td>0.179990</td>\n",
       "      <td>0.047394</td>\n",
       "      <td>0.8425</td>\n",
       "      <td>0.9140</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.935312</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gpt-4.1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.281184</td>\n",
       "      <td>0.198417</td>\n",
       "      <td>0.355331</td>\n",
       "      <td>0.156914</td>\n",
       "      <td>0.047147</td>\n",
       "      <td>0.0555</td>\n",
       "      <td>0.5755</td>\n",
       "      <td>0.563261</td>\n",
       "      <td>0.783784</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gpt-4.1-mini</td>\n",
       "      <td>1</td>\n",
       "      <td>0.266808</td>\n",
       "      <td>0.193033</td>\n",
       "      <td>0.321866</td>\n",
       "      <td>0.128833</td>\n",
       "      <td>0.038325</td>\n",
       "      <td>0.3075</td>\n",
       "      <td>0.8445</td>\n",
       "      <td>0.814440</td>\n",
       "      <td>0.912195</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>claude-sonnet-4</td>\n",
       "      <td>1</td>\n",
       "      <td>0.524014</td>\n",
       "      <td>0.448076</td>\n",
       "      <td>0.588094</td>\n",
       "      <td>0.140018</td>\n",
       "      <td>0.039918</td>\n",
       "      <td>0.8490</td>\n",
       "      <td>0.7365</td>\n",
       "      <td>0.420530</td>\n",
       "      <td>0.792697</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>claude-3.5-haiku</td>\n",
       "      <td>1</td>\n",
       "      <td>0.518576</td>\n",
       "      <td>0.394558</td>\n",
       "      <td>0.605176</td>\n",
       "      <td>0.210618</td>\n",
       "      <td>0.057619</td>\n",
       "      <td>0.9335</td>\n",
       "      <td>0.9175</td>\n",
       "      <td>0.669173</td>\n",
       "      <td>0.935190</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gemini-2.5-flash</td>\n",
       "      <td>1</td>\n",
       "      <td>0.297826</td>\n",
       "      <td>0.231624</td>\n",
       "      <td>0.363722</td>\n",
       "      <td>0.132097</td>\n",
       "      <td>0.033126</td>\n",
       "      <td>0.4170</td>\n",
       "      <td>0.7215</td>\n",
       "      <td>0.655232</td>\n",
       "      <td>0.814149</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>gemini-2.5-flash-lite</td>\n",
       "      <td>1</td>\n",
       "      <td>0.115504</td>\n",
       "      <td>0.035928</td>\n",
       "      <td>0.196554</td>\n",
       "      <td>0.160626</td>\n",
       "      <td>0.045231</td>\n",
       "      <td>0.4060</td>\n",
       "      <td>0.8840</td>\n",
       "      <td>0.869529</td>\n",
       "      <td>0.905172</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>deepseek-chat-v3-0324</td>\n",
       "      <td>1</td>\n",
       "      <td>0.424681</td>\n",
       "      <td>0.353335</td>\n",
       "      <td>0.485170</td>\n",
       "      <td>0.131835</td>\n",
       "      <td>0.033341</td>\n",
       "      <td>0.5050</td>\n",
       "      <td>0.7630</td>\n",
       "      <td>0.655556</td>\n",
       "      <td>0.868317</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>glm-4.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0.298168</td>\n",
       "      <td>0.238147</td>\n",
       "      <td>0.385832</td>\n",
       "      <td>0.147685</td>\n",
       "      <td>0.038119</td>\n",
       "      <td>0.7895</td>\n",
       "      <td>0.8185</td>\n",
       "      <td>0.698337</td>\n",
       "      <td>0.850538</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>simpleqa</td>\n",
       "      <td>glm-4.5-air</td>\n",
       "      <td>1</td>\n",
       "      <td>0.146343</td>\n",
       "      <td>0.053940</td>\n",
       "      <td>0.216523</td>\n",
       "      <td>0.162583</td>\n",
       "      <td>0.044307</td>\n",
       "      <td>0.7070</td>\n",
       "      <td>0.8505</td>\n",
       "      <td>0.808874</td>\n",
       "      <td>0.867751</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     dataset                  model  prompt_index  refusal_index  ci_lower  \\\n",
       "0   simpleqa             gemma3_12b             1       0.207471 -0.153486   \n",
       "1   simpleqa             gemma3_12b             2       0.251995 -0.073331   \n",
       "2   simpleqa             gemma3_12b             3       0.171961  0.055522   \n",
       "3   simpleqa             gemma3_12b             4       0.329575  0.167230   \n",
       "4   simpleqa           mistral-123b             1       0.386068  0.315948   \n",
       "5   simpleqa           mistral-123b             2       0.395116  0.331371   \n",
       "6   simpleqa           mistral-123b             3       0.366625  0.299547   \n",
       "7   simpleqa           mistral-123b             4       0.409233  0.350027   \n",
       "8   simpleqa               qwen235b             1       0.361352  0.294596   \n",
       "9   simpleqa               qwen235b             2       0.359678  0.290251   \n",
       "10  simpleqa               qwen235b             3       0.311762  0.253115   \n",
       "11  simpleqa               qwen235b             4       0.296903  0.236356   \n",
       "12  simpleqa             qwen25-72b             1       0.517655  0.448471   \n",
       "13  simpleqa             qwen25-72b             2       0.492207  0.414341   \n",
       "14  simpleqa             qwen25-72b             3       0.473703  0.382206   \n",
       "15  simpleqa             qwen25-72b             4       0.481125  0.393016   \n",
       "16  simpleqa                qwen32b             1       0.291442  0.203971   \n",
       "17  simpleqa                qwen32b             2       0.296473  0.181612   \n",
       "18  simpleqa                qwen32b             3       0.322171  0.219749   \n",
       "19  simpleqa                qwen32b             4       0.447783  0.317144   \n",
       "20  simpleqa          qwen32b_think             1       0.297557  0.186169   \n",
       "21  simpleqa          qwen32b_think             2       0.342114  0.242874   \n",
       "22  simpleqa          qwen32b_think             3       0.364501  0.243550   \n",
       "23  simpleqa          qwen32b_think             4       0.345511  0.240086   \n",
       "24  simpleqa              llama_70b             1       0.375121  0.282770   \n",
       "25  simpleqa                gpt-4.1             1       0.281184  0.198417   \n",
       "26  simpleqa           gpt-4.1-mini             1       0.266808  0.193033   \n",
       "27  simpleqa        claude-sonnet-4             1       0.524014  0.448076   \n",
       "28  simpleqa       claude-3.5-haiku             1       0.518576  0.394558   \n",
       "29  simpleqa       gemini-2.5-flash             1       0.297826  0.231624   \n",
       "30  simpleqa  gemini-2.5-flash-lite             1       0.115504  0.035928   \n",
       "31  simpleqa  deepseek-chat-v3-0324             1       0.424681  0.353335   \n",
       "32  simpleqa                glm-4.5             1       0.298168  0.238147   \n",
       "33  simpleqa            glm-4.5-air             1       0.146343  0.053940   \n",
       "\n",
       "    ci_upper  ci_width  bootstrap_std  refusal_rate      mu      mu_a  \\\n",
       "0   0.901116  1.054602       0.227385        0.0310  0.9450  0.944788   \n",
       "1   0.192668  0.265999       0.070451        0.1560  0.9450  0.943720   \n",
       "2   0.270219  0.214697       0.054858        0.7015  0.9450  0.921273   \n",
       "3   0.414636  0.247406       0.066915        0.9115  0.9450  0.853107   \n",
       "4   0.452849  0.136901       0.033958        0.2625  0.7495  0.699661   \n",
       "5   0.460671  0.129300       0.038004        0.3325  0.7495  0.684644   \n",
       "6   0.421378  0.121832       0.035892        0.4510  0.7495  0.665756   \n",
       "7   0.475931  0.125904       0.033378        0.6600  0.7495  0.598529   \n",
       "8   0.435840  0.141244       0.037152        0.3080  0.4930  0.418353   \n",
       "9   0.426455  0.136204       0.035944        0.2325  0.4930  0.435179   \n",
       "10  0.398039  0.144925       0.037029        0.2445  0.4930  0.440768   \n",
       "11  0.369441  0.133085       0.035599        0.6470  0.4930  0.366856   \n",
       "12  0.589370  0.140899       0.037748        0.5710  0.8945  0.805361   \n",
       "13  0.567238  0.152897       0.041016        0.6690  0.8945  0.783988   \n",
       "14  0.551031  0.168825       0.044393        0.8375  0.8945  0.723077   \n",
       "15  0.558190  0.165174       0.043507        0.8890  0.8945  0.684685   \n",
       "16  0.403346  0.199375       0.049537        0.4390  0.9435  0.921569   \n",
       "17  0.389921  0.208309       0.055054        0.6020  0.9435  0.909548   \n",
       "18  0.414285  0.194536       0.054425        0.7515  0.9435  0.889336   \n",
       "19  0.559869  0.242724       0.066591        0.9420  0.9435  0.775862   \n",
       "20  0.397690  0.211521       0.057326        0.5160  0.9340  0.902893   \n",
       "21  0.425747  0.182873       0.048472        0.6920  0.9340  0.878247   \n",
       "22  0.443941  0.200391       0.047240        0.7435  0.9340  0.865497   \n",
       "23  0.461165  0.221078       0.058596        0.9050  0.9340  0.826316   \n",
       "24  0.462760  0.179990       0.047394        0.8425  0.9140  0.800000   \n",
       "25  0.355331  0.156914       0.047147        0.0555  0.5755  0.563261   \n",
       "26  0.321866  0.128833       0.038325        0.3075  0.8445  0.814440   \n",
       "27  0.588094  0.140018       0.039918        0.8490  0.7365  0.420530   \n",
       "28  0.605176  0.210618       0.057619        0.9335  0.9175  0.669173   \n",
       "29  0.363722  0.132097       0.033126        0.4170  0.7215  0.655232   \n",
       "30  0.196554  0.160626       0.045231        0.4060  0.8840  0.869529   \n",
       "31  0.485170  0.131835       0.033341        0.5050  0.7630  0.655556   \n",
       "32  0.385832  0.147685       0.038119        0.7895  0.8185  0.698337   \n",
       "33  0.216523  0.162583       0.044307        0.7070  0.8505  0.808874   \n",
       "\n",
       "        mu_r  \n",
       "0   0.951613  \n",
       "1   0.951923  \n",
       "2   0.955096  \n",
       "3   0.953922  \n",
       "4   0.889524  \n",
       "5   0.879699  \n",
       "6   0.851441  \n",
       "7   0.827273  \n",
       "8   0.660714  \n",
       "9   0.683871  \n",
       "10  0.654397  \n",
       "11  0.561824  \n",
       "12  0.961471  \n",
       "13  0.949178  \n",
       "14  0.927761  \n",
       "15  0.920697  \n",
       "16  0.971526  \n",
       "17  0.965947  \n",
       "18  0.961411  \n",
       "19  0.953822  \n",
       "20  0.963178  \n",
       "21  0.958815  \n",
       "22  0.957633  \n",
       "23  0.945304  \n",
       "24  0.935312  \n",
       "25  0.783784  \n",
       "26  0.912195  \n",
       "27  0.792697  \n",
       "28  0.935190  \n",
       "29  0.814149  \n",
       "30  0.905172  \n",
       "31  0.868317  \n",
       "32  0.850538  \n",
       "33  0.867751  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ri_path = \"path/to/results/refusal_index_with_ci.csv\"\n",
    "\n",
    "ri_df = pd.read_csv(ri_path)\n",
    "\n",
    "ri_df\n",
    "\n",
    "# remove all rows where model is gemma3_12b\n",
    "# ri_df = ri_df[ri_df['model'] != 'gemma3_12b']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09c2fd27",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "01b3aefd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Merged (per-prompt) DataFrame columns:\n",
      "['c_avg', 'c_ci_lower_avg', 'c_ci_upper_avg', 'c_sem_avg', 'correct_attempted_avg', 'correct_attempted_ci_lower_avg', 'correct_attempted_ci_upper_avg', 'correct_attempted_sem_avg', 'dataset', 'fscore_avg', 'fscore_ci_lower_avg', 'fscore_ci_upper_avg', 'fscore_sem_avg', 'model', 'n_prompts', 'r_avg', 'r_ci_lower_avg', 'r_ci_upper_avg', 'r_sem_avg', 'ri_refusal_index_avg', 'ri_refusal_index_ci_lower_avg', 'ri_refusal_index_ci_upper_avg', 'ri_refusal_index_sem_avg', 'weighted_avg', 'weighted_ci_lower_avg', 'weighted_ci_upper_avg', 'weighted_sem_avg']\n",
      "\n",
      "Averaged (per-model, >1 prompt) head:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3630194/1077122074.py:130: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
      "  .apply(aggregate_group)\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# Head of sm_df\n",
    "# > sm_df.head()\n",
    "#     dataset         model  prompt_index     n       c  c_ci_lower  c_ci_upper  \\\n",
    "# 4  simpleqa  mistral-123b             1  2000  0.2155    0.197000    0.232500   \n",
    "# 5  simpleqa  mistral-123b             2  2000  0.2055    0.187488    0.223500   \n",
    "# 6  simpleqa  mistral-123b             3  2000  0.1790    0.163488    0.196500   \n",
    "# 7  simpleqa  mistral-123b             4  2000  0.1410    0.126000    0.157012   \n",
    "# 8  simpleqa      qwen235b             1  2000  0.4075    0.387000    0.427512   \n",
    "\n",
    "#    c_bootstrap_std       r  r_ci_lower  ...  correct_attempted_ci_upper  \\\n",
    "# 4         0.008966  0.2625      0.2450  ...                    0.313758   \n",
    "# 5         0.008807  0.3325      0.3125  ...                    0.331589   \n",
    "# 6         0.008589  0.4510      0.4300  ...                    0.355438   \n",
    "# 7         0.007826  0.6600      0.6395  ...                    0.452665   \n",
    "# 8         0.010539  0.3080      0.2895  ...                    0.615949   \n",
    "\n",
    "#    correct_attempted_bootstrap_std    fscore  fscore_ci_lower  \\\n",
    "# 4                         0.011619  0.248058         0.227782   \n",
    "# 5                         0.012192  0.246477         0.225097   \n",
    "# 6                         0.014357  0.231117         0.211609   \n",
    "# 7                         0.018808  0.210448         0.190189   \n",
    "# 8                         0.013476  0.481678         0.458765   \n",
    "\n",
    "#    fscore_ci_upper  fscore_bootstrap_std  weighted  weighted_ci_lower  \\\n",
    "# 4         0.266709              0.010003   -0.5220          -0.542500   \n",
    "# 5         0.266554              0.010067   -0.4620          -0.484012   \n",
    "# 6         0.252322              0.010559   -0.3700          -0.390500   \n",
    "# 7         0.232411              0.010802   -0.1990          -0.215500   \n",
    "# 8         0.504127              0.011369   -0.2845          -0.305000   \n",
    "\n",
    "#    weighted_ci_upper  weighted_bootstrap_std  \n",
    "# 4          -0.500000                0.011006  \n",
    "# 5          -0.440000                0.010908  \n",
    "# 6          -0.348487                0.010702  \n",
    "# 7          -0.181487                0.008861  \n",
    "# 8          -0.264500                0.010611  \n",
    "\n",
    "# [5 rows x 24 columns]\n",
    "\n",
    "# Head of ri_df\n",
    "# > ri_df.head()\n",
    "#     dataset         model  prompt_index  refusal_index  ci_lower  ci_upper  \\\n",
    "# 4  simpleqa  mistral-123b             1       0.386068  0.315948  0.452849   \n",
    "# 5  simpleqa  mistral-123b             2       0.395116  0.331371  0.460671   \n",
    "# 6  simpleqa  mistral-123b             3       0.366625  0.299547  0.421378   \n",
    "# 7  simpleqa  mistral-123b             4       0.409233  0.350027  0.475931   \n",
    "# 8  simpleqa      qwen235b             1       0.361352  0.294596  0.435840   \n",
    "\n",
    "#    ci_width  bootstrap_std  refusal_rate      mu      mu_a      mu_r  \n",
    "# 4  0.136901       0.033958        0.2625  0.7495  0.699661  0.889524  \n",
    "# 5  0.129300       0.038004        0.3325  0.7495  0.684644  0.879699  \n",
    "# 6  0.121832       0.035892        0.4510  0.7495  0.665756  0.851441  \n",
    "# 7  0.125904       0.033378        0.6600  0.7495  0.598529  0.827273  \n",
    "# 8  0.141244       0.037152        0.3080  0.4930  0.418353  0.660714  \n",
    "\n",
    "# TODO: create a merged_df that merges ri_df colums into the sm_df columns as ri_xx. And then average all models with more than one prompt_index. You need to come up with a sensible way to compute the ci for the averaged values.\n",
    "\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "# 1) Merge RI into SM with ri_ prefix\n",
    "merge_keys = ['dataset', 'model', 'prompt_index']\n",
    "ri_cols_to_prefix = [c for c in ri_df.columns if c not in merge_keys]\n",
    "ri_df_prefixed = ri_df.rename(columns={c: f\"ri_{c}\" for c in ri_cols_to_prefix})\n",
    "merged_df = pd.merge(sm_df, ri_df_prefixed, on=merge_keys, how='left')\n",
    "\n",
    "# 2) Compute per-model averages across prompts with sensible CIs\n",
    "# We'll combine per-prompt standard errors in quadrature and scale by k (count of prompts):\n",
    "#   sem_mean = sqrt(sum(sem_i^2)) / k,  CI = mean ± 1.96 * sem_mean\n",
    "# If per-prompt bootstrap stds are missing, estimate sem_i from (ci_upper-ci_lower)/(2*1.96).\n",
    "# As a last resort, use sample std across prompts / sqrt(k).\n",
    "\n",
    "sm_metric_specs = [\n",
    "    ('c', 'c_bootstrap_std', 'c_ci_lower', 'c_ci_upper'),\n",
    "    ('r', 'r_bootstrap_std', 'r_ci_lower', 'r_ci_upper'),\n",
    "    ('correct_attempted', 'correct_attempted_bootstrap_std', 'correct_attempted_ci_lower', 'correct_attempted_ci_upper'),\n",
    "    ('fscore', 'fscore_bootstrap_std', 'fscore_ci_lower', 'fscore_ci_upper'),\n",
    "    ('weighted', 'weighted_bootstrap_std', 'weighted_ci_lower', 'weighted_ci_upper'),\n",
    "]\n",
    "ri_metric_specs = [\n",
    "    ('ri_refusal_index', 'ri_bootstrap_std', 'ri_ci_lower', 'ri_ci_upper')\n",
    "]\n",
    "\n",
    "# Only keep metrics that exist in the merged_df\n",
    "metric_specs = [spec for spec in (sm_metric_specs + ri_metric_specs) if spec[0] in merged_df.columns]\n",
    "Z = 1.96\n",
    "\n",
    "\n",
    "def aggregate_group(group: pd.DataFrame) -> pd.Series:\n",
    "    record = {\n",
    "        'dataset': group.name[0],\n",
    "        'model': group.name[1],\n",
    "        'n_prompts': int(group['prompt_index'].nunique()),\n",
    "    }\n",
    "    for val_col, std_col, lower_col, upper_col in metric_specs:\n",
    "        values = group[val_col].astype(float).to_numpy()\n",
    "        k = values.size\n",
    "        if k == 0 or np.all(np.isnan(values)):\n",
    "            continue\n",
    "        mean_val = np.nanmean(values)\n",
    "\n",
    "        sems = None\n",
    "        if std_col in group.columns:\n",
    "            sems = group[std_col].astype(float).to_numpy()\n",
    "            if np.all(np.isnan(sems)):\n",
    "                sems = None\n",
    "        if sems is None and lower_col in group.columns and upper_col in group.columns:\n",
    "            half_widths = (group[upper_col].astype(float) - group[lower_col].astype(float)) / 2.0\n",
    "            sems = (half_widths / Z).to_numpy()\n",
    "            if np.all(np.isnan(sems)):\n",
    "                sems = None\n",
    "\n",
    "        if sems is None:\n",
    "            # Fallback: variability across prompts (conservative if prompts differ)\n",
    "            sem = np.nan if k == 1 else np.nanstd(values, ddof=1) / np.sqrt(k)\n",
    "        else:\n",
    "            sem = np.sqrt(np.nansum(np.square(sems))) / k\n",
    "\n",
    "        half_width = Z * sem if not np.isnan(sem) else np.nan\n",
    "        record[f'{val_col}_avg'] = mean_val\n",
    "        record[f'{val_col}_ci_lower_avg'] = mean_val - half_width if not np.isnan(half_width) else np.nan\n",
    "        record[f'{val_col}_ci_upper_avg'] = mean_val + half_width if not np.isnan(half_width) else np.nan\n",
    "        record[f'{val_col}_sem_avg'] = sem\n",
    "\n",
    "    return pd.Series(record)\n",
    "\n",
    "averaged_df = (\n",
    "    merged_df\n",
    "    .groupby(['dataset', 'model'], as_index=False)\n",
    "    .apply(aggregate_group)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "\n",
    "print('Merged (per-prompt) DataFrame columns:')\n",
    "print(sorted(averaged_df.columns))\n",
    "print('\\nAveraged (per-model, >1 prompt) head:')\n",
    "# save merged_df to csv\n",
    "averaged_df.round(2).to_json    ('path/to/results/averaged_df.json', orient='records')\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "22b4475b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Models with all 4 prompt indices:\n",
      "- gemma3_12b\n",
      "- mistral-123b\n",
      "- qwen235b\n",
      "- qwen25-72b\n",
      "- qwen32b\n",
      "- qwen32b_think\n",
      "- deepseek-chat-v3-0324\n",
      "\n",
      "Total: 7 models\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "# First, let's identify models that have all 4 prompt_index values\n",
    "models_with_4_prompts = []\n",
    "for model in sm_df['model'].unique():\n",
    "    model_data = sm_df[sm_df['model'] == model]\n",
    "    prompt_indices = sorted(model_data['prompt_index'].unique())\n",
    "    if prompt_indices == [1, 2, 3, 4]:\n",
    "        models_with_4_prompts.append(model)\n",
    "\n",
    "print(\"Models with all 4 prompt indices:\")\n",
    "for model in models_with_4_prompts:\n",
    "    print(f\"- {model}\")\n",
    "print(f\"\\nTotal: {len(models_with_4_prompts)} models\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "c046cfae",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Simple Metrics Stability Analysis:\n",
      "==================================================\n",
      "\n",
      "C:\n",
      "  gemma3_12b           | Norm Diff:  -1.1163 | CV:   0.4652\n",
      "  mistral-123b         | Norm Diff:  -0.4022 | CV:   0.1556\n",
      "  qwen235b             | Norm Diff:  -0.4660 | CV:   0.2217\n",
      "  qwen25-72b           | Norm Diff:  -0.8377 | CV:   0.3502\n",
      "  qwen32b              | Norm Diff:  -1.0081 | CV:   0.3891\n",
      "  qwen32b_think        | Norm Diff:  -0.9568 | CV:   0.3545\n",
      "  deepseek-chat-v3-0324 | Norm Diff:  -0.9906 | CV:   0.3646\n",
      "\n",
      "R:\n",
      "  gemma3_12b           | Norm Diff:   1.9567 | CV:   0.8152\n",
      "  mistral-123b         | Norm Diff:   0.9320 | CV:   0.3534\n",
      "  qwen235b             | Norm Diff:   0.9469 | CV:   0.4729\n",
      "  qwen25-72b           | Norm Diff:   0.4288 | CV:   0.1723\n",
      "  qwen32b              | Norm Diff:   0.7358 | CV:   0.2716\n",
      "  qwen32b_think        | Norm Diff:   0.5447 | CV:   0.1943\n",
      "  deepseek-chat-v3-0324 | Norm Diff:   0.9445 | CV:   0.3484\n",
      "\n",
      "CORRECT_ATTEMPTED:\n",
      "  gemma3_12b           | Norm Diff:   1.1263 | CV:   0.4520\n",
      "  mistral-123b         | Norm Diff:   0.3655 | CV:   0.1415\n",
      "  qwen235b             | Norm Diff:   0.1173 | CV:   0.0641\n",
      "  qwen25-72b           | Norm Diff:   0.4962 | CV:   0.1885\n",
      "  qwen32b              | Norm Diff:   1.0457 | CV:   0.3982\n",
      "  qwen32b_think        | Norm Diff:   0.4845 | CV:   0.1717\n",
      "  deepseek-chat-v3-0324 | Norm Diff:   0.4053 | CV:   0.1505\n",
      "\n",
      "FSCORE:\n",
      "  gemma3_12b           | Norm Diff:  -0.6737 | CV:   0.2778\n",
      "  mistral-123b         | Norm Diff:  -0.1607 | CV:   0.0647\n",
      "  qwen235b             | Norm Diff:  -0.3060 | CV:   0.1379\n",
      "  qwen25-72b           | Norm Diff:  -0.5997 | CV:   0.2564\n",
      "  qwen32b              | Norm Diff:  -0.7201 | CV:   0.2978\n",
      "  qwen32b_think        | Norm Diff:  -0.7089 | CV:   0.2781\n",
      "  deepseek-chat-v3-0324 | Norm Diff:  -0.6515 | CV:   0.2468\n",
      "\n",
      "WEIGHTED:\n",
      "  gemma3_12b           | Norm Diff:  -1.6365 | CV:   0.6818\n",
      "  mistral-123b         | Norm Diff:  -0.8319 | CV:   0.3141\n",
      "  qwen235b             | Norm Diff:  -0.6154 | CV:   0.3241\n",
      "  qwen25-72b           | Norm Diff:  -1.3199 | CV:   0.5275\n",
      "  qwen32b              | Norm Diff:  -1.6525 | CV:   0.6094\n",
      "  qwen32b_think        | Norm Diff:  -1.4220 | CV:   0.5071\n",
      "  deepseek-chat-v3-0324 | Norm Diff:  -1.4969 | CV:   0.5548\n"
     ]
    }
   ],
   "source": [
    "# Define the metrics to analyze from simple_metrics\n",
    "sm_metrics = ['c', 'r', 'correct_attempted', 'fscore', 'weighted']\n",
    "\n",
    "# Calculate stability metrics for each model and metric\n",
    "stability_results = []\n",
    "\n",
    "for model in models_with_4_prompts:\n",
    "    model_data = sm_df[sm_df['model'] == model].sort_values('prompt_index')\n",
    "    \n",
    "    for metric in sm_metrics:\n",
    "        scores = model_data[metric].values\n",
    "        \n",
    "        # Calculate (score[4] - score[1]) / avg(score)\n",
    "        # prompt_index 4 is at index 3, prompt_index 1 is at index 0\n",
    "        score_diff_normalized = (scores[3] - scores[0]) / np.mean(scores)\n",
    "        \n",
    "        # Calculate coefficient of variance (CV = std/mean)\n",
    "        cv = np.std(scores) / np.abs(np.mean(scores))\n",
    "        \n",
    "        stability_results.append({\n",
    "            'model': model,\n",
    "            'metric': metric,\n",
    "            'score_diff_normalized': score_diff_normalized,\n",
    "            'cv': cv,\n",
    "            'scores': scores.tolist()\n",
    "        })\n",
    "\n",
    "# Convert to DataFrame for easier viewing\n",
    "stability_df = pd.DataFrame(stability_results)\n",
    "print(\"Simple Metrics Stability Analysis:\")\n",
    "print(\"=\"*50)\n",
    "for metric in sm_metrics:\n",
    "    print(f\"\\n{metric.upper()}:\")\n",
    "    metric_data = stability_df[stability_df['metric'] == metric]\n",
    "    for _, row in metric_data.iterrows():\n",
    "        print(f\"  {row['model']:20} | Norm Diff: {row['score_diff_normalized']:8.4f} | CV: {row['cv']:8.4f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "cba110cd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "============================================================\n",
      "REFUSAL INDEX Stability Analysis:\n",
      "============================================================\n",
      "Models with all 4 prompt indices in RI data: 6\n",
      "\n",
      "REFUSAL_INDEX:\n",
      "  gemma3_12b           | Norm Diff:   0.5082 | CV:   0.2450\n",
      "  mistral-123b         | Norm Diff:   0.0595 | CV:   0.0397\n",
      "  qwen235b             | Norm Diff:  -0.1939 | CV:   0.0860\n",
      "  qwen25-72b           | Norm Diff:  -0.0744 | CV:   0.0339\n",
      "  qwen32b              | Norm Diff:   0.4605 | CV:   0.1874\n",
      "  qwen32b_think        | Norm Diff:   0.1421 | CV:   0.0727\n"
     ]
    }
   ],
   "source": [
    "# Now analyze refusal index stability\n",
    "print(\"\\n\" + \"=\"*60)\n",
    "print(\"REFUSAL INDEX Stability Analysis:\")\n",
    "print(\"=\"*60)\n",
    "\n",
    "# Check which models have all 4 prompt_index in refusal index data\n",
    "ri_models_with_4_prompts = []\n",
    "for model in ri_df['model'].unique():\n",
    "    model_data = ri_df[ri_df['model'] == model]\n",
    "    prompt_indices = sorted(model_data['prompt_index'].unique())\n",
    "    if prompt_indices == [1, 2, 3, 4]:\n",
    "        ri_models_with_4_prompts.append(model)\n",
    "\n",
    "print(f\"Models with all 4 prompt indices in RI data: {len(ri_models_with_4_prompts)}\")\n",
    "\n",
    "# Calculate stability for refusal index\n",
    "ri_stability_results = []\n",
    "metric = 'refusal_index'\n",
    "\n",
    "for model in ri_models_with_4_prompts:\n",
    "    model_data = ri_df[ri_df['model'] == model].sort_values('prompt_index')\n",
    "    scores = model_data[metric].values\n",
    "    \n",
    "    # Calculate (score[4] - score[1]) / avg(score)\n",
    "    score_diff_normalized = (scores[3] - scores[0]) / np.mean(scores)\n",
    "    \n",
    "    # Calculate coefficient of variance (CV = std/mean)\n",
    "    cv = np.std(scores) / np.abs(np.mean(scores))\n",
    "    \n",
    "    ri_stability_results.append({\n",
    "        'model': model,\n",
    "        'metric': metric,\n",
    "        'score_diff_normalized': score_diff_normalized,\n",
    "        'cv': cv,\n",
    "        'scores': scores.tolist()\n",
    "    })\n",
    "\n",
    "print(f\"\\n{metric.upper()}:\")\n",
    "for result in ri_stability_results:\n",
    "    print(f\"  {result['model']:20} | Norm Diff: {result['score_diff_normalized']:8.4f} | CV: {result['cv']:8.4f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "cd914549",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "============================================================\n",
      "MODEL-AGGREGATED STABILITY STATISTICS:\n",
      "============================================================\n",
      "Processing metric: c\n",
      "Processing metric: r\n",
      "Processing metric: correct_attempted\n",
      "Processing metric: fscore\n",
      "Processing metric: weighted\n",
      "Processing metric: refusal_index\n",
      "Metric             | Avg Norm Diff | Std Norm Diff | Avg CV   | Std CV   | N Models\n",
      "--------------------------------------------------------------------------------\n",
      "c                  |     -0.8254 |      0.2594 |  0.3287 |  0.0972 |       7\n",
      "r                  |      0.9271 |      0.4617 |  0.3754 |  0.2032 |       7\n",
      "correct_attempted  |      0.5772 |      0.3427 |  0.2238 |  0.1332 |       7\n",
      "fscore             |     -0.5458 |      0.2047 |  0.2228 |  0.0807 |       7\n",
      "weighted           |     -1.2822 |      0.3736 |  0.5027 |  0.1278 |       7\n",
      "refusal_index      |      0.1504 |      0.2587 |  0.1108 |  0.0784 |       6\n",
      "\n",
      "Aggregated Statistics DataFrame:\n",
      "              metric  avg_norm_diff  std_norm_diff    avg_cv    std_cv  \\\n",
      "0                  c      -0.825391       0.259391  0.328694  0.097184   \n",
      "1                  r       0.927055       0.461652  0.375432  0.203165   \n",
      "2  correct_attempted       0.577248       0.342680  0.223796  0.133159   \n",
      "3             fscore      -0.545797       0.204671  0.222776  0.080740   \n",
      "4           weighted      -1.282151       0.373608  0.502669  0.127789   \n",
      "5      refusal_index       0.150360       0.258720  0.110782  0.078420   \n",
      "\n",
      "   n_models  \n",
      "0         7  \n",
      "1         7  \n",
      "2         7  \n",
      "3         7  \n",
      "4         7  \n",
      "5         6  \n"
     ]
    }
   ],
   "source": [
    "# Calculate model-aggregated statistics\n",
    "print(\"\\n\" + \"=\"*60)\n",
    "print(\"MODEL-AGGREGATED STABILITY STATISTICS:\")\n",
    "print(\"=\"*60)\n",
    "\n",
    "# Combine all results\n",
    "all_results = stability_results + ri_stability_results\n",
    "\n",
    "# Create aggregated statistics\n",
    "aggregated_stats = []\n",
    "all_metrics = sm_metrics + ['refusal_index']\n",
    "\n",
    "for metric in all_metrics:\n",
    "    print(f\"Processing metric: {metric}\")\n",
    "    metric_results = [r for r in all_results if r['metric'] == metric]\n",
    "    \n",
    "    if metric_results:\n",
    "        norm_diffs = [r['score_diff_normalized'] for r in metric_results]\n",
    "        cvs = [r['cv'] for r in metric_results]\n",
    "\n",
    "        # print(f\"Norm diffs: {norm_diffs}\")\n",
    "        # print(f\"CVs: {cvs}\")\n",
    "\n",
    "        avg_norm_diff = np.mean(norm_diffs)\n",
    "        avg_cv = np.mean(cvs)\n",
    "        std_norm_diff = np.std(norm_diffs)\n",
    "        std_cv = np.std(cvs)\n",
    "        \n",
    "        aggregated_stats.append({\n",
    "            'metric': metric,\n",
    "            'avg_norm_diff': avg_norm_diff,\n",
    "            'std_norm_diff': std_norm_diff,\n",
    "            'avg_cv': avg_cv,\n",
    "            'std_cv': std_cv,\n",
    "            'n_models': len(metric_results)\n",
    "        })\n",
    "\n",
    "# Display aggregated results\n",
    "print(f\"{'Metric':<18} | {'Avg Norm Diff':<12} | {'Std Norm Diff':<12} | {'Avg CV':<8} | {'Std CV':<8} | {'N Models':<8}\")\n",
    "print(\"-\" * 80)\n",
    "for stat in aggregated_stats:\n",
    "    print(f\"{stat['metric']:<18} | {stat['avg_norm_diff']:>11.4f} | {stat['std_norm_diff']:>11.4f} | {stat['avg_cv']:>7.4f} | {stat['std_cv']:>7.4f} | {stat['n_models']:>7}\")\n",
    "\n",
    "# Create a summary DataFrame for easier manipulation\n",
    "aggregated_df = pd.DataFrame(aggregated_stats)\n",
    "print(f\"\\nAggregated Statistics DataFrame:\")\n",
    "print(aggregated_df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9fbffc7e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "============================================================\n",
      "STABILITY RANKINGS:\n",
      "============================================================\n",
      "\n",
      "C - Stability Ranking (by CV, lower = more stable):\n",
      "   1. mistral-123b         | CV: 0.1556 | Norm Diff: -0.4022\n",
      "   2. qwen235b             | CV: 0.2217 | Norm Diff: -0.4660\n",
      "   3. qwen25-72b           | CV: 0.3502 | Norm Diff: -0.8377\n",
      "   4. qwen32b_think        | CV: 0.3545 | Norm Diff: -0.9568\n",
      "   5. deepseek-chat-v3-0324 | CV: 0.3646 | Norm Diff: -0.9906\n",
      "   6. qwen32b              | CV: 0.3891 | Norm Diff: -1.0081\n",
      "   7. gemma3_12b           | CV: 0.4652 | Norm Diff: -1.1163\n",
      "\n",
      "R - Stability Ranking (by CV, lower = more stable):\n",
      "   1. qwen25-72b           | CV: 0.1723 | Norm Diff:  0.4288\n",
      "   2. qwen32b_think        | CV: 0.1943 | Norm Diff:  0.5447\n",
      "   3. qwen32b              | CV: 0.2716 | Norm Diff:  0.7358\n",
      "   4. deepseek-chat-v3-0324 | CV: 0.3484 | Norm Diff:  0.9445\n",
      "   5. mistral-123b         | CV: 0.3534 | Norm Diff:  0.9320\n",
      "   6. qwen235b             | CV: 0.4729 | Norm Diff:  0.9469\n",
      "   7. gemma3_12b           | CV: 0.8152 | Norm Diff:  1.9567\n",
      "\n",
      "CORRECT_ATTEMPTED - Stability Ranking (by CV, lower = more stable):\n",
      "   1. qwen235b             | CV: 0.0641 | Norm Diff:  0.1173\n",
      "   2. mistral-123b         | CV: 0.1415 | Norm Diff:  0.3655\n",
      "   3. deepseek-chat-v3-0324 | CV: 0.1505 | Norm Diff:  0.4053\n",
      "   4. qwen32b_think        | CV: 0.1717 | Norm Diff:  0.4845\n",
      "   5. qwen25-72b           | CV: 0.1885 | Norm Diff:  0.4962\n",
      "   6. qwen32b              | CV: 0.3982 | Norm Diff:  1.0457\n",
      "   7. gemma3_12b           | CV: 0.4520 | Norm Diff:  1.1263\n",
      "\n",
      "FSCORE - Stability Ranking (by CV, lower = more stable):\n",
      "   1. mistral-123b         | CV: 0.0647 | Norm Diff: -0.1607\n",
      "   2. qwen235b             | CV: 0.1379 | Norm Diff: -0.3060\n",
      "   3. deepseek-chat-v3-0324 | CV: 0.2468 | Norm Diff: -0.6515\n",
      "   4. qwen25-72b           | CV: 0.2564 | Norm Diff: -0.5997\n",
      "   5. gemma3_12b           | CV: 0.2778 | Norm Diff: -0.6737\n",
      "   6. qwen32b_think        | CV: 0.2781 | Norm Diff: -0.7089\n",
      "   7. qwen32b              | CV: 0.2978 | Norm Diff: -0.7201\n",
      "\n",
      "WEIGHTED - Stability Ranking (by CV, lower = more stable):\n",
      "   1. mistral-123b         | CV: 0.3141 | Norm Diff: -0.8319\n",
      "   2. qwen235b             | CV: 0.3241 | Norm Diff: -0.6154\n",
      "   3. qwen32b_think        | CV: 0.5071 | Norm Diff: -1.4220\n",
      "   4. qwen25-72b           | CV: 0.5275 | Norm Diff: -1.3199\n",
      "   5. deepseek-chat-v3-0324 | CV: 0.5548 | Norm Diff: -1.4969\n",
      "   6. qwen32b              | CV: 0.6094 | Norm Diff: -1.6525\n",
      "   7. gemma3_12b           | CV: 0.6818 | Norm Diff: -1.6365\n",
      "\n",
      "REFUSAL_INDEX - Stability Ranking (by CV, lower = more stable):\n",
      "   1. qwen25-72b           | CV: 0.0339 | Norm Diff: -0.0744\n",
      "   2. mistral-123b         | CV: 0.0397 | Norm Diff:  0.0595\n",
      "   3. qwen32b_think        | CV: 0.0727 | Norm Diff:  0.1421\n",
      "   4. qwen235b             | CV: 0.0860 | Norm Diff: -0.1939\n",
      "   5. qwen32b              | CV: 0.1874 | Norm Diff:  0.4605\n",
      "   6. gemma3_12b           | CV: 0.2450 | Norm Diff:  0.5082\n",
      "\n",
      "============================================================\n",
      "INTERPRETATION:\n",
      "============================================================\n",
      "Normalized Difference: (score[prompt_4] - score[prompt_1]) / mean(scores)\n",
      "  - Positive: Performance improves from prompt 1 to prompt 4\n",
      "  - Negative: Performance degrades from prompt 1 to prompt 4\n",
      "  - Closer to 0: More consistent performance across prompts\n",
      "\n",
      "Coefficient of Variance (CV): std(scores) / |mean(scores)|\n",
      "  - Lower CV: More stable/consistent across prompts\n",
      "  - Higher CV: More variable/sensitive to prompt changes\n"
     ]
    }
   ],
   "source": [
    "# Additional analysis: Identify most and least stable models per metric\n",
    "print(\"\\n\" + \"=\"*60)\n",
    "print(\"STABILITY RANKINGS:\")\n",
    "print(\"=\"*60)\n",
    "\n",
    "for metric in all_metrics:\n",
    "    metric_results = [r for r in all_results if r['metric'] == metric]\n",
    "    if not metric_results:\n",
    "        continue\n",
    "    \n",
    "    # Sort by CV (coefficient of variance) - lower CV means more stable\n",
    "    metric_results_sorted = sorted(metric_results, key=lambda x: x['cv'])\n",
    "    \n",
    "    print(f\"\\n{metric.upper()} - Stability Ranking (by CV, lower = more stable):\")\n",
    "    for i, result in enumerate(metric_results_sorted, 1):\n",
    "        print(f\"  {i:2}. {result['model']:20} | CV: {result['cv']:6.4f} | Norm Diff: {result['score_diff_normalized']:7.4f}\")\n",
    "\n",
    "# Summary interpretation\n",
    "print(\"\\n\" + \"=\"*60)\n",
    "print(\"INTERPRETATION:\")\n",
    "print(\"=\"*60)\n",
    "print(\"Normalized Difference: (score[prompt_4] - score[prompt_1]) / mean(scores)\")\n",
    "print(\"  - Positive: Performance improves from prompt 1 to prompt 4\")\n",
    "print(\"  - Negative: Performance degrades from prompt 1 to prompt 4\")\n",
    "print(\"  - Closer to 0: More consistent performance across prompts\")\n",
    "print()\n",
    "print(\"Coefficient of Variance (CV): std(scores) / |mean(scores)|\")\n",
    "print(\"  - Lower CV: More stable/consistent across prompts\")\n",
    "print(\"  - Higher CV: More variable/sensitive to prompt changes\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "684e498d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
