{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ab57ec82",
   "metadata": {},
   "source": [
    "For onboarding results into the Table. "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e16d640",
   "metadata": {},
   "source": [
    "## Onboard Main Table for consistency evaluation. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "7199b2f6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>action</th>\n",
       "      <th>llm</th>\n",
       "      <th>?A1=A2</th>\n",
       "      <th>?A1=A3+A4</th>\n",
       "      <th>?A1&gt;A3</th>\n",
       "      <th>?A1&gt;A4</th>\n",
       "      <th>?A3∅A4</th>\n",
       "      <th>?A4=A1|3</th>\n",
       "      <th>?A1=A1*</th>\n",
       "      <th>...</th>\n",
       "      <th>p(A1&gt;A3)_x</th>\n",
       "      <th>p(A1&gt;A4)_x</th>\n",
       "      <th>p(A3∅A4)_x</th>\n",
       "      <th>p(A4=A1|3)_x</th>\n",
       "      <th>p(A1=A2)_y</th>\n",
       "      <th>p(A1=A3+A4)_y</th>\n",
       "      <th>p(A1&gt;A3)_y</th>\n",
       "      <th>p(A1&gt;A4)_y</th>\n",
       "      <th>p(A3∅A4)_y</th>\n",
       "      <th>p(A4=A1|3)_y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LC-QuAD</td>\n",
       "      <td>classification</td>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>0.9000</td>\n",
       "      <td>0.8400</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.9133</td>\n",
       "      <td>0.5800</td>\n",
       "      <td>0.4733</td>\n",
       "      <td>0.4400</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0883</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0883</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>LC-QuAD</td>\n",
       "      <td>classification</td>\n",
       "      <td>deepseek-reasoner</td>\n",
       "      <td>0.6333</td>\n",
       "      <td>0.5933</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.6933</td>\n",
       "      <td>0.8133</td>\n",
       "      <td>0.4867</td>\n",
       "      <td>0.3400</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.8721</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.8721</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>LC-QuAD</td>\n",
       "      <td>classification</td>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "      <td>0.9267</td>\n",
       "      <td>0.8000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.9467</td>\n",
       "      <td>0.8267</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.3067</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0001</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0001</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>LC-QuAD</td>\n",
       "      <td>classification</td>\n",
       "      <td>gemini-2.5-flash</td>\n",
       "      <td>0.9067</td>\n",
       "      <td>0.9400</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.9533</td>\n",
       "      <td>0.9267</td>\n",
       "      <td>0.8933</td>\n",
       "      <td>0.2267</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0145</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0145</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>LC-QuAD</td>\n",
       "      <td>classification</td>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "      <td>0.9400</td>\n",
       "      <td>0.9000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.9667</td>\n",
       "      <td>0.9533</td>\n",
       "      <td>0.8667</td>\n",
       "      <td>0.3067</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0012</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0012</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 59 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   dataset          action                llm  ?A1=A2  ?A1=A3+A4  ?A1>A3  \\\n",
       "0  LC-QuAD  classification      deepseek-chat  0.9000     0.8400     1.0   \n",
       "1  LC-QuAD  classification  deepseek-reasoner  0.6333     0.5933     1.0   \n",
       "2  LC-QuAD  classification   gemini-2.0-flash  0.9267     0.8000     1.0   \n",
       "3  LC-QuAD  classification   gemini-2.5-flash  0.9067     0.9400     1.0   \n",
       "4  LC-QuAD  classification     gemini-2.5-pro  0.9400     0.9000     1.0   \n",
       "\n",
       "   ?A1>A4  ?A3∅A4  ?A4=A1|3  ?A1=A1*  ...  p(A1>A3)_x  p(A1>A4)_x  p(A3∅A4)_x  \\\n",
       "0  0.9133  0.5800    0.4733   0.4400  ...         0.0         0.0      0.0883   \n",
       "1  0.6933  0.8133    0.4867   0.3400  ...         0.0         0.0      0.8721   \n",
       "2  0.9467  0.8267    0.6667   0.3067  ...         0.0         0.0      0.0001   \n",
       "3  0.9533  0.9267    0.8933   0.2267  ...         0.0         0.0      0.0145   \n",
       "4  0.9667  0.9533    0.8667   0.3067  ...         0.0         0.0      0.0012   \n",
       "\n",
       "   p(A4=A1|3)_x  p(A1=A2)_y  p(A1=A3+A4)_y  p(A1>A3)_y  p(A1>A4)_y  \\\n",
       "0           0.0         0.0            0.0         0.0         0.0   \n",
       "1           0.0         0.0            0.0         0.0         0.0   \n",
       "2           0.0         0.0            0.0         0.0         0.0   \n",
       "3           0.0         0.0            0.0         0.0         0.0   \n",
       "4           0.0         0.0            0.0         0.0         0.0   \n",
       "\n",
       "   p(A3∅A4)_y  p(A4=A1|3)_y  \n",
       "0      0.0883           0.0  \n",
       "1      0.8721           0.0  \n",
       "2      0.0001           0.0  \n",
       "3      0.0145           0.0  \n",
       "4      0.0012           0.0  \n",
       "\n",
       "[5 rows x 59 columns]"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "# Load the data from summary_xidx.csv in the output folder\n",
    "folder = \"../../output/\"\n",
    "file = os.path.join(folder, \"summary_xidk.csv\")\n",
    "df = pd.read_csv(file)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "e230e4bf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>action</th>\n",
       "      <th>llm</th>\n",
       "      <th>?A1=A2</th>\n",
       "      <th>?A1=A3+A4</th>\n",
       "      <th>?A1&gt;A3</th>\n",
       "      <th>?A1&gt;A4</th>\n",
       "      <th>?A3∅A4</th>\n",
       "      <th>?A4=A1|3</th>\n",
       "      <th>?A1=A1*</th>\n",
       "      <th>...</th>\n",
       "      <th>p(A1&gt;A3)_x</th>\n",
       "      <th>p(A1&gt;A4)_x</th>\n",
       "      <th>p(A3∅A4)_x</th>\n",
       "      <th>p(A4=A1|3)_x</th>\n",
       "      <th>p(A1=A2)_y</th>\n",
       "      <th>p(A1=A3+A4)_y</th>\n",
       "      <th>p(A1&gt;A3)_y</th>\n",
       "      <th>p(A1&gt;A4)_y</th>\n",
       "      <th>p(A3∅A4)_y</th>\n",
       "      <th>p(A4=A1|3)_y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>275</th>\n",
       "      <td>overall</td>\n",
       "      <td>classification</td>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.8250</td>\n",
       "      <td>0.8467</td>\n",
       "      <td>0.9117</td>\n",
       "      <td>0.6733</td>\n",
       "      <td>0.5417</td>\n",
       "      <td>0.50330</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0001</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0001</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>276</th>\n",
       "      <td>overall</td>\n",
       "      <td>classification</td>\n",
       "      <td>deepseek-reasoner</td>\n",
       "      <td>0.7850</td>\n",
       "      <td>0.7883</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.8533</td>\n",
       "      <td>0.6967</td>\n",
       "      <td>0.5150</td>\n",
       "      <td>0.42670</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>277</th>\n",
       "      <td>overall</td>\n",
       "      <td>classification</td>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "      <td>0.8817</td>\n",
       "      <td>0.8483</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.6433</td>\n",
       "      <td>0.5383</td>\n",
       "      <td>0.42000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.2967</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.2967</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>278</th>\n",
       "      <td>overall</td>\n",
       "      <td>classification</td>\n",
       "      <td>gemini-2.5-flash</td>\n",
       "      <td>0.8983</td>\n",
       "      <td>0.9300</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9683</td>\n",
       "      <td>0.9067</td>\n",
       "      <td>0.8483</td>\n",
       "      <td>0.33000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0004</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0004</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>279</th>\n",
       "      <td>overall</td>\n",
       "      <td>classification</td>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "      <td>0.8533</td>\n",
       "      <td>0.7650</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.8800</td>\n",
       "      <td>0.9300</td>\n",
       "      <td>0.7133</td>\n",
       "      <td>0.37330</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>341</th>\n",
       "      <td>overall</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>grok-3-mini</td>\n",
       "      <td>0.3433</td>\n",
       "      <td>0.2617</td>\n",
       "      <td>0.5233</td>\n",
       "      <td>0.4383</td>\n",
       "      <td>0.8767</td>\n",
       "      <td>0.2300</td>\n",
       "      <td>0.37670</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>342</th>\n",
       "      <td>overall</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>llama3.1:70b</td>\n",
       "      <td>0.2067</td>\n",
       "      <td>0.0700</td>\n",
       "      <td>0.2883</td>\n",
       "      <td>0.2383</td>\n",
       "      <td>0.7100</td>\n",
       "      <td>0.0283</td>\n",
       "      <td>0.20330</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>343</th>\n",
       "      <td>overall</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>llama3.1:8b</td>\n",
       "      <td>0.0167</td>\n",
       "      <td>0.0017</td>\n",
       "      <td>0.0300</td>\n",
       "      <td>0.0117</td>\n",
       "      <td>0.8433</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.02925</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>344</th>\n",
       "      <td>overall</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>mistral-small:24b</td>\n",
       "      <td>0.4407</td>\n",
       "      <td>0.2407</td>\n",
       "      <td>0.5034</td>\n",
       "      <td>0.4525</td>\n",
       "      <td>0.5085</td>\n",
       "      <td>0.0153</td>\n",
       "      <td>0.43010</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>345</th>\n",
       "      <td>overall</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>o3</td>\n",
       "      <td>0.3441</td>\n",
       "      <td>0.2288</td>\n",
       "      <td>0.5068</td>\n",
       "      <td>0.3864</td>\n",
       "      <td>0.8661</td>\n",
       "      <td>0.1915</td>\n",
       "      <td>0.31420</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>71 rows × 59 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     dataset          action                llm  ?A1=A2  ?A1=A3+A4  ?A1>A3  \\\n",
       "275  overall  classification      deepseek-chat  0.9450     0.8250  0.8467   \n",
       "276  overall  classification  deepseek-reasoner  0.7850     0.7883  0.7533   \n",
       "277  overall  classification   gemini-2.0-flash  0.8817     0.8483  1.0000   \n",
       "278  overall  classification   gemini-2.5-flash  0.8983     0.9300  1.0000   \n",
       "279  overall  classification     gemini-2.5-pro  0.8533     0.7650  1.0000   \n",
       "..       ...             ...                ...     ...        ...     ...   \n",
       "341  overall       zero-shot        grok-3-mini  0.3433     0.2617  0.5233   \n",
       "342  overall       zero-shot       llama3.1:70b  0.2067     0.0700  0.2883   \n",
       "343  overall       zero-shot        llama3.1:8b  0.0167     0.0017  0.0300   \n",
       "344  overall       zero-shot  mistral-small:24b  0.4407     0.2407  0.5034   \n",
       "345  overall       zero-shot                 o3  0.3441     0.2288  0.5068   \n",
       "\n",
       "     ?A1>A4  ?A3∅A4  ?A4=A1|3  ?A1=A1*  ...  p(A1>A3)_x  p(A1>A4)_x  \\\n",
       "275  0.9117  0.6733    0.5417  0.50330  ...         0.0         0.0   \n",
       "276  0.8533  0.6967    0.5150  0.42670  ...         0.0         0.0   \n",
       "277  0.9450  0.6433    0.5383  0.42000  ...         0.0         0.0   \n",
       "278  0.9683  0.9067    0.8483  0.33000  ...         0.0         0.0   \n",
       "279  0.8800  0.9300    0.7133  0.37330  ...         0.0         0.0   \n",
       "..      ...     ...       ...      ...  ...         ...         ...   \n",
       "341  0.4383  0.8767    0.2300  0.37670  ...         1.0         1.0   \n",
       "342  0.2383  0.7100    0.0283  0.20330  ...         1.0         1.0   \n",
       "343  0.0117  0.8433    0.0000  0.02925  ...         1.0         1.0   \n",
       "344  0.4525  0.5085    0.0153  0.43010  ...         1.0         1.0   \n",
       "345  0.3864  0.8661    0.1915  0.31420  ...         1.0         1.0   \n",
       "\n",
       "     p(A3∅A4)_x  p(A4=A1|3)_x  p(A1=A2)_y  p(A1=A3+A4)_y  p(A1>A3)_y  \\\n",
       "275      0.0001           0.0         0.0            0.0         0.0   \n",
       "276      1.0000           0.0         0.0            0.0         0.0   \n",
       "277      0.2967           0.0         0.0            0.0         0.0   \n",
       "278      0.0004           0.0         0.0            0.0         0.0   \n",
       "279      0.0000           0.0         0.0            0.0         0.0   \n",
       "..          ...           ...         ...            ...         ...   \n",
       "341      1.0000           1.0         1.0            1.0         1.0   \n",
       "342      1.0000           1.0         1.0            1.0         1.0   \n",
       "343      1.0000           1.0         1.0            1.0         1.0   \n",
       "344      1.0000           1.0         1.0            1.0         1.0   \n",
       "345      1.0000           1.0         1.0            1.0         1.0   \n",
       "\n",
       "     p(A1>A4)_y  p(A3∅A4)_y  p(A4=A1|3)_y  \n",
       "275         0.0      0.0001           0.0  \n",
       "276         0.0      1.0000           0.0  \n",
       "277         0.0      0.2967           0.0  \n",
       "278         0.0      0.0004           0.0  \n",
       "279         0.0      0.0000           0.0  \n",
       "..          ...         ...           ...  \n",
       "341         1.0      1.0000           1.0  \n",
       "342         1.0      1.0000           1.0  \n",
       "343         1.0      1.0000           1.0  \n",
       "344         1.0      1.0000           1.0  \n",
       "345         1.0      1.0000           1.0  \n",
       "\n",
       "[71 rows x 59 columns]"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df[df['dataset'] == 'overall']\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "a93fc89f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>action</th>\n",
       "      <th>llm</th>\n",
       "      <th>?A1=A2</th>\n",
       "      <th>?A1=A3+A4</th>\n",
       "      <th>?A1&gt;A3</th>\n",
       "      <th>?A1&gt;A4</th>\n",
       "      <th>?A3∅A4</th>\n",
       "      <th>?A4=A1|3</th>\n",
       "      <th>?A1=A1*</th>\n",
       "      <th>...</th>\n",
       "      <th>p(A1&gt;A4)_x</th>\n",
       "      <th>p(A3∅A4)_x</th>\n",
       "      <th>p(A4=A1|3)_x</th>\n",
       "      <th>p(A1=A2)_y</th>\n",
       "      <th>p(A1=A3+A4)_y</th>\n",
       "      <th>p(A1&gt;A3)_y</th>\n",
       "      <th>p(A1&gt;A4)_y</th>\n",
       "      <th>p(A3∅A4)_y</th>\n",
       "      <th>p(A4=A1|3)_y</th>\n",
       "      <th>ave_?A</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>275</th>\n",
       "      <td>overall</td>\n",
       "      <td>classification</td>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.8250</td>\n",
       "      <td>0.8467</td>\n",
       "      <td>0.9117</td>\n",
       "      <td>0.6733</td>\n",
       "      <td>0.5417</td>\n",
       "      <td>0.50330</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0001</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0001</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.78368</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>276</th>\n",
       "      <td>overall</td>\n",
       "      <td>classification</td>\n",
       "      <td>deepseek-reasoner</td>\n",
       "      <td>0.7850</td>\n",
       "      <td>0.7883</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.8533</td>\n",
       "      <td>0.6967</td>\n",
       "      <td>0.5150</td>\n",
       "      <td>0.42670</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.72066</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>277</th>\n",
       "      <td>overall</td>\n",
       "      <td>classification</td>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "      <td>0.8817</td>\n",
       "      <td>0.8483</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.6433</td>\n",
       "      <td>0.5383</td>\n",
       "      <td>0.42000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.2967</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.2967</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.80166</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>278</th>\n",
       "      <td>overall</td>\n",
       "      <td>classification</td>\n",
       "      <td>gemini-2.5-flash</td>\n",
       "      <td>0.8983</td>\n",
       "      <td>0.9300</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9683</td>\n",
       "      <td>0.9067</td>\n",
       "      <td>0.8483</td>\n",
       "      <td>0.33000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0004</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0004</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.92432</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>279</th>\n",
       "      <td>overall</td>\n",
       "      <td>classification</td>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "      <td>0.8533</td>\n",
       "      <td>0.7650</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.8800</td>\n",
       "      <td>0.9300</td>\n",
       "      <td>0.7133</td>\n",
       "      <td>0.37330</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.87532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>341</th>\n",
       "      <td>overall</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>grok-3-mini</td>\n",
       "      <td>0.3433</td>\n",
       "      <td>0.2617</td>\n",
       "      <td>0.5233</td>\n",
       "      <td>0.4383</td>\n",
       "      <td>0.8767</td>\n",
       "      <td>0.2300</td>\n",
       "      <td>0.37670</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.48232</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>342</th>\n",
       "      <td>overall</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>llama3.1:70b</td>\n",
       "      <td>0.2067</td>\n",
       "      <td>0.0700</td>\n",
       "      <td>0.2883</td>\n",
       "      <td>0.2383</td>\n",
       "      <td>0.7100</td>\n",
       "      <td>0.0283</td>\n",
       "      <td>0.20330</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.29432</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>343</th>\n",
       "      <td>overall</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>llama3.1:8b</td>\n",
       "      <td>0.0167</td>\n",
       "      <td>0.0017</td>\n",
       "      <td>0.0300</td>\n",
       "      <td>0.0117</td>\n",
       "      <td>0.8433</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.02925</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.18034</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>344</th>\n",
       "      <td>overall</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>mistral-small:24b</td>\n",
       "      <td>0.4407</td>\n",
       "      <td>0.2407</td>\n",
       "      <td>0.5034</td>\n",
       "      <td>0.4525</td>\n",
       "      <td>0.5085</td>\n",
       "      <td>0.0153</td>\n",
       "      <td>0.43010</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.38408</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>345</th>\n",
       "      <td>overall</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>o3</td>\n",
       "      <td>0.3441</td>\n",
       "      <td>0.2288</td>\n",
       "      <td>0.5068</td>\n",
       "      <td>0.3864</td>\n",
       "      <td>0.8661</td>\n",
       "      <td>0.1915</td>\n",
       "      <td>0.31420</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.45898</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>71 rows × 60 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     dataset          action                llm  ?A1=A2  ?A1=A3+A4  ?A1>A3  \\\n",
       "275  overall  classification      deepseek-chat  0.9450     0.8250  0.8467   \n",
       "276  overall  classification  deepseek-reasoner  0.7850     0.7883  0.7533   \n",
       "277  overall  classification   gemini-2.0-flash  0.8817     0.8483  1.0000   \n",
       "278  overall  classification   gemini-2.5-flash  0.8983     0.9300  1.0000   \n",
       "279  overall  classification     gemini-2.5-pro  0.8533     0.7650  1.0000   \n",
       "..       ...             ...                ...     ...        ...     ...   \n",
       "341  overall       zero-shot        grok-3-mini  0.3433     0.2617  0.5233   \n",
       "342  overall       zero-shot       llama3.1:70b  0.2067     0.0700  0.2883   \n",
       "343  overall       zero-shot        llama3.1:8b  0.0167     0.0017  0.0300   \n",
       "344  overall       zero-shot  mistral-small:24b  0.4407     0.2407  0.5034   \n",
       "345  overall       zero-shot                 o3  0.3441     0.2288  0.5068   \n",
       "\n",
       "     ?A1>A4  ?A3∅A4  ?A4=A1|3  ?A1=A1*  ...  p(A1>A4)_x  p(A3∅A4)_x  \\\n",
       "275  0.9117  0.6733    0.5417  0.50330  ...         0.0      0.0001   \n",
       "276  0.8533  0.6967    0.5150  0.42670  ...         0.0      1.0000   \n",
       "277  0.9450  0.6433    0.5383  0.42000  ...         0.0      0.2967   \n",
       "278  0.9683  0.9067    0.8483  0.33000  ...         0.0      0.0004   \n",
       "279  0.8800  0.9300    0.7133  0.37330  ...         0.0      0.0000   \n",
       "..      ...     ...       ...      ...  ...         ...         ...   \n",
       "341  0.4383  0.8767    0.2300  0.37670  ...         1.0      1.0000   \n",
       "342  0.2383  0.7100    0.0283  0.20330  ...         1.0      1.0000   \n",
       "343  0.0117  0.8433    0.0000  0.02925  ...         1.0      1.0000   \n",
       "344  0.4525  0.5085    0.0153  0.43010  ...         1.0      1.0000   \n",
       "345  0.3864  0.8661    0.1915  0.31420  ...         1.0      1.0000   \n",
       "\n",
       "     p(A4=A1|3)_x  p(A1=A2)_y  p(A1=A3+A4)_y  p(A1>A3)_y  p(A1>A4)_y  \\\n",
       "275           0.0         0.0            0.0         0.0         0.0   \n",
       "276           0.0         0.0            0.0         0.0         0.0   \n",
       "277           0.0         0.0            0.0         0.0         0.0   \n",
       "278           0.0         0.0            0.0         0.0         0.0   \n",
       "279           0.0         0.0            0.0         0.0         0.0   \n",
       "..            ...         ...            ...         ...         ...   \n",
       "341           1.0         1.0            1.0         1.0         1.0   \n",
       "342           1.0         1.0            1.0         1.0         1.0   \n",
       "343           1.0         1.0            1.0         1.0         1.0   \n",
       "344           1.0         1.0            1.0         1.0         1.0   \n",
       "345           1.0         1.0            1.0         1.0         1.0   \n",
       "\n",
       "     p(A3∅A4)_y  p(A4=A1|3)_y   ave_?A  \n",
       "275      0.0001           0.0  0.78368  \n",
       "276      1.0000           0.0  0.72066  \n",
       "277      0.2967           0.0  0.80166  \n",
       "278      0.0004           0.0  0.92432  \n",
       "279      0.0000           0.0  0.87532  \n",
       "..          ...           ...      ...  \n",
       "341      1.0000           1.0  0.48232  \n",
       "342      1.0000           1.0  0.29432  \n",
       "343      1.0000           1.0  0.18034  \n",
       "344      1.0000           1.0  0.38408  \n",
       "345      1.0000           1.0  0.45898  \n",
       "\n",
       "[71 rows x 60 columns]"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"ave_?A\"] = df[[\"?A1=A2\", \"?A1>A3\",\"?A1>A4\",\"?A3∅A4\",\"?A4=A1|3\"]].mean(axis=1)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "id": "2c7d02a3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['llama3.1:8b',\n",
       " 'gpt-oss:20b',\n",
       " 'gpt-4.1-nano-2025-04-14',\n",
       " 'mistral-small:24b',\n",
       " 'llama3.1:70b',\n",
       " 'gemini-2.0-flash',\n",
       " 'gpt-4.1-mini-2025-04-14',\n",
       " 'gpt-4o',\n",
       " 'gpt-4.1-2025-04-14',\n",
       " 'grok-3-mini',\n",
       " 'deepseek-chat',\n",
       " 'gemini-2.5-flash',\n",
       " 'gpt-5-nano',\n",
       " 'deepseek-reasoner',\n",
       " 'gemini-2.5-pro',\n",
       " 'gpt-5-mini',\n",
       " 'o3',\n",
       " 'gpt-5']"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "llms = pd.read_csv(folder + \"model_index_map.csv\")[\"llm\"].to_list()\n",
    "llms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "id": "587f2d68",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>llm</th>\n",
       "      <th>action</th>\n",
       "      <th>?A1=A1*</th>\n",
       "      <th>J(A1-A1*)</th>\n",
       "      <th>?A1=A2</th>\n",
       "      <th>?A1&gt;A3</th>\n",
       "      <th>?A1&gt;A4</th>\n",
       "      <th>?A3∅A4</th>\n",
       "      <th>?A4=A1|3</th>\n",
       "      <th>ave_?A</th>\n",
       "      <th>J(A1-A2)</th>\n",
       "      <th>J(A3-A4)</th>\n",
       "      <th>J(A4-A1|3)</th>\n",
       "      <th>idk</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>275</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.50330</td>\n",
       "      <td>0.69370</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.8467</td>\n",
       "      <td>0.9117</td>\n",
       "      <td>0.6733</td>\n",
       "      <td>0.5417</td>\n",
       "      <td>0.78368</td>\n",
       "      <td>0.9722</td>\n",
       "      <td>0.3101</td>\n",
       "      <td>0.6274</td>\n",
       "      <td>0.306700</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>276</th>\n",
       "      <td>deepseek-reasoner</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.42670</td>\n",
       "      <td>0.62990</td>\n",
       "      <td>0.7850</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.8533</td>\n",
       "      <td>0.6967</td>\n",
       "      <td>0.5150</td>\n",
       "      <td>0.72066</td>\n",
       "      <td>0.8240</td>\n",
       "      <td>0.2966</td>\n",
       "      <td>0.5637</td>\n",
       "      <td>0.285400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>277</th>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.42000</td>\n",
       "      <td>0.65900</td>\n",
       "      <td>0.8817</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.6433</td>\n",
       "      <td>0.5383</td>\n",
       "      <td>0.80166</td>\n",
       "      <td>0.9384</td>\n",
       "      <td>0.2782</td>\n",
       "      <td>0.6268</td>\n",
       "      <td>0.405825</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>278</th>\n",
       "      <td>gemini-2.5-flash</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.33000</td>\n",
       "      <td>0.63560</td>\n",
       "      <td>0.8983</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9683</td>\n",
       "      <td>0.9067</td>\n",
       "      <td>0.8483</td>\n",
       "      <td>0.92432</td>\n",
       "      <td>0.9383</td>\n",
       "      <td>0.0882</td>\n",
       "      <td>0.8392</td>\n",
       "      <td>0.319600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>279</th>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.37330</td>\n",
       "      <td>0.65150</td>\n",
       "      <td>0.8533</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.8800</td>\n",
       "      <td>0.9300</td>\n",
       "      <td>0.7133</td>\n",
       "      <td>0.87532</td>\n",
       "      <td>0.8978</td>\n",
       "      <td>0.0622</td>\n",
       "      <td>0.7902</td>\n",
       "      <td>0.330000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>341</th>\n",
       "      <td>grok-3-mini</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.37670</td>\n",
       "      <td>0.63090</td>\n",
       "      <td>0.3433</td>\n",
       "      <td>0.5233</td>\n",
       "      <td>0.4383</td>\n",
       "      <td>0.8767</td>\n",
       "      <td>0.2300</td>\n",
       "      <td>0.48232</td>\n",
       "      <td>0.6346</td>\n",
       "      <td>0.0561</td>\n",
       "      <td>0.4478</td>\n",
       "      <td>0.081250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>342</th>\n",
       "      <td>llama3.1:70b</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.20330</td>\n",
       "      <td>0.48820</td>\n",
       "      <td>0.2067</td>\n",
       "      <td>0.2883</td>\n",
       "      <td>0.2383</td>\n",
       "      <td>0.7100</td>\n",
       "      <td>0.0283</td>\n",
       "      <td>0.29432</td>\n",
       "      <td>0.4712</td>\n",
       "      <td>0.1006</td>\n",
       "      <td>0.2067</td>\n",
       "      <td>0.033325</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>343</th>\n",
       "      <td>llama3.1:8b</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.02925</td>\n",
       "      <td>0.21525</td>\n",
       "      <td>0.0167</td>\n",
       "      <td>0.0300</td>\n",
       "      <td>0.0117</td>\n",
       "      <td>0.8433</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.18034</td>\n",
       "      <td>0.1667</td>\n",
       "      <td>0.0329</td>\n",
       "      <td>0.0644</td>\n",
       "      <td>0.016650</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>344</th>\n",
       "      <td>mistral-small:24b</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.43010</td>\n",
       "      <td>0.64105</td>\n",
       "      <td>0.4407</td>\n",
       "      <td>0.5034</td>\n",
       "      <td>0.4525</td>\n",
       "      <td>0.5085</td>\n",
       "      <td>0.0153</td>\n",
       "      <td>0.38408</td>\n",
       "      <td>0.6399</td>\n",
       "      <td>0.3447</td>\n",
       "      <td>0.2057</td>\n",
       "      <td>0.291925</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>345</th>\n",
       "      <td>o3</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.31420</td>\n",
       "      <td>0.58920</td>\n",
       "      <td>0.3441</td>\n",
       "      <td>0.5068</td>\n",
       "      <td>0.3864</td>\n",
       "      <td>0.8661</td>\n",
       "      <td>0.1915</td>\n",
       "      <td>0.45898</td>\n",
       "      <td>0.6246</td>\n",
       "      <td>0.0553</td>\n",
       "      <td>0.4383</td>\n",
       "      <td>0.089800</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>71 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                   llm          action  ?A1=A1*  J(A1-A1*)  ?A1=A2  ?A1>A3  \\\n",
       "275      deepseek-chat  classification  0.50330    0.69370  0.9450  0.8467   \n",
       "276  deepseek-reasoner  classification  0.42670    0.62990  0.7850  0.7533   \n",
       "277   gemini-2.0-flash  classification  0.42000    0.65900  0.8817  1.0000   \n",
       "278   gemini-2.5-flash  classification  0.33000    0.63560  0.8983  1.0000   \n",
       "279     gemini-2.5-pro  classification  0.37330    0.65150  0.8533  1.0000   \n",
       "..                 ...             ...      ...        ...     ...     ...   \n",
       "341        grok-3-mini       zero-shot  0.37670    0.63090  0.3433  0.5233   \n",
       "342       llama3.1:70b       zero-shot  0.20330    0.48820  0.2067  0.2883   \n",
       "343        llama3.1:8b       zero-shot  0.02925    0.21525  0.0167  0.0300   \n",
       "344  mistral-small:24b       zero-shot  0.43010    0.64105  0.4407  0.5034   \n",
       "345                 o3       zero-shot  0.31420    0.58920  0.3441  0.5068   \n",
       "\n",
       "     ?A1>A4  ?A3∅A4  ?A4=A1|3   ave_?A  J(A1-A2)  J(A3-A4)  J(A4-A1|3)  \\\n",
       "275  0.9117  0.6733    0.5417  0.78368    0.9722    0.3101      0.6274   \n",
       "276  0.8533  0.6967    0.5150  0.72066    0.8240    0.2966      0.5637   \n",
       "277  0.9450  0.6433    0.5383  0.80166    0.9384    0.2782      0.6268   \n",
       "278  0.9683  0.9067    0.8483  0.92432    0.9383    0.0882      0.8392   \n",
       "279  0.8800  0.9300    0.7133  0.87532    0.8978    0.0622      0.7902   \n",
       "..      ...     ...       ...      ...       ...       ...         ...   \n",
       "341  0.4383  0.8767    0.2300  0.48232    0.6346    0.0561      0.4478   \n",
       "342  0.2383  0.7100    0.0283  0.29432    0.4712    0.1006      0.2067   \n",
       "343  0.0117  0.8433    0.0000  0.18034    0.1667    0.0329      0.0644   \n",
       "344  0.4525  0.5085    0.0153  0.38408    0.6399    0.3447      0.2057   \n",
       "345  0.3864  0.8661    0.1915  0.45898    0.6246    0.0553      0.4383   \n",
       "\n",
       "          idk  \n",
       "275  0.306700  \n",
       "276  0.285400  \n",
       "277  0.405825  \n",
       "278  0.319600  \n",
       "279  0.330000  \n",
       "..        ...  \n",
       "341  0.081250  \n",
       "342  0.033325  \n",
       "343  0.016650  \n",
       "344  0.291925  \n",
       "345  0.089800  \n",
       "\n",
       "[71 rows x 14 columns]"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cols = [\"llm\",\"action\", \"?A1=A1*\", \"J(A1-A1*)\", \"?A1=A2\", \"?A1>A3\",\"?A1>A4\",\"?A3∅A4\",\"?A4=A1|3\",\"ave_?A\", \n",
    "        \"J(A1-A2)\",\"J(A3-A4)\",\"J(A4-A1|3)\",\"idk\"]\n",
    "action = [\"zero-shot\", \"classification\", \"fixing\"]\n",
    "llms = ['llama3.1:8b',\n",
    " 'gpt-oss:20b',\n",
    " 'gpt-4.1-nano-2025-04-14',\n",
    " 'mistral-small:24b',\n",
    " 'llama3.1:70b',\n",
    " 'gemini-2.0-flash',\n",
    " 'gpt-4.1-mini-2025-04-14',\n",
    " 'gpt-4o',\n",
    " 'gpt-4.1-2025-04-14',\n",
    " 'grok-3-mini',\n",
    " 'deepseek-chat',\n",
    " 'gemini-2.5-flash',\n",
    " 'gpt-5-nano',\n",
    " 'deepseek-reasoner',\n",
    " 'gemini-2.5-pro',\n",
    " 'gpt-5-mini',\n",
    " 'o3',\n",
    " 'gpt-5']\n",
    "\n",
    "df[cols]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e51eec79",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "id": "c84ce46a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>llm</th>\n",
       "      <th>action</th>\n",
       "      <th>?A1=A1*</th>\n",
       "      <th>J(A1-A1*)</th>\n",
       "      <th>?A1=A2</th>\n",
       "      <th>?A1&gt;A3</th>\n",
       "      <th>?A1&gt;A4</th>\n",
       "      <th>?A3∅A4</th>\n",
       "      <th>?A4=A1|3</th>\n",
       "      <th>ave_?A</th>\n",
       "      <th>J(A1-A2)</th>\n",
       "      <th>J(A3-A4)</th>\n",
       "      <th>J(A4-A1|3)</th>\n",
       "      <th>idk</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>275</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.50330</td>\n",
       "      <td>0.69370</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.8467</td>\n",
       "      <td>0.9117</td>\n",
       "      <td>0.6733</td>\n",
       "      <td>0.5417</td>\n",
       "      <td>0.78368</td>\n",
       "      <td>0.9722</td>\n",
       "      <td>0.3101</td>\n",
       "      <td>0.6274</td>\n",
       "      <td>0.306700</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>276</th>\n",
       "      <td>deepseek-reasoner</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.42670</td>\n",
       "      <td>0.62990</td>\n",
       "      <td>0.7850</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.8533</td>\n",
       "      <td>0.6967</td>\n",
       "      <td>0.5150</td>\n",
       "      <td>0.72066</td>\n",
       "      <td>0.8240</td>\n",
       "      <td>0.2966</td>\n",
       "      <td>0.5637</td>\n",
       "      <td>0.285400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>277</th>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.42000</td>\n",
       "      <td>0.65900</td>\n",
       "      <td>0.8817</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.6433</td>\n",
       "      <td>0.5383</td>\n",
       "      <td>0.80166</td>\n",
       "      <td>0.9384</td>\n",
       "      <td>0.2782</td>\n",
       "      <td>0.6268</td>\n",
       "      <td>0.405825</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>278</th>\n",
       "      <td>gemini-2.5-flash</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.33000</td>\n",
       "      <td>0.63560</td>\n",
       "      <td>0.8983</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9683</td>\n",
       "      <td>0.9067</td>\n",
       "      <td>0.8483</td>\n",
       "      <td>0.92432</td>\n",
       "      <td>0.9383</td>\n",
       "      <td>0.0882</td>\n",
       "      <td>0.8392</td>\n",
       "      <td>0.319600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>279</th>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.37330</td>\n",
       "      <td>0.65150</td>\n",
       "      <td>0.8533</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.8800</td>\n",
       "      <td>0.9300</td>\n",
       "      <td>0.7133</td>\n",
       "      <td>0.87532</td>\n",
       "      <td>0.8978</td>\n",
       "      <td>0.0622</td>\n",
       "      <td>0.7902</td>\n",
       "      <td>0.330000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>341</th>\n",
       "      <td>grok-3-mini</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.37670</td>\n",
       "      <td>0.63090</td>\n",
       "      <td>0.3433</td>\n",
       "      <td>0.5233</td>\n",
       "      <td>0.4383</td>\n",
       "      <td>0.8767</td>\n",
       "      <td>0.2300</td>\n",
       "      <td>0.48232</td>\n",
       "      <td>0.6346</td>\n",
       "      <td>0.0561</td>\n",
       "      <td>0.4478</td>\n",
       "      <td>0.081250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>342</th>\n",
       "      <td>llama3.1:70b</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.20330</td>\n",
       "      <td>0.48820</td>\n",
       "      <td>0.2067</td>\n",
       "      <td>0.2883</td>\n",
       "      <td>0.2383</td>\n",
       "      <td>0.7100</td>\n",
       "      <td>0.0283</td>\n",
       "      <td>0.29432</td>\n",
       "      <td>0.4712</td>\n",
       "      <td>0.1006</td>\n",
       "      <td>0.2067</td>\n",
       "      <td>0.033325</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>343</th>\n",
       "      <td>llama3.1:8b</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.02925</td>\n",
       "      <td>0.21525</td>\n",
       "      <td>0.0167</td>\n",
       "      <td>0.0300</td>\n",
       "      <td>0.0117</td>\n",
       "      <td>0.8433</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.18034</td>\n",
       "      <td>0.1667</td>\n",
       "      <td>0.0329</td>\n",
       "      <td>0.0644</td>\n",
       "      <td>0.016650</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>344</th>\n",
       "      <td>mistral-small:24b</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.43010</td>\n",
       "      <td>0.64105</td>\n",
       "      <td>0.4407</td>\n",
       "      <td>0.5034</td>\n",
       "      <td>0.4525</td>\n",
       "      <td>0.5085</td>\n",
       "      <td>0.0153</td>\n",
       "      <td>0.38408</td>\n",
       "      <td>0.6399</td>\n",
       "      <td>0.3447</td>\n",
       "      <td>0.2057</td>\n",
       "      <td>0.291925</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>345</th>\n",
       "      <td>o3</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.31420</td>\n",
       "      <td>0.58920</td>\n",
       "      <td>0.3441</td>\n",
       "      <td>0.5068</td>\n",
       "      <td>0.3864</td>\n",
       "      <td>0.8661</td>\n",
       "      <td>0.1915</td>\n",
       "      <td>0.45898</td>\n",
       "      <td>0.6246</td>\n",
       "      <td>0.0553</td>\n",
       "      <td>0.4383</td>\n",
       "      <td>0.089800</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>71 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                   llm          action  ?A1=A1*  J(A1-A1*)  ?A1=A2  ?A1>A3  \\\n",
       "275      deepseek-chat  classification  0.50330    0.69370  0.9450  0.8467   \n",
       "276  deepseek-reasoner  classification  0.42670    0.62990  0.7850  0.7533   \n",
       "277   gemini-2.0-flash  classification  0.42000    0.65900  0.8817  1.0000   \n",
       "278   gemini-2.5-flash  classification  0.33000    0.63560  0.8983  1.0000   \n",
       "279     gemini-2.5-pro  classification  0.37330    0.65150  0.8533  1.0000   \n",
       "..                 ...             ...      ...        ...     ...     ...   \n",
       "341        grok-3-mini       zero-shot  0.37670    0.63090  0.3433  0.5233   \n",
       "342       llama3.1:70b       zero-shot  0.20330    0.48820  0.2067  0.2883   \n",
       "343        llama3.1:8b       zero-shot  0.02925    0.21525  0.0167  0.0300   \n",
       "344  mistral-small:24b       zero-shot  0.43010    0.64105  0.4407  0.5034   \n",
       "345                 o3       zero-shot  0.31420    0.58920  0.3441  0.5068   \n",
       "\n",
       "     ?A1>A4  ?A3∅A4  ?A4=A1|3   ave_?A  J(A1-A2)  J(A3-A4)  J(A4-A1|3)  \\\n",
       "275  0.9117  0.6733    0.5417  0.78368    0.9722    0.3101      0.6274   \n",
       "276  0.8533  0.6967    0.5150  0.72066    0.8240    0.2966      0.5637   \n",
       "277  0.9450  0.6433    0.5383  0.80166    0.9384    0.2782      0.6268   \n",
       "278  0.9683  0.9067    0.8483  0.92432    0.9383    0.0882      0.8392   \n",
       "279  0.8800  0.9300    0.7133  0.87532    0.8978    0.0622      0.7902   \n",
       "..      ...     ...       ...      ...       ...       ...         ...   \n",
       "341  0.4383  0.8767    0.2300  0.48232    0.6346    0.0561      0.4478   \n",
       "342  0.2383  0.7100    0.0283  0.29432    0.4712    0.1006      0.2067   \n",
       "343  0.0117  0.8433    0.0000  0.18034    0.1667    0.0329      0.0644   \n",
       "344  0.4525  0.5085    0.0153  0.38408    0.6399    0.3447      0.2057   \n",
       "345  0.3864  0.8661    0.1915  0.45898    0.6246    0.0553      0.4383   \n",
       "\n",
       "          idk  \n",
       "275  0.306700  \n",
       "276  0.285400  \n",
       "277  0.405825  \n",
       "278  0.319600  \n",
       "279  0.330000  \n",
       "..        ...  \n",
       "341  0.081250  \n",
       "342  0.033325  \n",
       "343  0.016650  \n",
       "344  0.291925  \n",
       "345  0.089800  \n",
       "\n",
       "[71 rows x 14 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# New cell: show only the selected columns (llm, action, and the requested metrics)\n",
    "from IPython.display import display\n",
    "import pandas as pd\n",
    "\n",
    "cols = [\"llm\",\"action\", \"?A1=A1*\", \"J(A1-A1*)\", \"?A1=A2\", \"?A1>A3\",\"?A1>A4\",\"?A3∅A4\",\"?A4=A1|3\",\"ave_?A\",\n",
    "        \"J(A1-A2)\",\"J(A3-A4)\",\"J(A4-A1|3)\",\"idk\"]\n",
    "\n",
    "# Ensure DataFrame is available\n",
    "try:\n",
    "    df  # reference to raise NameError if missing\n",
    "except NameError:\n",
    "    raise RuntimeError(\"DataFrame `df` not found in the notebook namespace. Please run the earlier cell that loads the CSV first.\")\n",
    "\n",
    "# Filter to overall dataset if present\n",
    "if 'dataset' in df.columns:\n",
    "    df = df[df['dataset'] == 'overall'].copy()\n",
    "\n",
    "# Compute ave_?A if missing and the components exist\n",
    "ave_components = [\"?A1=A2\", \"?A1>A3\",\"?A1>A4\",\"?A3∅A4\",\"?A4=A1|3\"]\n",
    "if \"ave_?A\" not in df.columns and set(ave_components).issubset(df.columns):\n",
    "    df[\"ave_?A\"] = df[ave_components].mean(axis=1)\n",
    "\n",
    "# Select only columns that are present to avoid KeyError\n",
    "present_cols = [c for c in cols if c in df.columns]\n",
    "missing_cols = [c for c in cols if c not in df.columns]\n",
    "if missing_cols:\n",
    "    print('Warning: these requested columns are missing and will be omitted:', missing_cols)\n",
    "\n",
    "# Display the selected columns\n",
    "display(df[present_cols])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "id": "1c9442eb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>llm</th>\n",
       "      <th>action</th>\n",
       "      <th>?A1=A1*</th>\n",
       "      <th>J(A1-A1*)</th>\n",
       "      <th>?A1=A2</th>\n",
       "      <th>?A1&gt;A3</th>\n",
       "      <th>?A1&gt;A4</th>\n",
       "      <th>?A3∅A4</th>\n",
       "      <th>?A4=A1|3</th>\n",
       "      <th>ave_?A</th>\n",
       "      <th>J(A1-A2)</th>\n",
       "      <th>J(A3-A4)</th>\n",
       "      <th>J(A4-A1|3)</th>\n",
       "      <th>idk</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>llama3.1:8b</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.02925</td>\n",
       "      <td>0.21525</td>\n",
       "      <td>0.0167</td>\n",
       "      <td>0.0300</td>\n",
       "      <td>0.0117</td>\n",
       "      <td>0.8433</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.18034</td>\n",
       "      <td>0.1667</td>\n",
       "      <td>0.0329</td>\n",
       "      <td>0.0644</td>\n",
       "      <td>0.016650</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>llama3.1:8b</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.04500</td>\n",
       "      <td>0.29440</td>\n",
       "      <td>0.1633</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.3383</td>\n",
       "      <td>0.6817</td>\n",
       "      <td>0.0500</td>\n",
       "      <td>0.44666</td>\n",
       "      <td>0.5603</td>\n",
       "      <td>0.0974</td>\n",
       "      <td>0.3899</td>\n",
       "      <td>0.250825</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>llama3.1:8b</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.02170</td>\n",
       "      <td>0.16590</td>\n",
       "      <td>0.1267</td>\n",
       "      <td>0.1550</td>\n",
       "      <td>0.1233</td>\n",
       "      <td>0.6183</td>\n",
       "      <td>0.0183</td>\n",
       "      <td>0.20832</td>\n",
       "      <td>0.4549</td>\n",
       "      <td>0.1126</td>\n",
       "      <td>0.2414</td>\n",
       "      <td>0.002100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>gpt-oss:20b</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.22705</td>\n",
       "      <td>0.44435</td>\n",
       "      <td>0.2167</td>\n",
       "      <td>0.3800</td>\n",
       "      <td>0.3250</td>\n",
       "      <td>0.8300</td>\n",
       "      <td>0.1000</td>\n",
       "      <td>0.37034</td>\n",
       "      <td>0.4413</td>\n",
       "      <td>0.0775</td>\n",
       "      <td>0.2927</td>\n",
       "      <td>0.125425</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>gpt-oss:20b</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.27170</td>\n",
       "      <td>0.48920</td>\n",
       "      <td>0.6717</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.7317</td>\n",
       "      <td>0.8467</td>\n",
       "      <td>0.3800</td>\n",
       "      <td>0.72602</td>\n",
       "      <td>0.7939</td>\n",
       "      <td>0.1073</td>\n",
       "      <td>0.6099</td>\n",
       "      <td>0.370425</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>gpt-oss:20b</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.24330</td>\n",
       "      <td>0.49460</td>\n",
       "      <td>0.5833</td>\n",
       "      <td>0.8067</td>\n",
       "      <td>0.9233</td>\n",
       "      <td>0.9533</td>\n",
       "      <td>0.6200</td>\n",
       "      <td>0.77732</td>\n",
       "      <td>0.7650</td>\n",
       "      <td>0.0397</td>\n",
       "      <td>0.7864</td>\n",
       "      <td>0.133350</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.58525</td>\n",
       "      <td>0.68235</td>\n",
       "      <td>0.2783</td>\n",
       "      <td>0.4567</td>\n",
       "      <td>0.3717</td>\n",
       "      <td>0.6200</td>\n",
       "      <td>0.0400</td>\n",
       "      <td>0.35334</td>\n",
       "      <td>0.5129</td>\n",
       "      <td>0.1915</td>\n",
       "      <td>0.2037</td>\n",
       "      <td>0.125825</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.96000</td>\n",
       "      <td>0.96080</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.7167</td>\n",
       "      <td>0.5667</td>\n",
       "      <td>0.3817</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.52768</td>\n",
       "      <td>0.9757</td>\n",
       "      <td>0.6149</td>\n",
       "      <td>0.0017</td>\n",
       "      <td>0.808725</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.34830</td>\n",
       "      <td>0.48520</td>\n",
       "      <td>0.5467</td>\n",
       "      <td>0.5917</td>\n",
       "      <td>0.6017</td>\n",
       "      <td>0.6183</td>\n",
       "      <td>0.2150</td>\n",
       "      <td>0.51468</td>\n",
       "      <td>0.6237</td>\n",
       "      <td>0.2257</td>\n",
       "      <td>0.3509</td>\n",
       "      <td>0.193750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>mistral-small:24b</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.43010</td>\n",
       "      <td>0.64105</td>\n",
       "      <td>0.4407</td>\n",
       "      <td>0.5034</td>\n",
       "      <td>0.4525</td>\n",
       "      <td>0.5085</td>\n",
       "      <td>0.0153</td>\n",
       "      <td>0.38408</td>\n",
       "      <td>0.6399</td>\n",
       "      <td>0.3447</td>\n",
       "      <td>0.2057</td>\n",
       "      <td>0.291925</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>mistral-small:24b</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.34670</td>\n",
       "      <td>0.59220</td>\n",
       "      <td>0.8450</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.8067</td>\n",
       "      <td>0.7983</td>\n",
       "      <td>0.4233</td>\n",
       "      <td>0.77466</td>\n",
       "      <td>0.9257</td>\n",
       "      <td>0.1774</td>\n",
       "      <td>0.6371</td>\n",
       "      <td>0.335825</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>mistral-small:24b</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.52330</td>\n",
       "      <td>0.75180</td>\n",
       "      <td>0.7200</td>\n",
       "      <td>0.7467</td>\n",
       "      <td>0.9000</td>\n",
       "      <td>0.9583</td>\n",
       "      <td>0.5017</td>\n",
       "      <td>0.76534</td>\n",
       "      <td>0.8296</td>\n",
       "      <td>0.0148</td>\n",
       "      <td>0.6782</td>\n",
       "      <td>0.155425</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>llama3.1:70b</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.20330</td>\n",
       "      <td>0.48820</td>\n",
       "      <td>0.2067</td>\n",
       "      <td>0.2883</td>\n",
       "      <td>0.2383</td>\n",
       "      <td>0.7100</td>\n",
       "      <td>0.0283</td>\n",
       "      <td>0.29432</td>\n",
       "      <td>0.4712</td>\n",
       "      <td>0.1006</td>\n",
       "      <td>0.2067</td>\n",
       "      <td>0.033325</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>llama3.1:70b</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.20330</td>\n",
       "      <td>0.48820</td>\n",
       "      <td>0.8033</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.8633</td>\n",
       "      <td>0.9350</td>\n",
       "      <td>0.5617</td>\n",
       "      <td>0.83266</td>\n",
       "      <td>0.9071</td>\n",
       "      <td>0.0295</td>\n",
       "      <td>0.8120</td>\n",
       "      <td>0.269150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>llama3.1:70b</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.21000</td>\n",
       "      <td>0.47020</td>\n",
       "      <td>0.6100</td>\n",
       "      <td>0.7267</td>\n",
       "      <td>0.8217</td>\n",
       "      <td>0.9350</td>\n",
       "      <td>0.5067</td>\n",
       "      <td>0.72002</td>\n",
       "      <td>0.7529</td>\n",
       "      <td>0.0337</td>\n",
       "      <td>0.7635</td>\n",
       "      <td>0.050400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.48035</td>\n",
       "      <td>0.70080</td>\n",
       "      <td>0.3267</td>\n",
       "      <td>0.4150</td>\n",
       "      <td>0.3533</td>\n",
       "      <td>0.6267</td>\n",
       "      <td>0.0500</td>\n",
       "      <td>0.35434</td>\n",
       "      <td>0.6008</td>\n",
       "      <td>0.1300</td>\n",
       "      <td>0.2914</td>\n",
       "      <td>0.004175</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.42000</td>\n",
       "      <td>0.65900</td>\n",
       "      <td>0.8817</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.6433</td>\n",
       "      <td>0.5383</td>\n",
       "      <td>0.80166</td>\n",
       "      <td>0.9384</td>\n",
       "      <td>0.2782</td>\n",
       "      <td>0.6268</td>\n",
       "      <td>0.405825</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.77500</td>\n",
       "      <td>0.87060</td>\n",
       "      <td>0.7900</td>\n",
       "      <td>0.8417</td>\n",
       "      <td>0.9200</td>\n",
       "      <td>0.9667</td>\n",
       "      <td>0.7383</td>\n",
       "      <td>0.85134</td>\n",
       "      <td>0.8233</td>\n",
       "      <td>0.0137</td>\n",
       "      <td>0.8415</td>\n",
       "      <td>0.029575</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.28015</td>\n",
       "      <td>0.53845</td>\n",
       "      <td>0.3367</td>\n",
       "      <td>0.4817</td>\n",
       "      <td>0.3833</td>\n",
       "      <td>0.5633</td>\n",
       "      <td>0.0550</td>\n",
       "      <td>0.36400</td>\n",
       "      <td>0.6296</td>\n",
       "      <td>0.1574</td>\n",
       "      <td>0.2812</td>\n",
       "      <td>0.048350</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.34500</td>\n",
       "      <td>0.59460</td>\n",
       "      <td>0.8983</td>\n",
       "      <td>0.6400</td>\n",
       "      <td>0.9483</td>\n",
       "      <td>0.7717</td>\n",
       "      <td>0.6400</td>\n",
       "      <td>0.77966</td>\n",
       "      <td>0.9573</td>\n",
       "      <td>0.2183</td>\n",
       "      <td>0.7241</td>\n",
       "      <td>0.232525</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.44500</td>\n",
       "      <td>0.71140</td>\n",
       "      <td>0.7750</td>\n",
       "      <td>0.8767</td>\n",
       "      <td>0.9633</td>\n",
       "      <td>0.8967</td>\n",
       "      <td>0.6883</td>\n",
       "      <td>0.84000</td>\n",
       "      <td>0.8881</td>\n",
       "      <td>0.0610</td>\n",
       "      <td>0.7953</td>\n",
       "      <td>0.087525</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>gpt-4o</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.48985</td>\n",
       "      <td>0.52095</td>\n",
       "      <td>0.4517</td>\n",
       "      <td>0.5333</td>\n",
       "      <td>0.4700</td>\n",
       "      <td>0.6283</td>\n",
       "      <td>0.0617</td>\n",
       "      <td>0.42900</td>\n",
       "      <td>0.6534</td>\n",
       "      <td>0.2669</td>\n",
       "      <td>0.2612</td>\n",
       "      <td>0.297900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>gpt-4o</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.60000</td>\n",
       "      <td>0.43330</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.9933</td>\n",
       "      <td>0.9833</td>\n",
       "      <td>0.3633</td>\n",
       "      <td>0.3067</td>\n",
       "      <td>0.72398</td>\n",
       "      <td>0.5867</td>\n",
       "      <td>0.6279</td>\n",
       "      <td>0.3508</td>\n",
       "      <td>0.666650</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>gpt-4o</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.54000</td>\n",
       "      <td>0.54860</td>\n",
       "      <td>0.8533</td>\n",
       "      <td>0.9183</td>\n",
       "      <td>0.9483</td>\n",
       "      <td>0.8650</td>\n",
       "      <td>0.6783</td>\n",
       "      <td>0.85264</td>\n",
       "      <td>0.7343</td>\n",
       "      <td>0.1209</td>\n",
       "      <td>0.5879</td>\n",
       "      <td>0.338750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.28730</td>\n",
       "      <td>0.59490</td>\n",
       "      <td>0.3967</td>\n",
       "      <td>0.4983</td>\n",
       "      <td>0.3867</td>\n",
       "      <td>0.6333</td>\n",
       "      <td>0.1017</td>\n",
       "      <td>0.40334</td>\n",
       "      <td>0.6771</td>\n",
       "      <td>0.1118</td>\n",
       "      <td>0.3581</td>\n",
       "      <td>0.037925</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.29830</td>\n",
       "      <td>0.62090</td>\n",
       "      <td>0.9183</td>\n",
       "      <td>0.7033</td>\n",
       "      <td>0.9750</td>\n",
       "      <td>0.8700</td>\n",
       "      <td>0.7483</td>\n",
       "      <td>0.84298</td>\n",
       "      <td>0.9754</td>\n",
       "      <td>0.1174</td>\n",
       "      <td>0.8484</td>\n",
       "      <td>0.137500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.52170</td>\n",
       "      <td>0.77660</td>\n",
       "      <td>0.8250</td>\n",
       "      <td>0.9183</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.9533</td>\n",
       "      <td>0.7483</td>\n",
       "      <td>0.87798</td>\n",
       "      <td>0.9207</td>\n",
       "      <td>0.0309</td>\n",
       "      <td>0.8364</td>\n",
       "      <td>0.066250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>grok-3-mini</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.37670</td>\n",
       "      <td>0.63090</td>\n",
       "      <td>0.3433</td>\n",
       "      <td>0.5233</td>\n",
       "      <td>0.4383</td>\n",
       "      <td>0.8767</td>\n",
       "      <td>0.2300</td>\n",
       "      <td>0.48232</td>\n",
       "      <td>0.6346</td>\n",
       "      <td>0.0561</td>\n",
       "      <td>0.4478</td>\n",
       "      <td>0.081250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>grok-3-mini</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.37670</td>\n",
       "      <td>0.63090</td>\n",
       "      <td>0.9083</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9000</td>\n",
       "      <td>0.8667</td>\n",
       "      <td>0.6683</td>\n",
       "      <td>0.86866</td>\n",
       "      <td>0.9639</td>\n",
       "      <td>0.1277</td>\n",
       "      <td>0.7869</td>\n",
       "      <td>0.350825</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>grok-3-mini</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.40830</td>\n",
       "      <td>0.63730</td>\n",
       "      <td>0.8817</td>\n",
       "      <td>0.9267</td>\n",
       "      <td>0.9850</td>\n",
       "      <td>0.9550</td>\n",
       "      <td>0.8100</td>\n",
       "      <td>0.91168</td>\n",
       "      <td>0.9363</td>\n",
       "      <td>0.0400</td>\n",
       "      <td>0.7916</td>\n",
       "      <td>0.134575</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.38810</td>\n",
       "      <td>0.60820</td>\n",
       "      <td>0.3250</td>\n",
       "      <td>0.4550</td>\n",
       "      <td>0.4017</td>\n",
       "      <td>0.5600</td>\n",
       "      <td>0.0733</td>\n",
       "      <td>0.36300</td>\n",
       "      <td>0.5703</td>\n",
       "      <td>0.1661</td>\n",
       "      <td>0.2573</td>\n",
       "      <td>0.113750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.50330</td>\n",
       "      <td>0.69370</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.8467</td>\n",
       "      <td>0.9117</td>\n",
       "      <td>0.6733</td>\n",
       "      <td>0.5417</td>\n",
       "      <td>0.78368</td>\n",
       "      <td>0.9722</td>\n",
       "      <td>0.3101</td>\n",
       "      <td>0.6274</td>\n",
       "      <td>0.306700</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.48830</td>\n",
       "      <td>0.71400</td>\n",
       "      <td>0.8167</td>\n",
       "      <td>0.8883</td>\n",
       "      <td>0.9500</td>\n",
       "      <td>0.9467</td>\n",
       "      <td>0.7100</td>\n",
       "      <td>0.86234</td>\n",
       "      <td>0.9117</td>\n",
       "      <td>0.0396</td>\n",
       "      <td>0.7980</td>\n",
       "      <td>0.100000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>gemini-2.5-flash</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.37685</td>\n",
       "      <td>0.64855</td>\n",
       "      <td>0.3350</td>\n",
       "      <td>0.4950</td>\n",
       "      <td>0.4150</td>\n",
       "      <td>0.8450</td>\n",
       "      <td>0.2183</td>\n",
       "      <td>0.46166</td>\n",
       "      <td>0.6101</td>\n",
       "      <td>0.0360</td>\n",
       "      <td>0.4382</td>\n",
       "      <td>0.053325</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>gemini-2.5-flash</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.33000</td>\n",
       "      <td>0.63560</td>\n",
       "      <td>0.8983</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9683</td>\n",
       "      <td>0.9067</td>\n",
       "      <td>0.8483</td>\n",
       "      <td>0.92432</td>\n",
       "      <td>0.9383</td>\n",
       "      <td>0.0882</td>\n",
       "      <td>0.8392</td>\n",
       "      <td>0.319600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>gemini-2.5-flash</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.68170</td>\n",
       "      <td>0.78060</td>\n",
       "      <td>0.8917</td>\n",
       "      <td>0.9017</td>\n",
       "      <td>0.9533</td>\n",
       "      <td>0.9567</td>\n",
       "      <td>0.8550</td>\n",
       "      <td>0.91168</td>\n",
       "      <td>0.9256</td>\n",
       "      <td>0.0408</td>\n",
       "      <td>0.8349</td>\n",
       "      <td>0.080025</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>gpt-5-nano</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.33485</td>\n",
       "      <td>0.42090</td>\n",
       "      <td>0.5900</td>\n",
       "      <td>0.6400</td>\n",
       "      <td>0.6317</td>\n",
       "      <td>0.6767</td>\n",
       "      <td>0.1733</td>\n",
       "      <td>0.54234</td>\n",
       "      <td>0.7228</td>\n",
       "      <td>0.2988</td>\n",
       "      <td>0.3852</td>\n",
       "      <td>0.417925</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>gpt-5-nano</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.09000</td>\n",
       "      <td>0.09600</td>\n",
       "      <td>0.7783</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.7683</td>\n",
       "      <td>0.1950</td>\n",
       "      <td>0.0050</td>\n",
       "      <td>0.54932</td>\n",
       "      <td>0.7816</td>\n",
       "      <td>0.8039</td>\n",
       "      <td>0.0782</td>\n",
       "      <td>0.310025</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>gpt-5-nano</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.62500</td>\n",
       "      <td>0.74800</td>\n",
       "      <td>0.8417</td>\n",
       "      <td>0.8317</td>\n",
       "      <td>0.9717</td>\n",
       "      <td>0.7783</td>\n",
       "      <td>0.5333</td>\n",
       "      <td>0.79134</td>\n",
       "      <td>0.8965</td>\n",
       "      <td>0.2140</td>\n",
       "      <td>0.5845</td>\n",
       "      <td>0.412500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>deepseek-reasoner</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.29950</td>\n",
       "      <td>0.51855</td>\n",
       "      <td>0.1867</td>\n",
       "      <td>0.4017</td>\n",
       "      <td>0.3017</td>\n",
       "      <td>0.8200</td>\n",
       "      <td>0.1083</td>\n",
       "      <td>0.36368</td>\n",
       "      <td>0.4610</td>\n",
       "      <td>0.0396</td>\n",
       "      <td>0.3115</td>\n",
       "      <td>0.042075</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>deepseek-reasoner</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.42670</td>\n",
       "      <td>0.62990</td>\n",
       "      <td>0.7850</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.8533</td>\n",
       "      <td>0.6967</td>\n",
       "      <td>0.5150</td>\n",
       "      <td>0.72066</td>\n",
       "      <td>0.8240</td>\n",
       "      <td>0.2966</td>\n",
       "      <td>0.5637</td>\n",
       "      <td>0.285400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>deepseek-reasoner</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.20330</td>\n",
       "      <td>0.45820</td>\n",
       "      <td>0.6833</td>\n",
       "      <td>0.7567</td>\n",
       "      <td>0.8633</td>\n",
       "      <td>0.9300</td>\n",
       "      <td>0.5833</td>\n",
       "      <td>0.76332</td>\n",
       "      <td>0.8221</td>\n",
       "      <td>0.0537</td>\n",
       "      <td>0.7955</td>\n",
       "      <td>0.062500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.36885</td>\n",
       "      <td>0.66435</td>\n",
       "      <td>0.3100</td>\n",
       "      <td>0.4367</td>\n",
       "      <td>0.3883</td>\n",
       "      <td>0.8150</td>\n",
       "      <td>0.1950</td>\n",
       "      <td>0.42900</td>\n",
       "      <td>0.6314</td>\n",
       "      <td>0.0325</td>\n",
       "      <td>0.4698</td>\n",
       "      <td>0.033325</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.37330</td>\n",
       "      <td>0.65150</td>\n",
       "      <td>0.8533</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.8800</td>\n",
       "      <td>0.9300</td>\n",
       "      <td>0.7133</td>\n",
       "      <td>0.87532</td>\n",
       "      <td>0.8978</td>\n",
       "      <td>0.0622</td>\n",
       "      <td>0.7902</td>\n",
       "      <td>0.330000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.47170</td>\n",
       "      <td>0.62470</td>\n",
       "      <td>0.7733</td>\n",
       "      <td>0.8017</td>\n",
       "      <td>0.9217</td>\n",
       "      <td>0.9783</td>\n",
       "      <td>0.7483</td>\n",
       "      <td>0.84466</td>\n",
       "      <td>0.8197</td>\n",
       "      <td>0.0206</td>\n",
       "      <td>0.8089</td>\n",
       "      <td>0.105000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>gpt-5-mini</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.63145</td>\n",
       "      <td>0.76220</td>\n",
       "      <td>0.6533</td>\n",
       "      <td>0.6383</td>\n",
       "      <td>0.6817</td>\n",
       "      <td>0.6150</td>\n",
       "      <td>0.1433</td>\n",
       "      <td>0.54632</td>\n",
       "      <td>0.7718</td>\n",
       "      <td>0.3593</td>\n",
       "      <td>0.3725</td>\n",
       "      <td>0.471675</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>gpt-5-mini</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.59000</td>\n",
       "      <td>0.73100</td>\n",
       "      <td>0.8800</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.7500</td>\n",
       "      <td>0.6733</td>\n",
       "      <td>0.3400</td>\n",
       "      <td>0.72866</td>\n",
       "      <td>0.9258</td>\n",
       "      <td>0.3267</td>\n",
       "      <td>0.4464</td>\n",
       "      <td>0.550825</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>gpt-5-mini</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.68000</td>\n",
       "      <td>0.78390</td>\n",
       "      <td>0.8683</td>\n",
       "      <td>0.9167</td>\n",
       "      <td>0.9500</td>\n",
       "      <td>0.7367</td>\n",
       "      <td>0.5450</td>\n",
       "      <td>0.80334</td>\n",
       "      <td>0.9104</td>\n",
       "      <td>0.2617</td>\n",
       "      <td>0.5503</td>\n",
       "      <td>0.500400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>o3</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.31420</td>\n",
       "      <td>0.58920</td>\n",
       "      <td>0.3441</td>\n",
       "      <td>0.5068</td>\n",
       "      <td>0.3864</td>\n",
       "      <td>0.8661</td>\n",
       "      <td>0.1915</td>\n",
       "      <td>0.45898</td>\n",
       "      <td>0.6246</td>\n",
       "      <td>0.0553</td>\n",
       "      <td>0.4383</td>\n",
       "      <td>0.089800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>o3</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.32500</td>\n",
       "      <td>0.60340</td>\n",
       "      <td>0.8200</td>\n",
       "      <td>0.8833</td>\n",
       "      <td>0.9133</td>\n",
       "      <td>0.9250</td>\n",
       "      <td>0.7467</td>\n",
       "      <td>0.85766</td>\n",
       "      <td>0.9324</td>\n",
       "      <td>0.0674</td>\n",
       "      <td>0.8397</td>\n",
       "      <td>0.265825</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>o3</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.37830</td>\n",
       "      <td>0.56030</td>\n",
       "      <td>0.7367</td>\n",
       "      <td>0.9200</td>\n",
       "      <td>0.9450</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.7717</td>\n",
       "      <td>0.86934</td>\n",
       "      <td>0.8218</td>\n",
       "      <td>0.0244</td>\n",
       "      <td>0.8082</td>\n",
       "      <td>0.138750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>gpt-5</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>0.58595</td>\n",
       "      <td>0.76895</td>\n",
       "      <td>0.6100</td>\n",
       "      <td>0.6417</td>\n",
       "      <td>0.6500</td>\n",
       "      <td>0.7350</td>\n",
       "      <td>0.2167</td>\n",
       "      <td>0.57068</td>\n",
       "      <td>0.7816</td>\n",
       "      <td>0.2211</td>\n",
       "      <td>0.4816</td>\n",
       "      <td>0.320000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>gpt-5</td>\n",
       "      <td>classification</td>\n",
       "      <td>0.55830</td>\n",
       "      <td>0.75340</td>\n",
       "      <td>0.9233</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9650</td>\n",
       "      <td>0.7067</td>\n",
       "      <td>0.5983</td>\n",
       "      <td>0.83866</td>\n",
       "      <td>0.9663</td>\n",
       "      <td>0.2920</td>\n",
       "      <td>0.6683</td>\n",
       "      <td>0.475000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>gpt-5</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.64500</td>\n",
       "      <td>0.80590</td>\n",
       "      <td>0.9067</td>\n",
       "      <td>0.9400</td>\n",
       "      <td>0.9817</td>\n",
       "      <td>0.7900</td>\n",
       "      <td>0.6533</td>\n",
       "      <td>0.85434</td>\n",
       "      <td>0.9611</td>\n",
       "      <td>0.2100</td>\n",
       "      <td>0.6917</td>\n",
       "      <td>0.345850</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        llm          action  ?A1=A1*  J(A1-A1*)  ?A1=A2  \\\n",
       "0               llama3.1:8b       zero-shot  0.02925    0.21525  0.0167   \n",
       "1               llama3.1:8b  classification  0.04500    0.29440  0.1633   \n",
       "2               llama3.1:8b          fixing  0.02170    0.16590  0.1267   \n",
       "3               gpt-oss:20b       zero-shot  0.22705    0.44435  0.2167   \n",
       "4               gpt-oss:20b  classification  0.27170    0.48920  0.6717   \n",
       "5               gpt-oss:20b          fixing  0.24330    0.49460  0.5833   \n",
       "6   gpt-4.1-nano-2025-04-14       zero-shot  0.58525    0.68235  0.2783   \n",
       "7   gpt-4.1-nano-2025-04-14  classification  0.96000    0.96080  0.9733   \n",
       "8   gpt-4.1-nano-2025-04-14          fixing  0.34830    0.48520  0.5467   \n",
       "9         mistral-small:24b       zero-shot  0.43010    0.64105  0.4407   \n",
       "10        mistral-small:24b  classification  0.34670    0.59220  0.8450   \n",
       "11        mistral-small:24b          fixing  0.52330    0.75180  0.7200   \n",
       "12             llama3.1:70b       zero-shot  0.20330    0.48820  0.2067   \n",
       "13             llama3.1:70b  classification  0.20330    0.48820  0.8033   \n",
       "14             llama3.1:70b          fixing  0.21000    0.47020  0.6100   \n",
       "15         gemini-2.0-flash       zero-shot  0.48035    0.70080  0.3267   \n",
       "16         gemini-2.0-flash  classification  0.42000    0.65900  0.8817   \n",
       "17         gemini-2.0-flash          fixing  0.77500    0.87060  0.7900   \n",
       "18  gpt-4.1-mini-2025-04-14       zero-shot  0.28015    0.53845  0.3367   \n",
       "19  gpt-4.1-mini-2025-04-14  classification  0.34500    0.59460  0.8983   \n",
       "20  gpt-4.1-mini-2025-04-14          fixing  0.44500    0.71140  0.7750   \n",
       "21                   gpt-4o       zero-shot  0.48985    0.52095  0.4517   \n",
       "22                   gpt-4o  classification  0.60000    0.43330  0.9733   \n",
       "23                   gpt-4o          fixing  0.54000    0.54860  0.8533   \n",
       "24       gpt-4.1-2025-04-14       zero-shot  0.28730    0.59490  0.3967   \n",
       "25       gpt-4.1-2025-04-14  classification  0.29830    0.62090  0.9183   \n",
       "26       gpt-4.1-2025-04-14          fixing  0.52170    0.77660  0.8250   \n",
       "27              grok-3-mini       zero-shot  0.37670    0.63090  0.3433   \n",
       "28              grok-3-mini  classification  0.37670    0.63090  0.9083   \n",
       "29              grok-3-mini          fixing  0.40830    0.63730  0.8817   \n",
       "30            deepseek-chat       zero-shot  0.38810    0.60820  0.3250   \n",
       "31            deepseek-chat  classification  0.50330    0.69370  0.9450   \n",
       "32            deepseek-chat          fixing  0.48830    0.71400  0.8167   \n",
       "33         gemini-2.5-flash       zero-shot  0.37685    0.64855  0.3350   \n",
       "34         gemini-2.5-flash  classification  0.33000    0.63560  0.8983   \n",
       "35         gemini-2.5-flash          fixing  0.68170    0.78060  0.8917   \n",
       "36               gpt-5-nano       zero-shot  0.33485    0.42090  0.5900   \n",
       "37               gpt-5-nano  classification  0.09000    0.09600  0.7783   \n",
       "38               gpt-5-nano          fixing  0.62500    0.74800  0.8417   \n",
       "39        deepseek-reasoner       zero-shot  0.29950    0.51855  0.1867   \n",
       "40        deepseek-reasoner  classification  0.42670    0.62990  0.7850   \n",
       "41        deepseek-reasoner          fixing  0.20330    0.45820  0.6833   \n",
       "42           gemini-2.5-pro       zero-shot  0.36885    0.66435  0.3100   \n",
       "43           gemini-2.5-pro  classification  0.37330    0.65150  0.8533   \n",
       "44           gemini-2.5-pro          fixing  0.47170    0.62470  0.7733   \n",
       "45               gpt-5-mini       zero-shot  0.63145    0.76220  0.6533   \n",
       "46               gpt-5-mini  classification  0.59000    0.73100  0.8800   \n",
       "47               gpt-5-mini          fixing  0.68000    0.78390  0.8683   \n",
       "48                       o3       zero-shot  0.31420    0.58920  0.3441   \n",
       "49                       o3  classification  0.32500    0.60340  0.8200   \n",
       "50                       o3          fixing  0.37830    0.56030  0.7367   \n",
       "51                    gpt-5       zero-shot  0.58595    0.76895  0.6100   \n",
       "52                    gpt-5  classification  0.55830    0.75340  0.9233   \n",
       "53                    gpt-5          fixing  0.64500    0.80590  0.9067   \n",
       "\n",
       "    ?A1>A3  ?A1>A4  ?A3∅A4  ?A4=A1|3   ave_?A  J(A1-A2)  J(A3-A4)  J(A4-A1|3)  \\\n",
       "0   0.0300  0.0117  0.8433    0.0000  0.18034    0.1667    0.0329      0.0644   \n",
       "1   1.0000  0.3383  0.6817    0.0500  0.44666    0.5603    0.0974      0.3899   \n",
       "2   0.1550  0.1233  0.6183    0.0183  0.20832    0.4549    0.1126      0.2414   \n",
       "3   0.3800  0.3250  0.8300    0.1000  0.37034    0.4413    0.0775      0.2927   \n",
       "4   1.0000  0.7317  0.8467    0.3800  0.72602    0.7939    0.1073      0.6099   \n",
       "5   0.8067  0.9233  0.9533    0.6200  0.77732    0.7650    0.0397      0.7864   \n",
       "6   0.4567  0.3717  0.6200    0.0400  0.35334    0.5129    0.1915      0.2037   \n",
       "7   0.7167  0.5667  0.3817    0.0000  0.52768    0.9757    0.6149      0.0017   \n",
       "8   0.5917  0.6017  0.6183    0.2150  0.51468    0.6237    0.2257      0.3509   \n",
       "9   0.5034  0.4525  0.5085    0.0153  0.38408    0.6399    0.3447      0.2057   \n",
       "10  1.0000  0.8067  0.7983    0.4233  0.77466    0.9257    0.1774      0.6371   \n",
       "11  0.7467  0.9000  0.9583    0.5017  0.76534    0.8296    0.0148      0.6782   \n",
       "12  0.2883  0.2383  0.7100    0.0283  0.29432    0.4712    0.1006      0.2067   \n",
       "13  1.0000  0.8633  0.9350    0.5617  0.83266    0.9071    0.0295      0.8120   \n",
       "14  0.7267  0.8217  0.9350    0.5067  0.72002    0.7529    0.0337      0.7635   \n",
       "15  0.4150  0.3533  0.6267    0.0500  0.35434    0.6008    0.1300      0.2914   \n",
       "16  1.0000  0.9450  0.6433    0.5383  0.80166    0.9384    0.2782      0.6268   \n",
       "17  0.8417  0.9200  0.9667    0.7383  0.85134    0.8233    0.0137      0.8415   \n",
       "18  0.4817  0.3833  0.5633    0.0550  0.36400    0.6296    0.1574      0.2812   \n",
       "19  0.6400  0.9483  0.7717    0.6400  0.77966    0.9573    0.2183      0.7241   \n",
       "20  0.8767  0.9633  0.8967    0.6883  0.84000    0.8881    0.0610      0.7953   \n",
       "21  0.5333  0.4700  0.6283    0.0617  0.42900    0.6534    0.2669      0.2612   \n",
       "22  0.9933  0.9833  0.3633    0.3067  0.72398    0.5867    0.6279      0.3508   \n",
       "23  0.9183  0.9483  0.8650    0.6783  0.85264    0.7343    0.1209      0.5879   \n",
       "24  0.4983  0.3867  0.6333    0.1017  0.40334    0.6771    0.1118      0.3581   \n",
       "25  0.7033  0.9750  0.8700    0.7483  0.84298    0.9754    0.1174      0.8484   \n",
       "26  0.9183  0.9450  0.9533    0.7483  0.87798    0.9207    0.0309      0.8364   \n",
       "27  0.5233  0.4383  0.8767    0.2300  0.48232    0.6346    0.0561      0.4478   \n",
       "28  1.0000  0.9000  0.8667    0.6683  0.86866    0.9639    0.1277      0.7869   \n",
       "29  0.9267  0.9850  0.9550    0.8100  0.91168    0.9363    0.0400      0.7916   \n",
       "30  0.4550  0.4017  0.5600    0.0733  0.36300    0.5703    0.1661      0.2573   \n",
       "31  0.8467  0.9117  0.6733    0.5417  0.78368    0.9722    0.3101      0.6274   \n",
       "32  0.8883  0.9500  0.9467    0.7100  0.86234    0.9117    0.0396      0.7980   \n",
       "33  0.4950  0.4150  0.8450    0.2183  0.46166    0.6101    0.0360      0.4382   \n",
       "34  1.0000  0.9683  0.9067    0.8483  0.92432    0.9383    0.0882      0.8392   \n",
       "35  0.9017  0.9533  0.9567    0.8550  0.91168    0.9256    0.0408      0.8349   \n",
       "36  0.6400  0.6317  0.6767    0.1733  0.54234    0.7228    0.2988      0.3852   \n",
       "37  1.0000  0.7683  0.1950    0.0050  0.54932    0.7816    0.8039      0.0782   \n",
       "38  0.8317  0.9717  0.7783    0.5333  0.79134    0.8965    0.2140      0.5845   \n",
       "39  0.4017  0.3017  0.8200    0.1083  0.36368    0.4610    0.0396      0.3115   \n",
       "40  0.7533  0.8533  0.6967    0.5150  0.72066    0.8240    0.2966      0.5637   \n",
       "41  0.7567  0.8633  0.9300    0.5833  0.76332    0.8221    0.0537      0.7955   \n",
       "42  0.4367  0.3883  0.8150    0.1950  0.42900    0.6314    0.0325      0.4698   \n",
       "43  1.0000  0.8800  0.9300    0.7133  0.87532    0.8978    0.0622      0.7902   \n",
       "44  0.8017  0.9217  0.9783    0.7483  0.84466    0.8197    0.0206      0.8089   \n",
       "45  0.6383  0.6817  0.6150    0.1433  0.54632    0.7718    0.3593      0.3725   \n",
       "46  1.0000  0.7500  0.6733    0.3400  0.72866    0.9258    0.3267      0.4464   \n",
       "47  0.9167  0.9500  0.7367    0.5450  0.80334    0.9104    0.2617      0.5503   \n",
       "48  0.5068  0.3864  0.8661    0.1915  0.45898    0.6246    0.0553      0.4383   \n",
       "49  0.8833  0.9133  0.9250    0.7467  0.85766    0.9324    0.0674      0.8397   \n",
       "50  0.9200  0.9450  0.9733    0.7717  0.86934    0.8218    0.0244      0.8082   \n",
       "51  0.6417  0.6500  0.7350    0.2167  0.57068    0.7816    0.2211      0.4816   \n",
       "52  1.0000  0.9650  0.7067    0.5983  0.83866    0.9663    0.2920      0.6683   \n",
       "53  0.9400  0.9817  0.7900    0.6533  0.85434    0.9611    0.2100      0.6917   \n",
       "\n",
       "         idk  \n",
       "0   0.016650  \n",
       "1   0.250825  \n",
       "2   0.002100  \n",
       "3   0.125425  \n",
       "4   0.370425  \n",
       "5   0.133350  \n",
       "6   0.125825  \n",
       "7   0.808725  \n",
       "8   0.193750  \n",
       "9   0.291925  \n",
       "10  0.335825  \n",
       "11  0.155425  \n",
       "12  0.033325  \n",
       "13  0.269150  \n",
       "14  0.050400  \n",
       "15  0.004175  \n",
       "16  0.405825  \n",
       "17  0.029575  \n",
       "18  0.048350  \n",
       "19  0.232525  \n",
       "20  0.087525  \n",
       "21  0.297900  \n",
       "22  0.666650  \n",
       "23  0.338750  \n",
       "24  0.037925  \n",
       "25  0.137500  \n",
       "26  0.066250  \n",
       "27  0.081250  \n",
       "28  0.350825  \n",
       "29  0.134575  \n",
       "30  0.113750  \n",
       "31  0.306700  \n",
       "32  0.100000  \n",
       "33  0.053325  \n",
       "34  0.319600  \n",
       "35  0.080025  \n",
       "36  0.417925  \n",
       "37  0.310025  \n",
       "38  0.412500  \n",
       "39  0.042075  \n",
       "40  0.285400  \n",
       "41  0.062500  \n",
       "42  0.033325  \n",
       "43  0.330000  \n",
       "44  0.105000  \n",
       "45  0.471675  \n",
       "46  0.550825  \n",
       "47  0.500400  \n",
       "48  0.089800  \n",
       "49  0.265825  \n",
       "50  0.138750  \n",
       "51  0.320000  \n",
       "52  0.475000  \n",
       "53  0.345850  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved ordered table to: ../../output/selected_ordered_llm_action.csv\n"
     ]
    }
   ],
   "source": [
    "# Build a table ordered by LLM, then action, restricted to the selected metric columns.\n",
    "# Reads the notebook-level `df`, narrows it to the 'overall' dataset, displays the result and saves it as CSV.\n",
    "from IPython.display import display\n",
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "# Requested order and columns\n",
    "llms = ['llama3.1:8b',\n",
    " 'gpt-oss:20b',\n",
    " 'gpt-4.1-nano-2025-04-14',\n",
    " 'mistral-small:24b',\n",
    " 'llama3.1:70b',\n",
    " 'gemini-2.0-flash',\n",
    " 'gpt-4.1-mini-2025-04-14',\n",
    " 'gpt-4o',\n",
    " 'gpt-4.1-2025-04-14',\n",
    " 'grok-3-mini',\n",
    " 'deepseek-chat',\n",
    " 'gemini-2.5-flash',\n",
    " 'gpt-5-nano',\n",
    " 'deepseek-reasoner',\n",
    " 'gemini-2.5-pro',\n",
    " 'gpt-5-mini',\n",
    " 'o3',\n",
    " 'gpt-5']\n",
    "\n",
    "actions = ['zero-shot', 'classification', 'fixing']\n",
    "cols = [\"llm\",\"action\", \"?A1=A1*\", \"J(A1-A1*)\", \"?A1=A2\", \"?A1>A3\",\"?A1>A4\",\"?A3∅A4\",\"?A4=A1|3\",\"ave_?A\",\n",
    "        \"J(A1-A2)\",\"J(A3-A4)\",\"J(A4-A1|3)\",\"idk\"]\n",
    "\n",
    "# Fail early with a readable message if the loading cell has not been run.\n",
    "# `from None` hides the internal NameError so only the actionable message is shown.\n",
    "try:\n",
    "    df\n",
    "except NameError:\n",
    "    raise RuntimeError(\"DataFrame `df` not found in the notebook namespace. Run the cell that loads the CSV first.\") from None\n",
    "\n",
    "# Filter to overall if present. `df` is rebound on purpose: the LaTeX cell below reads the filtered frame.\n",
    "if 'dataset' in df.columns:\n",
    "    df = df[df['dataset']=='overall'].copy()\n",
    "\n",
    "# Compute ave_?A if components exist and column missing\n",
    "ave_components = [\"?A1=A2\", \"?A1>A3\",\"?A1>A4\",\"?A3∅A4\",\"?A4=A1|3\"]\n",
    "if 'ave_?A' not in df.columns and set(ave_components).issubset(df.columns):\n",
    "    df['ave_?A'] = df[ave_components].mean(axis=1)\n",
    "\n",
    "# Ensure llm and action columns exist (warn if not)\n",
    "if 'llm' not in df.columns:\n",
    "    print(\"Warning: 'llm' column not found in data; ordering by llms will have no effect.\")\n",
    "if 'action' not in df.columns:\n",
    "    print(\"Warning: 'action' column not found in data; ordering by actions will have no effect.\")\n",
    "\n",
    "# Warn about requested columns that the data does not provide\n",
    "missing_cols = [c for c in cols if c not in df.columns]\n",
    "if missing_cols:\n",
    "    print('Warning: these requested columns are missing and will be omitted:', missing_cols)\n",
    "\n",
    "\n",
    "def select_pair(frame, llm_name, action_name):\n",
    "    \"\"\"Return the rows of `frame` that match one (llm, action) pair.\n",
    "\n",
    "    Matches on whichever of the 'llm' / 'action' columns exist; when neither\n",
    "    exists a copy of the whole frame is returned.\n",
    "    \"\"\"\n",
    "    has_llm = 'llm' in frame.columns\n",
    "    has_action = 'action' in frame.columns\n",
    "    if has_llm and has_action:\n",
    "        return frame[(frame['llm']==llm_name) & (frame['action']==action_name)]\n",
    "    if has_llm:\n",
    "        return frame[frame['llm']==llm_name]\n",
    "    if has_action:\n",
    "        return frame[frame['action']==action_name]\n",
    "    return frame.copy()\n",
    "\n",
    "\n",
    "def summarise_pair(frame, llm_name, action_name, columns):\n",
    "    \"\"\"Build one output row (a dict) for an (llm, action) pair.\n",
    "\n",
    "    Numeric columns are averaged over the matching rows and non-numeric columns\n",
    "    take the first matching value. Columns absent from `frame`, and every metric\n",
    "    of a pair without matching rows, are set to pd.NA. The 'llm' and 'action'\n",
    "    entries always carry the requested names.\n",
    "    \"\"\"\n",
    "    sel = select_pair(frame, llm_name, action_name)\n",
    "    numeric = sel.select_dtypes(include='number')\n",
    "    non_numeric = sel.select_dtypes(exclude='number')\n",
    "    row = {'llm': llm_name, 'action': action_name}\n",
    "    for c in columns:\n",
    "        if c in ('llm', 'action'):\n",
    "            continue\n",
    "        if sel.empty:\n",
    "            # missing pair -> keep the row, but with empty metrics\n",
    "            row[c] = pd.NA\n",
    "        elif c in numeric.columns:\n",
    "            row[c] = numeric[c].mean()\n",
    "        elif c in non_numeric.columns:\n",
    "            row[c] = non_numeric[c].iloc[0]\n",
    "        else:\n",
    "            row[c] = pd.NA\n",
    "    return row\n",
    "\n",
    "\n",
    "# One row per (llm, action), in the requested order\n",
    "rows = [summarise_pair(df, llm_name, action_name, cols)\n",
    "        for llm_name in llms\n",
    "        for action_name in actions]\n",
    "\n",
    "# Create DataFrame and ensure column order\n",
    "ordered_df = pd.DataFrame(rows)\n",
    "# Keep only columns that exist in ordered_df and in requested order\n",
    "final_cols = [c for c in cols if c in ordered_df.columns]\n",
    "ordered_df = ordered_df[final_cols]\n",
    "\n",
    "# Display and save to CSV (in output folder)\n",
    "display(ordered_df)\n",
    "out_folder = '../../output'\n",
    "os.makedirs(out_folder, exist_ok=True)\n",
    "out_csv = os.path.join(out_folder, 'selected_ordered_llm_action.csv')\n",
    "ordered_df.to_csv(out_csv, index=False)\n",
    "print(f'Saved ordered table to: {out_csv}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "id": "2c9812ec",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A & \\sf Llama-3.1-8b & Base  & 2.92 & 21.52 & 1.67 & 3.00 & 1.17 & 84.33 & 0.00 & 18.03 & 16.67 & 3.29 & 6.44 & 1.66 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 16.33 & \\textbf{100.00} & 33.83 & 68.17 & 5.00 & 44.67 & 56.03 & 9.74 & 38.99 & 25.08 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 12.67 & 15.50 & 12.33 & 61.83 & 1.83 & 20.83 & 45.49 & 11.26 & 24.14 & 0.21 \\\\\n",
      "\\midrule\n",
      "B & \\sf GPT-oss-20b & Base  & 22.71 & 44.44 & 21.67 & 38.00 & 32.50 & 83.00 & 10.00 & 37.03 & 44.13 & 7.75 & 29.27 & 12.54 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 67.17 & \\textbf{100.00} & 73.17 & 84.67 & 38.00 & 72.60 & 79.39 & 10.73 & 60.99 & 37.04 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 58.33 & 80.67 & 92.33 & 95.33 & 62.00 & 77.73 & 76.50 & 3.97 & 78.64 & 13.33 \\\\\n",
      "\\midrule\n",
      "C & \\sf gpt-4.1-nano-2025-04-14 & Base  & 58.52 & 68.23 & 27.83 & 45.67 & 37.17 & 62.00 & 4.00 & 35.33 & 51.29 & 19.15 & 20.37 & 12.58 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & \\textbf{97.33} & 71.67 & 56.67 & 38.17 & 0.00 & 52.77 & \\textbf{97.57} & 61.49 & 0.17 & \\textbf{80.87} \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 54.67 & 59.17 & 60.17 & 61.83 & 21.50 & 51.47 & 62.37 & 22.57 & 35.09 & 19.37 \\\\\n",
      "\\midrule\n",
      "D & \\sf Mistral-small:24b & Base  & 43.01 & 64.10 & 44.07 & 50.34 & 45.25 & 50.85 & 1.53 & 38.41 & 63.99 & 34.47 & 20.57 & 29.19 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 84.50 & \\textbf{100.00} & 80.67 & 79.83 & 42.33 & 77.47 & 92.57 & 17.74 & 63.71 & 33.58 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 72.00 & 74.67 & 90.00 & 95.83 & 50.17 & 76.53 & 82.96 & 1.48 & 67.82 & 15.54 \\\\\n",
      "\\midrule\n",
      "E & \\sf Llama-3.1-70b & Base  & 20.33 & 48.82 & 20.67 & 28.83 & 23.83 & 71.00 & 2.83 & 29.43 & 47.12 & 10.06 & 20.67 & 3.33 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 80.33 & \\textbf{100.00} & 86.33 & 93.50 & 56.17 & 83.27 & 90.71 & 2.95 & 81.20 & 26.91 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 61.00 & 72.67 & 82.17 & 93.50 & 50.67 & 72.00 & 75.29 & 3.37 & 76.35 & 5.04 \\\\\n",
      "\\midrule\n",
      "F & \\sf Gemini-2.0-flash & Base  & 48.03 & 70.08 & 32.67 & 41.50 & 35.33 & 62.67 & 5.00 & 35.43 & 60.08 & 13.00 & 29.14 & 0.42 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 88.17 & \\textbf{100.00} & 94.50 & 64.33 & 53.83 & 80.17 & 93.84 & 27.82 & 62.68 & 40.58 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 79.00 & 84.17 & 92.00 & 96.67 & 73.83 & 85.13 & 82.33 & 1.37 & 84.15 & 2.96 \\\\\n",
      "\\midrule\n",
      "G & \\sf gpt-4.1-mini-2025-04-14 & Base  & 28.02 & 53.84 & 33.67 & 48.17 & 38.33 & 56.33 & 5.50 & 36.40 & 62.96 & 15.74 & 28.12 & 4.83 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 89.83 & 64.00 & 94.83 & 77.17 & 64.00 & 77.97 & 95.73 & 21.83 & 72.41 & 23.25 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 77.50 & 87.67 & 96.33 & 89.67 & 68.83 & 84.00 & 88.81 & 6.10 & 79.53 & 8.75 \\\\\n",
      "\\midrule\n",
      "H & \\sf GPT-4o & Base  & 48.98 & 52.09 & 45.17 & 53.33 & 47.00 & 62.83 & 6.17 & 42.90 & 65.34 & 26.69 & 26.12 & 29.79 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & \\textbf{97.33} & 99.33 & 98.33 & 36.33 & 30.67 & 72.40 & 58.67 & 62.79 & 35.08 & 66.66 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 85.33 & 91.83 & 94.83 & 86.50 & 67.83 & 85.26 & 73.43 & 12.09 & 58.79 & 33.88 \\\\\n",
      "\\midrule\n",
      "I & \\sf gpt-4.1-2025-04-14 & Base  & 28.73 & 59.49 & 39.67 & 49.83 & 38.67 & 63.33 & 10.17 & 40.33 & 67.71 & 11.18 & 35.81 & 3.79 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 91.83 & 70.33 & 97.50 & 87.00 & 74.83 & 84.30 & 97.54 & 11.74 & \\textbf{84.84} & 13.75 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 82.50 & 91.83 & 94.50 & 95.33 & 74.83 & 87.80 & 92.07 & 3.09 & 83.64 & 6.62 \\\\\n",
      "\\midrule\n",
      "J & \\sf Grok-3-mini & Base  & 37.67 & 63.09 & 34.33 & 52.33 & 43.83 & 87.67 & 23.00 & 48.23 & 63.46 & 5.61 & 44.78 & 8.12 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 90.83 & \\textbf{100.00} & 90.00 & 86.67 & 66.83 & 86.87 & 96.39 & 12.77 & 78.69 & 35.08 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 88.17 & 92.67 & \\textbf{98.50} & 95.50 & 81.00 & 91.17 & 93.63 & 4.00 & 79.16 & 13.46 \\\\\n",
      "\\midrule\n",
      "K & \\sf DeepSeek-V3.1 & Base  & 38.81 & 60.82 & 32.50 & 45.50 & 40.17 & 56.00 & 7.33 & 36.30 & 57.03 & 16.61 & 25.73 & 11.38 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 94.50 & 84.67 & 91.17 & 67.33 & 54.17 & 78.37 & 97.22 & 31.01 & 62.74 & 30.67 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 81.67 & 88.83 & 95.00 & 94.67 & 71.00 & 86.23 & 91.17 & 3.96 & 79.80 & 10.00 \\\\\n",
      "\\midrule\n",
      "L & \\sf Gemini-2.5-flash & Base  & 37.69 & 64.85 & 33.50 & 49.50 & 41.50 & 84.50 & 21.83 & 46.17 & 61.01 & 3.60 & 43.82 & 5.33 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 89.83 & \\textbf{100.00} & 96.83 & 90.67 & 84.83 & \\textbf{92.43} & 93.83 & 8.82 & 83.92 & 31.96 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 89.17 & 90.17 & 95.33 & 95.67 & \\textbf{85.50} & 91.17 & 92.56 & 4.08 & 83.49 & 8.00 \\\\\n",
      "\\midrule\n",
      "M & \\sf GPT-5-nano & Base  & 33.48 & 42.09 & 59.00 & 64.00 & 63.17 & 67.67 & 17.33 & 54.23 & 72.28 & 29.88 & 38.52 & 41.79 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 77.83 & \\textbf{100.00} & 76.83 & 19.50 & 0.50 & 54.93 & 78.16 & \\textbf{80.39} & 7.82 & 31.00 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 84.17 & 83.17 & 97.17 & 77.83 & 53.33 & 79.13 & 89.65 & 21.40 & 58.45 & 41.25 \\\\\n",
      "\\midrule\n",
      "N & \\sf DeepSeek-reasoner & Base  & 29.95 & 51.86 & 18.67 & 40.17 & 30.17 & 82.00 & 10.83 & 36.37 & 46.10 & 3.96 & 31.15 & 4.21 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 78.50 & 75.33 & 85.33 & 69.67 & 51.50 & 72.07 & 82.40 & 29.66 & 56.37 & 28.54 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 68.33 & 75.67 & 86.33 & 93.00 & 58.33 & 76.33 & 82.21 & 5.37 & 79.55 & 6.25 \\\\\n",
      "\\midrule\n",
      "O & \\sf Gemini-2.5-pro & Base  & 36.88 & 66.44 & 31.00 & 43.67 & 38.83 & 81.50 & 19.50 & 42.90 & 63.14 & 3.25 & 46.98 & 3.33 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 85.33 & \\textbf{100.00} & 88.00 & 93.00 & 71.33 & 87.53 & 89.78 & 6.22 & 79.02 & 33.00 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 77.33 & 80.17 & 92.17 & \\textbf{97.83} & 74.83 & 84.47 & 81.97 & 2.06 & 80.89 & 10.50 \\\\\n",
      "\\midrule\n",
      "P & \\sf GPT-5-mini & Base  & 63.15 & 76.22 & 65.33 & 63.83 & 68.17 & 61.50 & 14.33 & 54.63 & 77.18 & 35.93 & 37.25 & 47.17 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 88.00 & \\textbf{100.00} & 75.00 & 67.33 & 34.00 & 72.87 & 92.58 & 32.67 & 44.64 & 55.08 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 86.83 & 91.67 & 95.00 & 73.67 & 54.50 & 80.33 & 91.04 & 26.17 & 55.03 & 50.04 \\\\\n",
      "\\midrule\n",
      "Q & \\sf GPT-o3 & Base  & 31.42 & 58.92 & 34.41 & 50.68 & 38.64 & 86.61 & 19.15 & 45.90 & 62.46 & 5.53 & 43.83 & 8.98 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 82.00 & 88.33 & 91.33 & 92.50 & 74.67 & 85.77 & 93.24 & 6.74 & 83.97 & 26.58 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 73.67 & 92.00 & 94.50 & 97.33 & 77.17 & 86.93 & 82.18 & 2.44 & 80.82 & 13.88 \\\\\n",
      "\\midrule\n",
      "R & \\sf GPT-5 & Base  & 58.59 & 76.89 & 61.00 & 64.17 & 65.00 & 73.50 & 21.67 & 57.07 & 78.16 & 22.11 & 48.16 & 32.00 \\\\\n",
      "  &   & CtE & \\ditto & \\ditto & 92.33 & \\textbf{100.00} & 96.50 & 70.67 & 59.83 & 83.87 & 96.63 & 29.20 & 66.83 & 47.50 \\\\\n",
      "  &   & Ora. & \\ditto & \\ditto & 90.67 & 94.00 & 98.17 & 79.00 & 65.33 & 85.43 & 96.11 & 21.00 & 69.17 & 34.59 \\\\\n",
      "\\midrule\n",
      "Saved LaTeX table rows to: ../../output/onboard_updated_table.tex\n"
     ]
    }
   ],
   "source": [
    "# Define display names for LLMs. Keys must match the identifiers listed in `llms`, so the dated\n",
    "# GPT-4.1 ids are mapped alongside their short aliases (otherwise the raw id ends up in the table).\n",
    "display_names = {\n",
    "    'llama3.1:8b': 'Llama-3.1-8b',\n",
    "    'gpt-oss:20b': 'GPT-oss-20b',\n",
    "    'gpt-4.1-nano': 'GPT-4.1-nano',\n",
    "    'gpt-4.1-nano-2025-04-14': 'GPT-4.1-nano',\n",
    "    'mistral-small:24b': 'Mistral-small:24b',\n",
    "    'llama3.1:70b': 'Llama-3.1-70b',\n",
    "    'gemini-2.0-flash': 'Gemini-2.0-flash',\n",
    "    'gpt-4.1-mini': 'GPT-4.1-mini',\n",
    "    'gpt-4.1-mini-2025-04-14': 'GPT-4.1-mini',\n",
    "    'gpt-4o': 'GPT-4o',\n",
    "    'gpt-4.1': 'GPT-4.1',\n",
    "    'gpt-4.1-2025-04-14': 'GPT-4.1',\n",
    "    'grok-3-mini': 'Grok-3-mini',\n",
    "    'deepseek-chat': 'DeepSeek-V3.1',\n",
    "    'gemini-2.5-flash': 'Gemini-2.5-flash',\n",
    "    'gpt-5-nano': 'GPT-5-nano',\n",
    "    'deepseek-reasoner': 'DeepSeek-reasoner',\n",
    "    'gemini-2.5-pro': 'Gemini-2.5-pro',\n",
    "    'gpt-5-mini': 'GPT-5-mini',\n",
    "    'o3': 'GPT-o3',\n",
    "    'gpt-5': 'GPT-5'\n",
    "}\n",
    "\n",
    "# Helper to format numeric values and bold max values\n",
    "def fmt(v, max_val=False):\n",
    "    \"\"\"Format one table cell.\n",
    "\n",
    "    Returns '---' for a missing value, the value with two decimals for anything\n",
    "    float() accepts (wrapped in LaTeX bold when `max_val` is true), and str(v)\n",
    "    for everything else.\n",
    "    \"\"\"\n",
    "    if pd.isna(v):\n",
    "        return '---'\n",
    "    try:\n",
    "        formatted = f\"{float(v):.2f}\"\n",
    "    except Exception:\n",
    "        # best effort: non-numeric cells are emitted verbatim\n",
    "        return str(v)\n",
    "    return f\"\\\\textbf{{{formatted}}}\" if max_val else formatted\n",
    "\n",
    "# Only use metric columns (exclude 'llm' and 'action')\n",
    "metric_cols = [c for c in cols if c not in ['llm', 'action']]\n",
    "\n",
    "# The first N_BASE_ONLY metric columns are printed on the Base (zero-shot) row only;\n",
    "# the CtE and Ora. rows show \\\\ditto in those positions.\n",
    "N_BASE_ONLY = 2\n",
    "base_only_cols = metric_cols[:N_BASE_ONLY]\n",
    "\n",
    "# First pass: collect the scaled values of every (llm, action) pair and, per column,\n",
    "# the values that compete for the bold maximum\n",
    "all_values = {c: [] for c in metric_cols}\n",
    "all_rows_data = []\n",
    "\n",
    "for l in llms:\n",
    "    for a in actions:\n",
    "        if 'llm' in df.columns and 'action' in df.columns:\n",
    "            sel = df[(df['llm']==l) & (df['action']==a)]\n",
    "        elif 'llm' in df.columns:\n",
    "            sel = df[df['llm']==l]\n",
    "        elif 'action' in df.columns:\n",
    "            sel = df[df['action']==a]\n",
    "        else:\n",
    "            sel = df.copy()\n",
    "\n",
    "        if sel.empty:\n",
    "            row_data = {c: pd.NA for c in metric_cols}\n",
    "        else:\n",
    "            numeric = sel.select_dtypes(include='number')\n",
    "            non_numeric = sel.select_dtypes(exclude='number')\n",
    "            row_data = {}\n",
    "            for c in metric_cols:\n",
    "                if c in numeric.columns:\n",
    "                    # convert to 0-100 scale\n",
    "                    val = numeric[c].mean() * 100\n",
    "                    row_data[c] = val\n",
    "                    # Base-only columns are never printed for the other actions, so only\n",
    "                    # zero-shot values may win the bold maximum there; otherwise a hidden\n",
    "                    # value could be the maximum and nothing visible would be bolded.\n",
    "                    if c not in base_only_cols or a == 'zero-shot':\n",
    "                        all_values[c].append(val)\n",
    "                elif c in non_numeric.columns:\n",
    "                    row_data[c] = non_numeric[c].iloc[0]\n",
    "                else:\n",
    "                    row_data[c] = pd.NA\n",
    "\n",
    "        all_rows_data.append({'llm': l, 'action': a, 'data': row_data})\n",
    "\n",
    "# Determine the maximum printed value for each column (from scaled values); None if no numeric value\n",
    "global_max_vals = {}\n",
    "for c in metric_cols:\n",
    "    valid_vals = [v for v in all_values[c] if pd.notna(v)]\n",
    "    global_max_vals[c] = max(valid_vals) if valid_vals else None\n",
    "\n",
    "# Look rows up by (llm, action) instead of relying on their position in all_rows_data\n",
    "data_by_pair = {(r['llm'], r['action']): r['data'] for r in all_rows_data}\n",
    "\n",
    "\n",
    "def latex_cells(row_data, columns):\n",
    "    \"\"\"Format `columns` of one row, bolding every cell equal to its column maximum.\"\"\"\n",
    "    cells = []\n",
    "    for c in columns:\n",
    "        v = row_data.get(c)\n",
    "        cells.append(fmt(v, pd.notna(v) and v == global_max_vals.get(c)))\n",
    "    return cells\n",
    "\n",
    "\n",
    "# Second pass: generate LaTeX rows with bolding\n",
    "lines = []\n",
    "letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')\n",
    "ditto_cells = ['\\\\ditto'] * N_BASE_ONLY\n",
    "rest_cols = metric_cols[N_BASE_ONLY:]\n",
    "row_end = ' \\\\\\\\'\n",
    "\n",
    "for letter_idx, l in enumerate(llms):\n",
    "    # one letter per model; '?' once the alphabet is exhausted\n",
    "    letter = letters[letter_idx] if letter_idx < len(letters) else '?'\n",
    "    model_name = display_names.get(l, l)\n",
    "\n",
    "    base_cells = latex_cells(data_by_pair[(l, 'zero-shot')], metric_cols)\n",
    "    cte_cells = ditto_cells + latex_cells(data_by_pair[(l, 'classification')], rest_cols)\n",
    "    ora_cells = ditto_cells + latex_cells(data_by_pair[(l, 'fixing')], rest_cols)\n",
    "\n",
    "    lines.append(f\"{letter} & \\\\sf {model_name} & Base  & \" + \" & \".join(base_cells) + row_end)\n",
    "    lines.append(\"  &   & CtE & \" + \" & \".join(cte_cells) + row_end)\n",
    "    lines.append(\"  &   & Ora. & \" + \" & \".join(ora_cells) + row_end)\n",
    "    lines.append(\"\\\\midrule\")\n",
    "\n",
    "# Output: print and save\n",
    "latex_text = \"\\n\".join(lines)\n",
    "print(latex_text)\n",
    "\n",
    "out_folder = '../../output'\n",
    "os.makedirs(out_folder, exist_ok=True)\n",
    "out_path = os.path.join(out_folder, 'onboard_updated_table.tex')\n",
    "with open(out_path, 'w', encoding='utf-8') as fh:\n",
    "    fh.write(latex_text)\n",
    "print(f\"Saved LaTeX table rows to: {out_path}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9b95553b",
   "metadata": {},
   "source": [
    "## Onboard relation typing table.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "id": "1a27bbf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the per-relation summary CSV from `folder` (set earlier in the notebook).\n",
    "filename = \"relation_summery.csv\"\n",
    "relation_csv = os.path.join(folder, filename)\n",
    "df_rel = pd.read_csv(relation_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "b61a05aa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>llm</th>\n",
       "      <th>dataset</th>\n",
       "      <th>relation</th>\n",
       "      <th>True</th>\n",
       "      <th>Accuracy</th>\n",
       "      <th>Size</th>\n",
       "      <th>Equivalence</th>\n",
       "      <th>Contains</th>\n",
       "      <th>ContainedBy</th>\n",
       "      <th>Overlap</th>\n",
       "      <th>Disjoint</th>\n",
       "      <th>Unknown</th>\n",
       "      <th>Else</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>360</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(1-2)</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>0.9600(576)</td>\n",
       "      <td>600</td>\n",
       "      <td>0.9600(576)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0317(19)</td>\n",
       "      <td>0.0033(2)</td>\n",
       "      <td>0.0050(3)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>361</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(1-3)</td>\n",
       "      <td>Contains</td>\n",
       "      <td>0.3550(213)</td>\n",
       "      <td>600</td>\n",
       "      <td>0.0067(4)</td>\n",
       "      <td>0.3550(213)</td>\n",
       "      <td>0.5867(352)</td>\n",
       "      <td>0.0417(25)</td>\n",
       "      <td>0.0100(6)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>362</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(1-4)</td>\n",
       "      <td>Contains</td>\n",
       "      <td>0.2083(125)</td>\n",
       "      <td>600</td>\n",
       "      <td>0.0017(1)</td>\n",
       "      <td>0.2083(125)</td>\n",
       "      <td>0.0517(31)</td>\n",
       "      <td>0.5367(322)</td>\n",
       "      <td>0.2017(121)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>363</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(3-4)</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>0.9933(596)</td>\n",
       "      <td>600</td>\n",
       "      <td>0.0017(1)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0017(1)</td>\n",
       "      <td>0.0033(2)</td>\n",
       "      <td>0.9933(596)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(1-34)</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>0.9050(543)</td>\n",
       "      <td>600</td>\n",
       "      <td>0.9050(543)</td>\n",
       "      <td>0.0067(4)</td>\n",
       "      <td>0.0033(2)</td>\n",
       "      <td>0.0167(10)</td>\n",
       "      <td>0.0683(41)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               llm  dataset relation         True     Accuracy  Size  \\\n",
       "360  deepseek-chat  overall   R(1-2)  Equivalence  0.9600(576)   600   \n",
       "361  deepseek-chat  overall   R(1-3)     Contains  0.3550(213)   600   \n",
       "362  deepseek-chat  overall   R(1-4)     Contains  0.2083(125)   600   \n",
       "363  deepseek-chat  overall   R(3-4)     Disjoint  0.9933(596)   600   \n",
       "364  deepseek-chat  overall  R(1-34)  Equivalence  0.9050(543)   600   \n",
       "\n",
       "     Equivalence     Contains  ContainedBy      Overlap     Disjoint  \\\n",
       "360  0.9600(576)    0.0000(0)    0.0000(0)   0.0317(19)    0.0033(2)   \n",
       "361    0.0067(4)  0.3550(213)  0.5867(352)   0.0417(25)    0.0100(6)   \n",
       "362    0.0017(1)  0.2083(125)   0.0517(31)  0.5367(322)  0.2017(121)   \n",
       "363    0.0017(1)    0.0000(0)    0.0017(1)    0.0033(2)  0.9933(596)   \n",
       "364  0.9050(543)    0.0067(4)    0.0033(2)   0.0167(10)   0.0683(41)   \n",
       "\n",
       "       Unknown       Else  \n",
       "360  0.0050(3)  0.0000(0)  \n",
       "361  0.0000(0)  0.0000(0)  \n",
       "362  0.0000(0)  0.0000(0)  \n",
       "363  0.0000(0)  0.0000(0)  \n",
       "364  0.0000(0)  0.0000(0)  "
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filename = \"relation_summery.csv\"\n",
    "df_rel = pd.read_csv(os.path.join(folder, filename))\n",
    "\n",
    "# Filter to overall dataset\n",
    "df_rel = df_rel[df_rel['dataset'] == 'overall'].copy()\n",
    "\n",
    "df_rel.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "0ae31673",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Columns in df_rel:\n",
      "['llm', 'dataset', 'relation', 'True', 'Accuracy', 'Size', 'Equivalence', 'Contains', 'ContainedBy', 'Overlap', 'Disjoint', 'Unknown', 'Else']\n",
      "\n",
      "First few rows:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>llm</th>\n",
       "      <th>dataset</th>\n",
       "      <th>relation</th>\n",
       "      <th>True</th>\n",
       "      <th>Accuracy</th>\n",
       "      <th>Size</th>\n",
       "      <th>Equivalence</th>\n",
       "      <th>Contains</th>\n",
       "      <th>ContainedBy</th>\n",
       "      <th>Overlap</th>\n",
       "      <th>Disjoint</th>\n",
       "      <th>Unknown</th>\n",
       "      <th>Else</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>360</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(1-2)</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>0.9600(576)</td>\n",
       "      <td>600</td>\n",
       "      <td>0.9600(576)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0317(19)</td>\n",
       "      <td>0.0033(2)</td>\n",
       "      <td>0.0050(3)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>361</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(1-3)</td>\n",
       "      <td>Contains</td>\n",
       "      <td>0.3550(213)</td>\n",
       "      <td>600</td>\n",
       "      <td>0.0067(4)</td>\n",
       "      <td>0.3550(213)</td>\n",
       "      <td>0.5867(352)</td>\n",
       "      <td>0.0417(25)</td>\n",
       "      <td>0.0100(6)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>362</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(1-4)</td>\n",
       "      <td>Contains</td>\n",
       "      <td>0.2083(125)</td>\n",
       "      <td>600</td>\n",
       "      <td>0.0017(1)</td>\n",
       "      <td>0.2083(125)</td>\n",
       "      <td>0.0517(31)</td>\n",
       "      <td>0.5367(322)</td>\n",
       "      <td>0.2017(121)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>363</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(3-4)</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>0.9933(596)</td>\n",
       "      <td>600</td>\n",
       "      <td>0.0017(1)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0017(1)</td>\n",
       "      <td>0.0033(2)</td>\n",
       "      <td>0.9933(596)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364</th>\n",
       "      <td>deepseek-chat</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(1-34)</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>0.9050(543)</td>\n",
       "      <td>600</td>\n",
       "      <td>0.9050(543)</td>\n",
       "      <td>0.0067(4)</td>\n",
       "      <td>0.0033(2)</td>\n",
       "      <td>0.0167(10)</td>\n",
       "      <td>0.0683(41)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               llm  dataset relation         True     Accuracy  Size  \\\n",
       "360  deepseek-chat  overall   R(1-2)  Equivalence  0.9600(576)   600   \n",
       "361  deepseek-chat  overall   R(1-3)     Contains  0.3550(213)   600   \n",
       "362  deepseek-chat  overall   R(1-4)     Contains  0.2083(125)   600   \n",
       "363  deepseek-chat  overall   R(3-4)     Disjoint  0.9933(596)   600   \n",
       "364  deepseek-chat  overall  R(1-34)  Equivalence  0.9050(543)   600   \n",
       "\n",
       "     Equivalence     Contains  ContainedBy      Overlap     Disjoint  \\\n",
       "360  0.9600(576)    0.0000(0)    0.0000(0)   0.0317(19)    0.0033(2)   \n",
       "361    0.0067(4)  0.3550(213)  0.5867(352)   0.0417(25)    0.0100(6)   \n",
       "362    0.0017(1)  0.2083(125)   0.0517(31)  0.5367(322)  0.2017(121)   \n",
       "363    0.0017(1)    0.0000(0)    0.0017(1)    0.0033(2)  0.9933(596)   \n",
       "364  0.9050(543)    0.0067(4)    0.0033(2)   0.0167(10)   0.0683(41)   \n",
       "\n",
       "       Unknown       Else  \n",
       "360  0.0050(3)  0.0000(0)  \n",
       "361  0.0000(0)  0.0000(0)  \n",
       "362  0.0000(0)  0.0000(0)  \n",
       "363  0.0000(0)  0.0000(0)  \n",
       "364  0.0000(0)  0.0000(0)  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Data types:\n",
      "llm            object\n",
      "dataset        object\n",
      "relation       object\n",
      "True           object\n",
      "Accuracy       object\n",
      "Size            int64\n",
      "Equivalence    object\n",
      "Contains       object\n",
      "ContainedBy    object\n",
      "Overlap        object\n",
      "Disjoint       object\n",
      "Unknown        object\n",
      "Else           object\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "# Check the actual columns and data structure\n",
    "print('Columns in df_rel:')\n",
    "print(df_rel.columns.tolist())\n",
    "print('\\nFirst few rows:')\n",
    "display(df_rel.head())\n",
    "print('\\nData types:')\n",
    "print(df_rel.dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "id": "8f23a0e1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A & \\textsf{Llama-3.1-8b} & 5.83 & 11.33 & 11.50 & 19.17 & 2.83 & 10.13 \\\\\n",
      "B & \\textsf{GPT-oss-20b} & 93.83 & 92.33 & 96.33 & 91.33 & 94.00 & 93.56 \\\\\n",
      "C & \\textsf{GPT-4.1-nano} & 69.17 & 39.33 & 41.33 & 62.33 & 0.00 & 42.43 \\\\\n",
      "D & \\textsf{Mistral-small:24b} & 91.69 & 0.17 & 1.19 & 99.49 & 11.86 & 40.88 \\\\\n",
      "E & \\textsf{Llama-3.1-70b} & \\textbf{97.33} & 7.67 & 27.00 & 99.17 & 98.50 & 65.93 \\\\\n",
      "F & \\textsf{Gemini-2.0-flash} & 90.83 & 79.17 & 21.33 & 95.17 & 80.33 & 73.37 \\\\\n",
      "G & \\textsf{GPT-4.1-mini} & 91.00 & 34.50 & 20.83 & 99.00 & 95.83 & 68.23 \\\\\n",
      "H & \\textsf{GPT-4o} & 94.50 & 31.67 & 12.17 & 99.33 & 65.33 & 60.60 \\\\\n",
      "I & \\textsf{GPT-4.1} & 94.50 & \\textbf{98.67} & 69.67 & \\textbf{99.50} & \\textbf{99.33} & 92.33 \\\\\n",
      "J & \\textsf{Grok-3-mini} & 91.67 & 94.83 & 96.50 & 97.67 & 96.83 & 95.50 \\\\\n",
      "K & \\textsf{DeepSeek-V3.1} & 96.00 & 35.50 & 20.83 & 99.33 & 90.50 & 68.43 \\\\\n",
      "L & \\textsf{Gemini-2.5-flash} & 86.33 & 95.67 & 96.33 & 96.83 & 95.50 & 94.13 \\\\\n",
      "M & \\textsf{GPT-5-nano} & 91.83 & 85.00 & 91.50 & 88.50 & 91.17 & 89.60 \\\\\n",
      "N & \\textsf{DeepSeek-reasoner} & 88.83 & 97.00 & 96.67 & 96.33 & 94.33 & 94.63 \\\\\n",
      "O & \\textsf{Gemini-2.5-pro} & 91.33 & 97.67 & 94.50 & 98.50 & 96.17 & \\textbf{95.63} \\\\\n",
      "P & \\textsf{GPT-5-mini} & 90.33 & 97.17 & 97.83 & 95.00 & 95.17 & 95.10 \\\\\n",
      "Q & \\textsf{GPT-o3} & 92.71 & 97.12 & 97.63 & 95.25 & 93.73 & 95.29 \\\\\n",
      "R & \\textsf{GPT-5} & 90.50 & 97.67 & \\textbf{98.33} & 96.83 & 94.50 & 95.57 \\\\\n",
      "\\midrule\n",
      "\\multicolumn{2}{l !{\\color{white!80!black} \\vline width 1pt}}{\\textit{Average}} & 86.01 & 66.25 & 60.64 & 90.48 & 77.55 & 76.19 \\\\\n",
      "Saved relation LaTeX table rows to: ../../output/onboard_relation_table.tex\n"
     ]
    }
   ],
   "source": [
    "# Generate LaTeX table for relation classification\n",
    "\n",
    "# Relation columns to include\n",
    "rel_cols = ['R(1-2)', 'R(1-3)', 'R(1-4)', 'R(3-4)', 'R(1-34)']\n",
    "\n",
    "# LLMs in order (same as before)\n",
    "llms = ['llama3.1:8b',\n",
    " 'gpt-oss:20b',\n",
    " 'gpt-4.1-nano-2025-04-14',\n",
    " 'mistral-small:24b',\n",
    " 'llama3.1:70b',\n",
    " 'gemini-2.0-flash',\n",
    " 'gpt-4.1-mini-2025-04-14',\n",
    " 'gpt-4o',\n",
    " 'gpt-4.1-2025-04-14',\n",
    " 'grok-3-mini',\n",
    " 'deepseek-chat',\n",
    " 'gemini-2.5-flash',\n",
    " 'gpt-5-nano',\n",
    " 'deepseek-reasoner',\n",
    " 'gemini-2.5-pro',\n",
    " 'gpt-5-mini',\n",
    " 'o3',\n",
    " 'gpt-5']\n",
    "\n",
    "# Display names (same as before)\n",
    "display_names = {\n",
    "    'llama3.1:8b': 'Llama-3.1-8b',\n",
    "    'gpt-oss:20b': 'GPT-oss-20b',\n",
    "    'gpt-4.1-nano': 'GPT-4.1-nano',\n",
    "    'gpt-4.1-nano-2025-04-14': 'GPT-4.1-nano',\n",
    "    'mistral-small:24b': 'Mistral-small:24b',\n",
    "    'llama3.1:70b': 'Llama-3.1-70b',\n",
    "    'gemini-2.0-flash': 'Gemini-2.0-flash',\n",
    "    'gpt-4.1-mini': 'GPT-4.1-mini',\n",
    "    'gpt-4.1-mini-2025-04-14': 'GPT-4.1-mini',\n",
    "    'gpt-4o': 'GPT-4o',\n",
    "    'gpt-4.1': 'GPT-4.1',\n",
    "    'gpt-4.1-2025-04-14': 'GPT-4.1',\n",
    "    'grok-3-mini': 'Grok-3-mini',\n",
    "    'deepseek-chat': 'DeepSeek-V3.1',\n",
    "    'gemini-2.5-flash': 'Gemini-2.5-flash',\n",
    "    'gpt-5-nano': 'GPT-5-nano',\n",
    "    'deepseek-reasoner': 'DeepSeek-reasoner',\n",
    "    'gemini-2.5-pro': 'Gemini-2.5-pro',\n",
    "    'gpt-5-mini': 'GPT-5-mini',\n",
    "    'o3': 'GPT-o3',\n",
    "    'gpt-5': 'GPT-5'\n",
    "}\n",
    "\n",
    "# Helper to parse values like '0.9797(145)' to extract the numeric value\n",
    "import re\n",
    "def parse_value(val):\n",
    "    if pd.isna(val):\n",
    "        return pd.NA\n",
    "    if isinstance(val, (int, float)):\n",
    "        return float(val)\n",
    "    if isinstance(val, str):\n",
    "        # Extract the first number from format like '0.9797(145)'\n",
    "        match = re.match(r'([0-9.]+)', str(val))\n",
    "        if match:\n",
    "            return float(match.group(1))\n",
    "    return pd.NA\n",
    "\n",
    "# Helper to format numeric values and bold max values\n",
    "def fmt_rel(v, max_val=False):\n",
    "    if pd.isna(v):\n",
    "        return '---'\n",
    "    try:\n",
    "        formatted = f\"{float(v):.2f}\"\n",
    "        return f\"\\\\textbf{{{formatted}}}\" if max_val else formatted\n",
    "    except Exception:\n",
    "        return str(v)\n",
    "\n",
    "# First pass: collect all scaled values for each relation column\n",
    "all_rel_values = {c: [] for c in rel_cols}\n",
    "all_rel_data = []\n",
    "\n",
    "for l in llms:\n",
    "    if 'llm' in df_rel.columns:\n",
    "        # Get all rows for this LLM\n",
    "        sel = df_rel[df_rel['llm'] == l]\n",
    "    else:\n",
    "        sel = df_rel.copy()\n",
    "    \n",
    "    if sel.empty:\n",
    "        row_data = {c: pd.NA for c in rel_cols}\n",
    "    else:\n",
    "        row_data = {}\n",
    "        for rel in rel_cols:\n",
    "            # Find the row for this relation\n",
    "            rel_row = sel[sel['relation'] == rel]\n",
    "            if not rel_row.empty and 'Accuracy' in rel_row.columns:\n",
    "                # Parse the Accuracy value\n",
    "                raw_val = rel_row['Accuracy'].iloc[0]\n",
    "                parsed_val = parse_value(raw_val)\n",
    "                if pd.notna(parsed_val):\n",
    "                    val = parsed_val * 100\n",
    "                    row_data[rel] = val\n",
    "                    all_rel_values[rel].append(val)\n",
    "                else:\n",
    "                    row_data[rel] = pd.NA\n",
    "            else:\n",
    "                row_data[rel] = pd.NA\n",
    "    \n",
    "    # Calculate mean accuracy\n",
    "    valid_vals = [row_data[c] for c in rel_cols if pd.notna(row_data[c])]\n",
    "    row_data['mean_acc'] = sum(valid_vals) / len(valid_vals) if valid_vals else pd.NA\n",
    "    if pd.notna(row_data['mean_acc']):\n",
    "        all_rel_values.setdefault('mean_acc', []).append(row_data['mean_acc'])\n",
    "    \n",
    "    all_rel_data.append({'llm': l, 'data': row_data})\n",
    "\n",
    "# Determine global max values for each column\n",
    "global_rel_max = {}\n",
    "for c in rel_cols + ['mean_acc']:\n",
    "    if c in all_rel_values and all_rel_values[c]:\n",
    "        valid_vals = [v for v in all_rel_values[c] if pd.notna(v)]\n",
    "        global_rel_max[c] = max(valid_vals) if valid_vals else None\n",
    "    else:\n",
    "        global_rel_max[c] = None\n",
    "\n",
    "# Second pass: generate LaTeX rows\n",
    "lines_rel = []\n",
    "letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')\n",
    "\n",
    "for idx, row_info in enumerate(all_rel_data):\n",
    "    letter = letters[idx] if idx < len(letters) else '?'\n",
    "    l = row_info['llm']\n",
    "    model_name = display_names.get(l, l)\n",
    "    data = row_info['data']\n",
    "    \n",
    "    # Format each column value with bolding for max\n",
    "    vals = [fmt_rel(data.get(c), \n",
    "                    pd.notna(data.get(c)) and data.get(c) == global_rel_max.get(c))\n",
    "            for c in rel_cols]\n",
    "    \n",
    "    # Add mean accuracy\n",
    "    mean_val = fmt_rel(data.get('mean_acc'),\n",
    "                       pd.notna(data.get('mean_acc')) and data.get('mean_acc') == global_rel_max.get('mean_acc'))\n",
    "    vals.append(mean_val)\n",
    "    \n",
    "    line = f\"{letter} & \\\\textsf{{{model_name}}} & \" + \" & \".join(vals) + \" \\\\\\\\\"\n",
    "    lines_rel.append(line)\n",
    "\n",
    "lines_rel.append(\"\\\\midrule\")\n",
    "\n",
    "# Calculate average row\n",
    "avg_vals = []\n",
    "for c in rel_cols + ['mean_acc']:\n",
    "    if c in all_rel_values and all_rel_values[c]:\n",
    "        valid_vals = [v for v in all_rel_values[c] if pd.notna(v)]\n",
    "        avg = sum(valid_vals) / len(valid_vals) if valid_vals else float('nan')\n",
    "        avg_vals.append(fmt_rel(avg, avg == global_rel_max.get(c)))\n",
    "    else:\n",
    "        avg_vals.append('---')\n",
    "\n",
    "avg_line = \"\\\\multicolumn{2}{l !{\\\\color{white!80!black} \\\\vline width 1pt}}{\\\\textit{Average}} & \" + \" & \".join(avg_vals) + \" \\\\\\\\\"\n",
    "lines_rel.append(avg_line)\n",
    "\n",
    "# Output: print and save\n",
    "latex_rel_text = \"\\n\".join(lines_rel)\n",
    "print(latex_rel_text)\n",
    "\n",
    "out_folder = '../../output'\n",
    "os.makedirs(out_folder, exist_ok=True)\n",
    "out_path_rel = os.path.join(out_folder, 'onboard_relation_table.tex')\n",
    "with open(out_path_rel, 'w') as fh:\n",
    "    fh.write(latex_rel_text)\n",
    "print(f\"Saved relation LaTeX table rows to: {out_path_rel}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "id": "fa1869b4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                       llm  R(1-2)  R(1-3)  R(1-4)  R(3-4)  R(1-34)  mean_acc\n",
      "0              llama3.1:8b    5.83   11.33   11.50   19.17     2.83    10.132\n",
      "1              gpt-oss:20b   93.83   92.33   96.33   91.33    94.00    93.564\n",
      "2  gpt-4.1-nano-2025-04-14   69.17   39.33   41.33   62.33     0.00    42.432\n",
      "3        mistral-small:24b   91.69    0.17    1.19   99.49    11.86    40.880\n",
      "4             llama3.1:70b   97.33    7.67   27.00   99.17    98.50    65.934\n",
      "Saved to: ../../output/relation_classification_ordered.csv\n"
     ]
    }
   ],
   "source": [
    "# Create DataFrame with relation classification data and save to CSV\n",
    "\n",
    "import re\n",
    "import pandas as pd\n",
    "\n",
    "def parse_value(val):\n",
    "    if pd.isna(val):\n",
    "        return pd.NA\n",
    "    if isinstance(val, (int, float)):\n",
    "        return float(val)\n",
    "    if isinstance(val, str):\n",
    "        match = re.match(r'([0-9.]+)', str(val))\n",
    "        if match:\n",
    "            return float(match.group(1))\n",
    "    return pd.NA\n",
    "\n",
    "rel_cols = ['R(1-2)', 'R(1-3)', 'R(1-4)', 'R(3-4)', 'R(1-34)']\n",
    "\n",
    "llms = ['llama3.1:8b', 'gpt-oss:20b', 'gpt-4.1-nano-2025-04-14', 'mistral-small:24b',\n",
    "        'llama3.1:70b', 'gemini-2.0-flash', 'gpt-4.1-mini-2025-04-14', 'gpt-4o',\n",
    "        'gpt-4.1-2025-04-14', 'grok-3-mini', 'deepseek-chat', 'gemini-2.5-flash',\n",
    "        'gpt-5-nano', 'deepseek-reasoner', 'gemini-2.5-pro', 'gpt-5-mini', 'o3', 'gpt-5']\n",
    "\n",
    "# Build rows\n",
    "rows = []\n",
    "for l in llms:\n",
    "    if 'llm' in df_rel.columns:\n",
    "        sel = df_rel[df_rel['llm'] == l]\n",
    "    else:\n",
    "        sel = df_rel.copy()\n",
    "    \n",
    "    row = {'llm': l}\n",
    "    for rel in rel_cols:\n",
    "        rel_row = sel[sel['relation'] == rel]\n",
    "        if not rel_row.empty and 'Accuracy' in rel_row.columns:\n",
    "            raw_val = rel_row['Accuracy'].iloc[0]\n",
    "            parsed_val = parse_value(raw_val)\n",
    "            row[rel] = parsed_val * 100 if pd.notna(parsed_val) else pd.NA\n",
    "        else:\n",
    "            row[rel] = pd.NA\n",
    "    \n",
    "    # Calculate mean\n",
    "    valid_vals = [row[c] for c in rel_cols if pd.notna(row[c])]\n",
    "    row['mean_acc'] = sum(valid_vals) / len(valid_vals) if valid_vals else pd.NA\n",
    "    rows.append(row)\n",
    "\n",
    "df_relation_output = pd.DataFrame(rows)\n",
    "print(df_relation_output.head())\n",
    "\n",
    "# Save\n",
    "out_path_csv = os.path.join(out_folder, 'relation_classification_ordered.csv')\n",
    "df_relation_output.to_csv(out_path_csv, index=False)\n",
    "print(f\"Saved to: {out_path_csv}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d55c5685",
   "metadata": {},
   "source": [
    "## Onboard (+/-) tables"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ac7bf2ce",
   "metadata": {},
   "source": [
    "## Onboard CFR (Contradiction-Free Rates) Table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "id": "6df24985",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Classification data shape: (18, 60)\n",
      "Zero-shot data shape: (18, 60)\n",
      "\n",
      "Columns available:\n",
      "['ID', 'dataset', 'action', 'llm', '?A1=A2', '?A1=A3+A4', '?A1>A3', '?A1>A4', '?A3∅A4', '?A4=A1|3', '?A1=A1*', '?A1=A1**', '?A1*=A1**', 'J(A1-A2)', 'J(A1-A34)', 'J(A3-A4)', 'J(A4-A1|3)', 'J(A1-A1*)', 'J(A1-A1**)', 'J(A1*-A1**)', '?SC(A1=A2)', '?SC(A1>A3)', '?SC(A1>A4)', '?SC(A3∅A4)', '?SC(A4=A1|3)', 'idk_A1', 'idk_A2', 'idk_A3', 'idk_A4', '?A1=A1(ave)', 'J_A1_ave', 'idk', '?A1=A2(+)', '?A1=A2(-)', 'J(1-2)+', 'J(1-2)-', '?A1>A3(+)', '?A1>A3(-)', '?A1>A4(+)', '?A1>A4(-)', '?A3∅A4(+)', '?A3∅A4(-)', 'J(3-4)+', 'J(3-4)-', '?A1=A3+A4(+)', '?A1=A3+A4(-)', 'J(1-34)+', 'J(1-34)-', 'p(A1=A2)_x', 'p(A1=A3+A4)_x', 'p(A1>A3)_x', 'p(A1>A4)_x', 'p(A3∅A4)_x', 'p(A4=A1|3)_x', 'p(A1=A2)_y', 'p(A1=A3+A4)_y', 'p(A1>A3)_y', 'p(A1>A4)_y', 'p(A3∅A4)_y', 'p(A4=A1|3)_y']\n"
     ]
    }
   ],
   "source": [
    "# Load CFR data from classification and zero-shot CSV files\n",
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "folder = \"../../output/\"\n",
    "\n",
    "# Read the CSV files\n",
    "classification_df = pd.read_csv(os.path.join(folder, 'summary_xidk_classification.csv'))\n",
    "zero_shot_df = pd.read_csv(os.path.join(folder, 'summary_xidk_zero-shot.csv'))\n",
    "\n",
    "# Filter to overall dataset\n",
    "classification_df = classification_df[classification_df['dataset'] == 'overall'].copy()\n",
    "zero_shot_df = zero_shot_df[zero_shot_df['dataset'] == 'overall'].copy()\n",
    "\n",
    "print(\"Classification data shape:\", classification_df.shape)\n",
    "print(\"Zero-shot data shape:\", zero_shot_df.shape)\n",
    "print(\"\\nColumns available:\")\n",
    "print(classification_df.columns.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "id": "d6ad7f98",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LaTeX table rows:\n",
      "\n",
      "\\sf Llama-3.1-8b  & Base & 93.83 & 86.33 & 87.33 & 28.50 & 97.17 \\\\\n",
      "& CtE & 81.00 & 13.83 & \\ditto & \\ditto & 82.67 \\\\ \\midrule\n",
      "\\sf GPT-oss-20b  & Base & 26.17 & 40.33 & 32.17 & 80.33 & 16.00 \\\\\n",
      "& CtE & 66.50 & 89.33 & \\ditto & \\ditto & 40.33 \\\\ \\midrule\n",
      "\\sf GPT-4.1-nano  & Base & 45.67 & 50.00 & 53.17 & 50.00 & 96.00 \\\\\n",
      "& CtE & 69.33 & 36.50 & \\ditto & \\ditto & 100.00 \\\\ \\midrule\n",
      "\\sf Mistral-small:24b  & Base & 48.98 & 49.49 & 54.58 & 50.68 & 87.29 \\\\\n",
      "& CtE & 88.50 & 0.17 & \\ditto & \\ditto & 56.50 \\\\ \\midrule\n",
      "\\sf Llama-3.1-70b  & Base & 23.00 & 68.17 & 62.17 & 71.17 & 4.33 \\\\\n",
      "& CtE & 82.33 & 9.00 & \\ditto & \\ditto & 53.67 \\\\ \\midrule\n",
      "\\sf Gemini-2.0-flash  & Base & 37.33 & 44.50 & 62.33 & 62.50 & 23.33 \\\\\n",
      "& CtE & 88.50 & 46.83 & \\ditto & \\ditto & 46.17 \\\\ \\midrule\n",
      "\\sf GPT-4.1-mini  & Base & 38.67 & 46.00 & 59.17 & 56.33 & 8.67 \\\\\n",
      "& CtE & 89.33 & 44.83 & \\ditto & \\ditto & 62.50 \\\\ \\midrule\n",
      "\\sf GPT-4o  & Base & 45.33 & 49.00 & 53.50 & 63.17 & 37.67 \\\\\n",
      "& CtE & 96.50 & 66.33 & \\ditto & \\ditto & 73.50 \\\\ \\midrule\n",
      "\\sf GPT-4.1  & Base & 42.83 & 49.83 & 50.33 & 63.83 & 10.83 \\\\\n",
      "& CtE & 92.33 & 69.67 & \\ditto & \\ditto & 74.67 \\\\ \\midrule\n",
      "\\sf Grok-3-mini  & Base & 39.33 & 52.17 & 44.00 & 88.00 & 26.17 \\\\\n",
      "& CtE & 91.83 & 95.17 & \\ditto & \\ditto & 69.33 \\\\ \\midrule\n",
      "\\sf DeepSeek-V3.1  & Base & 34.83 & 50.33 & 54.33 & 56.00 & 14.83 \\\\\n",
      "& CtE & 89.33 & 10.00 & \\ditto & \\ditto & 52.00 \\\\ \\midrule\n",
      "\\sf Gemini-2.5-flash  & Base & 41.83 & 50.83 & 41.83 & 84.00 & 25.67 \\\\\n",
      "& CtE & 87.50 & 94.83 & \\ditto & \\ditto & 85.33 \\\\ \\midrule\n",
      "\\sf GPT-5-nano  & Base & 57.83 & 61.33 & 62.67 & 65.83 & 24.50 \\\\\n",
      "& CtE & 77.67 & 89.33 & \\ditto & \\ditto & 8.83 \\\\ \\midrule\n",
      "\\sf DeepSeek-reasoner  & Base & 28.17 & 41.83 & 31.50 & 79.67 & 16.50 \\\\\n",
      "& CtE & 65.33 & 92.00 & \\ditto & \\ditto & 54.00 \\\\ \\midrule\n",
      "\\sf Gemini-2.5-pro  & Base & 38.67 & 43.67 & 39.67 & 81.67 & 23.33 \\\\\n",
      "& CtE & 90.67 & 95.83 & \\ditto & \\ditto & 74.67 \\\\ \\midrule\n",
      "\\sf GPT-5-mini  & Base & 63.33 & 63.00 & 68.33 & 63.50 & 19.17 \\\\\n",
      "& CtE & 84.67 & 96.17 & \\ditto & \\ditto & 38.00 \\\\ \\midrule\n",
      "\\sf GPT-o3  & Base & 39.32 & 49.49 & 39.66 & 84.92 & 25.42 \\\\\n",
      "& CtE & 84.33 & 85.67 & \\ditto & \\ditto & 77.50 \\\\ \\midrule\n",
      "\\sf GPT-5  & Base & 60.83 & 63.50 & 64.67 & 75.67 & 26.83 \\\\\n",
      "& CtE & 91.33 & 98.17 & \\ditto & \\ditto & 63.00 \\\\ \\midrule\n",
      "\n",
      "\n",
      "Output saved to: ../../output/cfr_table_rows.tex\n"
     ]
    }
   ],
   "source": [
    "# Generate LaTeX table for CFR\n",
    "\n",
    "# Define the mapping from LLM names to display names\n",
    "llm_mapping = {\n",
    "    'llama3.1:8b': r'\\sf Llama-3.1-8b',\n",
    "    'gpt-oss:20b': r'\\sf GPT-oss-20b',\n",
    "    'gpt-4.1-nano-2025-04-14': r'\\sf GPT-4.1-nano',\n",
    "    'mistral-small:24b': r'\\sf Mistral-small:24b',\n",
    "    'llama3.1:70b': r'\\sf Llama-3.1-70b',\n",
    "    'gemini-2.0-flash': r'\\sf Gemini-2.0-flash',\n",
    "    'gpt-4.1-mini-2025-04-14': r'\\sf GPT-4.1-mini',\n",
    "    'gpt-4o': r'\\sf GPT-4o',\n",
    "    'gpt-4.1-2025-04-14': r'\\sf GPT-4.1',\n",
    "    'grok-3-mini': r'\\sf Grok-3-mini',\n",
    "    'deepseek-chat': r'\\sf DeepSeek-V3.1',\n",
    "    'gemini-2.5-flash': r'\\sf Gemini-2.5-flash',\n",
    "    'gpt-5-nano': r'\\sf GPT-5-nano',\n",
    "    'deepseek-reasoner': r'\\sf DeepSeek-reasoner',\n",
    "    'gemini-2.5-pro': r'\\sf Gemini-2.5-pro',\n",
    "    'gpt-5-mini': r'\\sf GPT-5-mini',\n",
    "    'o3': r'\\sf GPT-o3',\n",
    "    'gpt-5': r'\\sf GPT-5'\n",
    "}\n",
    "\n",
    "# LLMs in order (from the table)\n",
    "llms_order = [\n",
    "    'llama3.1:8b',\n",
    "    'gpt-oss:20b',\n",
    "    'gpt-4.1-nano-2025-04-14',\n",
    "    'mistral-small:24b',\n",
    "    'llama3.1:70b',\n",
    "    'gemini-2.0-flash',\n",
    "    'gpt-4.1-mini-2025-04-14',\n",
    "    'gpt-4o',\n",
    "    'gpt-4.1-2025-04-14',\n",
    "    'grok-3-mini',\n",
    "    'deepseek-chat',\n",
    "    'gemini-2.5-flash',\n",
    "    'gpt-5-nano',\n",
    "    'deepseek-reasoner',\n",
    "    'gemini-2.5-pro',\n",
    "    'gpt-5-mini',\n",
    "    'o3',\n",
    "    'gpt-5'\n",
    "]\n",
    "\n",
    "# CFR columns: ?SC(A1=A2), ?SC(A1>A3), ?SC(A1>A4), ?SC(A3∅A4), ?SC(A4=A1|3)\n",
    "cfr_columns = ['?SC(A1=A2)', '?SC(A1>A3)', '?SC(A1>A4)', '?SC(A3∅A4)', '?SC(A4=A1|3)']\n",
    "\n",
    "# Function to format values (multiply by 100 and format to 2 decimal places)\n",
    "def format_cfr(val):\n",
    "    if pd.isna(val):\n",
    "        return r'\\ditto'\n",
    "    return f'{val * 100:.2f}'\n",
    "\n",
    "# Create the LaTeX output\n",
    "output_lines = []\n",
    "\n",
    "for llm_name in llms_order:\n",
    "    # Get CtE (classification) row\n",
    "    cte_row = classification_df[classification_df['llm'] == llm_name]\n",
    "    # Get Base (zero-shot) row\n",
    "    base_row = zero_shot_df[zero_shot_df['llm'] == llm_name]\n",
    "    \n",
    "    if not cte_row.empty and not base_row.empty:\n",
    "        display_name = llm_mapping.get(llm_name, llm_name)\n",
    "\n",
    "        # Base line\n",
    "        base_values = [format_cfr(base_row[col].values[0]) for col in cfr_columns]\n",
    "        base_line = f'{display_name}  & Base & {\" & \".join(base_values)} \\\\\\\\' \n",
    "        output_lines.append(base_line)\n",
    "        \n",
    "        # CtE line\n",
    "        cte_values = [format_cfr(cte_row[col].values[0]) for col in cfr_columns]\n",
    "        cte_line = f'& CtE & {\" & \".join(cte_values)} \\\\\\\\ \\\\midrule'\n",
    "        output_lines.append(cte_line)\n",
    "    \n",
    "\n",
    "# Print the output\n",
    "print(\"LaTeX table rows:\")\n",
    "print()\n",
    "for line in output_lines:\n",
    "    print(line)\n",
    "\n",
    "# Save to file\n",
    "out_file = os.path.join(folder, 'cfr_table_rows.tex')\n",
    "with open(out_file, 'w') as f:\n",
    "    for line in output_lines:\n",
    "        f.write(line + '\\n')\n",
    "\n",
    "print(f\"\\n\\nOutput saved to: {out_file}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "668343c0",
   "metadata": {},
   "source": [
    "## Consistency Rate and Similarity Table (When Predicted Relation is Correct)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "34d502d4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LaTeX table rows for consistency rate and similarity:\n",
      "\n",
      " \\sf Llama-3.1-8b & Base & 11.43 & 2.94 & 0.00 & 83.48 & 0.00 & 27.31 & 3.64 & 16.82 \\\\\n",
      " & CtE & 19.23 & 100.00 & \\ditto & \\ditto & 10.26 & 73.28 & \\ditto & 50.77 \\\\ \\midrule\n",
      " \\sf GPT-oss-20b & Base & 22.20 & 38.27 & 31.66 & 84.67 & 15.60 & 44.83 & 6.45 & 40.29 \\\\\n",
      " & CtE & 67.84 & 100.00 & \\ditto & \\ditto & 48.26 & 79.69 & \\ditto & 76.58 \\\\ \\midrule\n",
      " \\sf GPT-4.1-nano & Base & 30.84 & 44.49 & 38.31 & 59.63 & \\ditto & 54.88 & 20.64 & \\ditto \\\\\n",
      " & CtE & 96.51 & 66.23 & \\ditto & \\ditto & \\ditto & 96.72 & \\ditto & \\ditto \\\\ \\midrule\n",
      " \\sf Mistral-small:24b & Base & 46.21 & 0.00 & 42.86 & 50.77 & 28.57 & 65.68 & 34.47 & 55.83 \\\\\n",
      " & CtE & 89.82 & 100.00 & \\ditto & \\ditto & 51.16 & 95.08 & \\ditto & 81.67 \\\\ \\midrule\n",
      " \\sf Llama-3.1-70b & Base & 21.06 & 30.43 & 24.07 & 71.26 & 6.77 & 47.69 & 10.00 & 35.24 \\\\\n",
      " & CtE & 82.30 & 100.00 & \\ditto & \\ditto & 57.27 & 92.09 & \\ditto & 86.26 \\\\ \\midrule\n",
      " \\sf Gemini-2.0-flash & Base & 33.21 & 41.03 & 44.53 & 63.46 & 11.60 & 60.45 & 13.01 & 46.20 \\\\\n",
      " & CtE & 89.25 & 100.00 & \\ditto & \\ditto & 84.55 & 94.75 & \\ditto & 93.57 \\\\ \\midrule\n",
      " \\sf GPT-4.1-mini & Base & 34.80 & 41.55 & 44.00 & 56.40 & 13.39 & 64.19 & 15.52 & 50.27 \\\\\n",
      " & CtE & 93.42 & 65.32 & \\ditto & \\ditto & 87.39 & 97.17 & \\ditto & 95.15 \\\\ \\midrule\n",
      " \\sf GPT-4o & Base & 44.97 & 53.68 & 52.05 & 63.09 & 23.79 & 65.84 & 26.36 & 55.03 \\\\\n",
      " & CtE & 98.28 & 99.25 & \\ditto & \\ditto & 92.57 & 59.12 & \\ditto & 98.35 \\\\ \\midrule\n",
      " \\sf GPT-4.1 & Base & 40.74 & 49.83 & 42.11 & 63.65 & 16.44 & 69.17 & 10.90 & 54.46 \\\\\n",
      " & CtE & 94.53 & 70.34 & \\ditto & \\ditto & 87.14 & 98.65 & \\ditto & 97.24 \\\\ \\midrule\n",
      " \\sf Grok-3-mini & Base & 35.64 & 52.37 & 43.70 & 88.74 & 26.85 & 64.90 & 5.04 & 57.93 \\\\\n",
      " & CtE & 95.42 & 100.00 & \\ditto & \\ditto & 79.52 & 98.67 & \\ditto & 93.89 \\\\ \\midrule\n",
      " \\sf DeepSeek-V3.1 & Base & 32.99 & 44.13 & 36.80 & 56.04 & 15.84 & 57.66 & 16.63 & 46.73 \\\\\n",
      " & CtE & 92.81 & 100.00 & \\ditto & \\ditto & 85.71 & 95.98 & \\ditto & 96.43 \\\\ \\midrule\n",
      " \\sf Gemini-2.5-flash & Base & 35.71 & 50.17 & 41.35 & 85.37 & 25.31 & 62.83 & 3.01 & 54.82 \\\\\n",
      " & CtE & 94.79 & 100.00 & \\ditto & \\ditto & 94.57 & 95.60 & \\ditto & 96.53 \\\\ \\midrule\n",
      " \\sf GPT-5-nano & Base & 59.17 & 64.90 & 64.12 & 68.93 & 43.33 & 72.58 & 28.56 & 62.84 \\\\\n",
      " & CtE & 80.55 & 100.00 & \\ditto & \\ditto & 72.00 & 80.81 & \\ditto & 78.26 \\\\ \\midrule\n",
      " \\sf DeepSeek-reasoner & Base & 20.08 & 40.72 & 30.17 & 82.01 & 14.31 & 48.13 & 3.94 & 41.90 \\\\\n",
      " & CtE & 65.47 & 100.00 & \\ditto & \\ditto & 60.56 & 67.65 & \\ditto & 75.33 \\\\ \\midrule\n",
      " \\sf Gemini-2.5-pro & Base & 33.39 & 43.52 & 38.62 & 82.06 & 23.22 & 64.90 & 2.95 & 57.36 \\\\\n",
      " & CtE & 92.70 & 100.00 & \\ditto & \\ditto & 79.50 & 93.07 & \\ditto & 88.22 \\\\ \\midrule\n",
      " \\sf GPT-5-mini & Base & 65.87 & 63.81 & 68.65 & 63.16 & 47.29 & 77.73 & 34.18 & 66.80 \\\\\n",
      " & CtE & 89.35 & 100.00 & \\ditto & \\ditto & 64.34 & 93.71 & \\ditto & 76.66 \\\\ \\midrule\n",
      " \\sf GPT-o3 & Base & 35.83 & 50.09 & 38.89 & 87.54 & 23.87 & 63.97 & 4.98 & 55.25 \\\\\n",
      " & CtE & 85.54 & 88.41 & \\ditto & \\ditto & 83.01 & 94.99 & \\ditto & 92.23 \\\\ \\midrule\n",
      " \\sf GPT-5 & Base & 62.06 & 64.16 & 65.08 & 75.39 & 42.68 & 78.91 & 20.50 & 68.34 \\\\\n",
      " & CtE & 95.47 & 100.00 & \\ditto & \\ditto & 89.49 & 98.21 & \\ditto & 93.82 \\\\ \\midrule\n",
      "\n",
      "\n",
      "Output saved to: ../../output/consistency_sim_table_rows+.tex\n"
     ]
    }
   ],
   "source": [
    "# Generate LaTeX table for consistency rate and similarity when predicted relation is correct\n",
    "# Columns: ?A1=A2(+), ?A1>A3(+), ?A1>A4(+), ?A3∅A4(+), ?A1=A3+A4(+), J(1-2)+, J(3-4)+, J(1-34)+\n",
    "\n",
    "# Column names from the CSV\n",
    "consistency_sim_columns = [\n",
    "    '?A1=A2(+)',      # Column 1\n",
    "    '?A1>A3(+)',      # Column 2  \n",
    "    '?A1>A4(+)',      # Column 3\n",
    "    '?A3∅A4(+)',      # Column 4\n",
    "    '?A1=A3+A4(+)',   # Column 5\n",
    "    'J(1-2)+',        # Column 6\n",
    "    'J(3-4)+',        # Column 7\n",
    "    'J(1-34)+'        # Column 8\n",
    "]\n",
    "\n",
    "\n",
    "\n",
    "# Function to format values (multiply by 100 and format to 2 decimal places)\n",
    "def format_value(val):\n",
    "    if pd.isna(val):\n",
    "        return r'\\ditto'\n",
    "    return f'{val * 100:.2f}'\n",
    "\n",
    "# Create the LaTeX output\n",
    "output_lines_2 = []\n",
    "\n",
    "for llm_name in llms_order:\n",
    "    # Get CtE (classification) row\n",
    "    cte_row = classification_df[classification_df['llm'] == llm_name]\n",
    "    # Get Base (zero-shot) row\n",
    "    base_row = zero_shot_df[zero_shot_df['llm'] == llm_name]\n",
    "    \n",
    "    if not cte_row.empty and not base_row.empty:\n",
    "        display_name = llm_mapping.get(llm_name, llm_name)\n",
    "\n",
    "        # Base line\n",
    "        base_values = [format_value(base_row[col].values[0]) for col in consistency_sim_columns]\n",
    "        base_line = f' {display_name} & Base & {\" & \".join(base_values)} \\\\\\\\'\n",
    "        output_lines_2.append(base_line)\n",
    "        \n",
    "        # CtE line\n",
    "        cte_values = [format_value(cte_row[col].values[0]) for col in consistency_sim_columns]\n",
    "        cte_line = f' & CtE & {\" & \".join(cte_values)} \\\\\\\\ \\\\midrule'\n",
    "        output_lines_2.append(cte_line)\n",
    "        \n",
    "\n",
    "\n",
    "# Print the output\n",
    "print(\"LaTeX table rows for consistency rate and similarity:\")\n",
    "print()\n",
    "for line in output_lines_2:\n",
    "    print(line)\n",
    "\n",
    "# Save to file\n",
    "out_file_2 = os.path.join(folder, 'consistency_sim_table_rows+.tex')\n",
    "with open(out_file_2, 'w') as f:\n",
    "    for line in output_lines_2:\n",
    "        f.write(line + '\\n')\n",
    "\n",
    "print(f\"\\n\\nOutput saved to: {out_file_2}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "id": "d0ef9efa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>?A1=A2(+)</th>\n",
       "      <th>?A1&gt;A3(+)</th>\n",
       "      <th>?A1&gt;A4(+)</th>\n",
       "      <th>?A3∅A4(+)</th>\n",
       "      <th>?A1=A3+A4(+)</th>\n",
       "      <th>J(1-2)+</th>\n",
       "      <th>J(3-4)+</th>\n",
       "      <th>J(1-34)+</th>\n",
       "      <th>llm</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.192308</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.102564</td>\n",
       "      <td>0.732812</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.507729</td>\n",
       "      <td>llama3.1:8b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.678445</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.482578</td>\n",
       "      <td>0.796881</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.765841</td>\n",
       "      <td>gpt-oss:20b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.965116</td>\n",
       "      <td>0.662252</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.967220</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.898182</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.511628</td>\n",
       "      <td>0.950821</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.816660</td>\n",
       "      <td>mistral-small:24b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.823024</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.572707</td>\n",
       "      <td>0.920930</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.862551</td>\n",
       "      <td>llama3.1:70b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.892491</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.845506</td>\n",
       "      <td>0.947494</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.935710</td>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.934186</td>\n",
       "      <td>0.653179</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.873857</td>\n",
       "      <td>0.971663</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.951548</td>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0.982847</td>\n",
       "      <td>0.992500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.925714</td>\n",
       "      <td>0.591154</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.983455</td>\n",
       "      <td>gpt-4o</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.945326</td>\n",
       "      <td>0.703390</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.871355</td>\n",
       "      <td>0.986507</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.972427</td>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.954212</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.795181</td>\n",
       "      <td>0.986679</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.938938</td>\n",
       "      <td>grok-3-mini</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>0.928058</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.857143</td>\n",
       "      <td>0.959783</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.964286</td>\n",
       "      <td>deepseek-chat</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>0.947876</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.945709</td>\n",
       "      <td>0.956022</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.965346</td>\n",
       "      <td>gemini-2.5-flash</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>0.805505</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.720000</td>\n",
       "      <td>0.808127</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.782636</td>\n",
       "      <td>gpt-5-nano</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>0.654676</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.605634</td>\n",
       "      <td>0.676512</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.753264</td>\n",
       "      <td>deepseek-reasoner</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>0.926966</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.794964</td>\n",
       "      <td>0.930664</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.882205</td>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>0.893502</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.643357</td>\n",
       "      <td>0.937116</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.766588</td>\n",
       "      <td>gpt-5-mini</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>0.855357</td>\n",
       "      <td>0.884083</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.830123</td>\n",
       "      <td>0.949939</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.922297</td>\n",
       "      <td>o3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>0.954710</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.894921</td>\n",
       "      <td>0.982064</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.938174</td>\n",
       "      <td>gpt-5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    ?A1=A2(+)  ?A1>A3(+)  ?A1>A4(+)  ?A3∅A4(+)  ?A1=A3+A4(+)   J(1-2)+  \\\n",
       "0    0.192308   1.000000        NaN        NaN      0.102564  0.732812   \n",
       "1    0.678445   1.000000        NaN        NaN      0.482578  0.796881   \n",
       "2    0.965116   0.662252        NaN        NaN           NaN  0.967220   \n",
       "3    0.898182   1.000000        NaN        NaN      0.511628  0.950821   \n",
       "4    0.823024   1.000000        NaN        NaN      0.572707  0.920930   \n",
       "5    0.892491   1.000000        NaN        NaN      0.845506  0.947494   \n",
       "6    0.934186   0.653179        NaN        NaN      0.873857  0.971663   \n",
       "7    0.982847   0.992500        NaN        NaN      0.925714  0.591154   \n",
       "8    0.945326   0.703390        NaN        NaN      0.871355  0.986507   \n",
       "9    0.954212   1.000000        NaN        NaN      0.795181  0.986679   \n",
       "10   0.928058   1.000000        NaN        NaN      0.857143  0.959783   \n",
       "11   0.947876   1.000000        NaN        NaN      0.945709  0.956022   \n",
       "12   0.805505   1.000000        NaN        NaN      0.720000  0.808127   \n",
       "13   0.654676   1.000000        NaN        NaN      0.605634  0.676512   \n",
       "14   0.926966   1.000000        NaN        NaN      0.794964  0.930664   \n",
       "15   0.893502   1.000000        NaN        NaN      0.643357  0.937116   \n",
       "16   0.855357   0.884083        NaN        NaN      0.830123  0.949939   \n",
       "17   0.954710   1.000000        NaN        NaN      0.894921  0.982064   \n",
       "\n",
       "    J(3-4)+  J(1-34)+                      llm  \n",
       "0       NaN  0.507729              llama3.1:8b  \n",
       "1       NaN  0.765841              gpt-oss:20b  \n",
       "2       NaN       NaN  gpt-4.1-nano-2025-04-14  \n",
       "3       NaN  0.816660        mistral-small:24b  \n",
       "4       NaN  0.862551             llama3.1:70b  \n",
       "5       NaN  0.935710         gemini-2.0-flash  \n",
       "6       NaN  0.951548  gpt-4.1-mini-2025-04-14  \n",
       "7       NaN  0.983455                   gpt-4o  \n",
       "8       NaN  0.972427       gpt-4.1-2025-04-14  \n",
       "9       NaN  0.938938              grok-3-mini  \n",
       "10      NaN  0.964286            deepseek-chat  \n",
       "11      NaN  0.965346         gemini-2.5-flash  \n",
       "12      NaN  0.782636               gpt-5-nano  \n",
       "13      NaN  0.753264        deepseek-reasoner  \n",
       "14      NaN  0.882205           gemini-2.5-pro  \n",
       "15      NaN  0.766588               gpt-5-mini  \n",
       "16      NaN  0.922297                       o3  \n",
       "17      NaN  0.938174                    gpt-5  "
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "classification_df[consistency_sim_columns + ['llm']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "id": "27d2394c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LaTeX table rows for consistency rate and similarity:\n",
      "\n",
      " \\sf Llama-3.1-8b & Base & 1.06 & 3.01 & 1.32 & 84.54 & 0.17 & 16.01 & 3.21 & 11.58 \\\\\n",
      " & CtE & 16.20 & 100.00 & \\ditto & \\ditto & 8.05 & 55.25 & \\ditto & 50.09 \\\\ \\midrule\n",
      " \\sf GPT-oss-20b & Base & 13.51 & 34.78 & 54.55 & 65.38 & 11.11 & 33.38 & 21.46 & 32.12 \\\\\n",
      " & CtE & 55.88 & 100.00 & \\ditto & \\ditto & 50.00 & 74.50 & \\ditto & 80.90 \\\\ \\midrule\n",
      " \\sf GPT-4.1-nano & Base & 21.08 & 46.43 & 36.36 & 65.93 & 11.83 & 43.23 & 16.67 & 39.49 \\\\\n",
      " & CtE & 99.41 & 73.50 & \\ditto & \\ditto & 56.50 & 99.71 & \\ditto & 72.72 \\\\ \\midrule\n",
      " \\sf Mistral-small:24b & Base & 20.41 & 50.42 & 45.28 & 66.67 & 23.46 & 45.42 & 33.33 & 51.40 \\\\\n",
      " & CtE & 26.00 & 100.00 & \\ditto & \\ditto & 56.91 & 64.90 & \\ditto & 81.06 \\\\ \\midrule\n",
      " \\sf Llama-3.1-70b & Base & 6.25 & 28.70 & 23.74 & 40.00 & 22.22 & 26.46 & 17.00 & 47.39 \\\\\n",
      " & CtE & 16.67 & 100.00 & \\ditto & \\ditto & 57.52 & 46.07 & \\ditto & 86.09 \\\\ \\midrule\n",
      " \\sf Gemini-2.0-flash & Base & 28.12 & 43.18 & 32.84 & 51.28 & 14.29 & 56.97 & 12.86 & 45.45 \\\\\n",
      " & CtE & 42.86 & 100.00 & \\ditto & \\ditto & 85.25 & 55.66 & \\ditto & 91.98 \\\\ \\midrule\n",
      " \\sf GPT-4.1-mini & Base & 22.22 & 51.65 & 36.84 & 50.00 & 28.00 & 50.59 & 37.72 & 47.75 \\\\\n",
      " & CtE & 52.83 & 63.47 & \\ditto & \\ditto & 75.47 & 80.86 & \\ditto & 91.40 \\\\ \\midrule\n",
      " \\sf GPT-4o & Base & 48.48 & 53.17 & 46.30 & 25.00 & 28.71 & 56.77 & 75.00 & 51.64 \\\\\n",
      " & CtE & 64.71 & 99.50 & \\ditto & \\ditto & 94.12 & 43.37 & \\ditto & 97.87 \\\\ \\midrule\n",
      " \\sf GPT-4.1 & Base & 21.21 & 50.00 & 30.77 & 0.00 & 25.00 & 42.59 & 68.06 & 37.38 \\\\\n",
      " & CtE & 45.45 & 70.00 & \\ditto & \\ditto & 82.35 & 78.50 & \\ditto & 92.73 \\\\ \\midrule\n",
      " \\sf Grok-3-mini & Base & 20.00 & 51.61 & 47.62 & 42.86 & 5.26 & 47.73 & 29.75 & 33.28 \\\\\n",
      " & CtE & 44.44 & 100.00 & \\ditto & \\ditto & 57.89 & 73.36 & \\ditto & 86.46 \\\\ \\midrule\n",
      " \\sf DeepSeek-V3.1 & Base & 20.83 & 46.25 & 41.05 & 50.00 & 22.81 & 41.98 & 14.06 & 45.65 \\\\\n",
      " & CtE & 54.55 & 100.00 & \\ditto & \\ditto & 83.92 & 74.09 & \\ditto & 91.83 \\\\ \\midrule\n",
      " \\sf Gemini-2.5-flash & Base & 19.51 & 34.62 & 45.45 & 57.89 & 11.11 & 49.53 & 21.45 & 36.10 \\\\\n",
      " & CtE & 58.54 & 100.00 & \\ditto & \\ditto & 62.07 & 82.66 & \\ditto & 86.86 \\\\ \\midrule\n",
      " \\sf GPT-5-nano & Base & 57.14 & 58.89 & 52.94 & 57.97 & 47.17 & 68.94 & 40.02 & 68.06 \\\\\n",
      " & CtE & 50.91 & 100.00 & \\ditto & \\ditto & 36.00 & 51.92 & \\ditto & 52.07 \\\\ \\midrule\n",
      " \\sf DeepSeek-reasoner & Base & 7.46 & 22.22 & 30.00 & 81.82 & 0.00 & 29.98 & 4.50 & 26.44 \\\\\n",
      " & CtE & 36.36 & 100.00 & \\ditto & \\ditto & 37.50 & 55.45 & \\ditto & 73.20 \\\\ \\midrule\n",
      " \\sf Gemini-2.5-pro & Base & 5.77 & 50.00 & 42.42 & 44.44 & 0.00 & 44.58 & 23.57 & 39.00 \\\\\n",
      " & CtE & 25.76 & 100.00 & \\ditto & \\ditto & 38.64 & 63.17 & \\ditto & 60.09 \\\\ \\midrule\n",
      " \\sf GPT-5-mini & Base & 60.34 & 64.71 & 46.15 & 30.00 & 58.62 & 72.10 & 69.17 & 72.76 \\\\\n",
      " & CtE & 71.74 & 100.00 & \\ditto & \\ditto & 60.71 & 78.90 & \\ditto & 77.00 \\\\ \\midrule\n",
      " \\sf GPT-o3 & Base & 16.28 & 70.59 & 28.57 & 67.86 & 8.11 & 43.24 & 16.46 & 39.48 \\\\\n",
      " & CtE & 32.50 & 86.36 & \\ditto & \\ditto & 41.38 & 68.66 & \\ditto & 71.69 \\\\ \\midrule\n",
      " \\sf GPT-5 & Base & 50.88 & 64.29 & 60.00 & 15.79 & 51.52 & 71.02 & 71.31 & 64.70 \\\\\n",
      " & CtE & 56.25 & 100.00 & \\ditto & \\ditto & 62.07 & 78.48 & \\ditto & 84.04 \\\\ \\midrule\n",
      "\n",
      "\n",
      "Output saved to: ../../output/consistency_sim_table_rows-.tex\n"
     ]
    }
   ],
   "source": [
    "# Generate LaTeX table for consistency rate and similarity when predicted relation is correct\n",
    "# Columns: ?A1=A2(+), ?A1>A3(+), ?A1>A4(+), ?A3∅A4(+), ?A1=A3+A4(+), J(1-2)+, J(3-4)+, J(1-34)+\n",
    "\n",
    "# Column names from the- CSV\n",
    "consistency_sim_columns = [\n",
    "    '?A1=A2(-)',      # Column 1\n",
    "    '?A1>A3(-)',      # Column 2  \n",
    "    '?A1>A4(-)',      # Column 3\n",
    "    '?A3∅A4(-)',      # Column 4\n",
    "    '?A1=A3+A4(-)',   # Column 5\n",
    "    'J(1-2)-',        # Column 6\n",
    "    'J(3-4)-',        # Column 7\n",
    "    'J(1-34)-'        # Column 8\n",
    "]\n",
    "\n",
    "consistency_sim_columns_cte = [\n",
    "    '?A1=A2(-)',      # Column 1\n",
    "    '?A1>A3(-)',      # Column 2  \n",
    "    r'\\ditto',        # Column 3 - placeholder\n",
    "    r'\\ditto',        # Column 4 - placeholder\n",
    "    '?A1=A3+A4(-)',   # Column 5\n",
    "    'J(1-2)-',        # Column 6\n",
    "    r'\\ditto',        # Column 7 - placeholder\n",
    "    'J(1-34)-'        # Column 8\n",
    "]\n",
    "\n",
    "\n",
    "\n",
    "# Function to format values (multiply by 100 and format to 2 decimal places)\n",
    "def format_value(val):\n",
    "    if pd.isna(val):\n",
    "        return r'\\ditto'\n",
    "    # Handle literal \\ditto strings (don't multiply by 100)\n",
    "    if isinstance(val, str) and val == r'\\ditto':\n",
    "        return val\n",
    "    return f'{val * 100:.2f}'\n",
    "\n",
    "# Create the LaTeX output\n",
    "output_lines_2 = []\n",
    "\n",
    "for llm_name in llms_order:\n",
    "    # Get CtE (classification) row\n",
    "    cte_row = classification_df[classification_df['llm'] == llm_name]\n",
    "    # Get Base (zero-shot) row\n",
    "    base_row = zero_shot_df[zero_shot_df['llm'] == llm_name]\n",
    "    \n",
    "    if not cte_row.empty and not base_row.empty:\n",
    "        display_name = llm_mapping.get(llm_name, llm_name)\n",
    "\n",
    "        # Base line\n",
    "        base_values = [format_value(base_row[col].values[0]) for col in consistency_sim_columns]\n",
    "        base_line = f' {display_name} & Base & {\" & \".join(base_values)} \\\\\\\\'\n",
    "        output_lines_2.append(base_line)\n",
    "        \n",
    "        # CtE line - handle both actual column values and literal \\ditto placeholders\n",
    "        cte_values = []\n",
    "        for col in consistency_sim_columns_cte:\n",
    "            if col == r'\\ditto':\n",
    "                cte_values.append(r'\\ditto')\n",
    "            else:\n",
    "                cte_values.append(format_value(cte_row[col].values[0]))\n",
    "        cte_line = f' & CtE & {\" & \".join(cte_values)} \\\\\\\\ \\\\midrule'\n",
    "        output_lines_2.append(cte_line)\n",
    "    \n",
    "\n",
    "# Print the output\n",
    "print(\"LaTeX table rows for consistency rate and similarity:\")\n",
    "print()\n",
    "for line in output_lines_2:\n",
    "    print(line)\n",
    "\n",
    "# Save to file\n",
    "out_file_2 = os.path.join(folder, 'consistency_sim_table_rows-.tex')\n",
    "with open(out_file_2, 'w') as f:\n",
    "    for line in output_lines_2:\n",
    "        f.write(line + '\\n')\n",
    "\n",
    "print(f\"\\n\\nOutput saved to: {out_file_2}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "id": "14a33bb0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>?A1=A2(-)</th>\n",
       "      <th>?A1&gt;A3(-)</th>\n",
       "      <th>?A1&gt;A4(-)</th>\n",
       "      <th>?A3∅A4(-)</th>\n",
       "      <th>?A1=A3+A4(-)</th>\n",
       "      <th>J(1-2)-</th>\n",
       "      <th>J(3-4)-</th>\n",
       "      <th>J(1-34)-</th>\n",
       "      <th>llm</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.162021</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.080460</td>\n",
       "      <td>0.552536</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.500857</td>\n",
       "      <td>llama3.1:8b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.558824</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.745047</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.809004</td>\n",
       "      <td>gpt-oss:20b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.994118</td>\n",
       "      <td>0.734967</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.565000</td>\n",
       "      <td>0.997059</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.727222</td>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.260000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.569120</td>\n",
       "      <td>0.648988</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.810597</td>\n",
       "      <td>mistral-small:24b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.166667</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.575163</td>\n",
       "      <td>0.460678</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.860901</td>\n",
       "      <td>llama3.1:70b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.428571</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.852459</td>\n",
       "      <td>0.556643</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.919794</td>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.528302</td>\n",
       "      <td>0.634660</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.754717</td>\n",
       "      <td>0.808589</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.913953</td>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0.647059</td>\n",
       "      <td>0.995000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.941176</td>\n",
       "      <td>0.433735</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.978738</td>\n",
       "      <td>gpt-4o</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.454545</td>\n",
       "      <td>0.700000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.823529</td>\n",
       "      <td>0.785045</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.927341</td>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.444444</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.578947</td>\n",
       "      <td>0.733550</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.864616</td>\n",
       "      <td>grok-3-mini</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>0.545455</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.839161</td>\n",
       "      <td>0.740909</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.918315</td>\n",
       "      <td>deepseek-chat</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>0.585366</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.620690</td>\n",
       "      <td>0.826557</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.868610</td>\n",
       "      <td>gemini-2.5-flash</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>0.509091</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.360000</td>\n",
       "      <td>0.519160</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.520666</td>\n",
       "      <td>gpt-5-nano</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>0.363636</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.554545</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.731975</td>\n",
       "      <td>deepseek-reasoner</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>0.257576</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.386364</td>\n",
       "      <td>0.631673</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.600916</td>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>0.717391</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.607143</td>\n",
       "      <td>0.788983</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.770007</td>\n",
       "      <td>gpt-5-mini</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>0.325000</td>\n",
       "      <td>0.863636</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.413793</td>\n",
       "      <td>0.686647</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.716852</td>\n",
       "      <td>o3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>0.562500</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.620690</td>\n",
       "      <td>0.784827</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.840407</td>\n",
       "      <td>gpt-5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    ?A1=A2(-)  ?A1>A3(-)  ?A1>A4(-)  ?A3∅A4(-)  ?A1=A3+A4(-)   J(1-2)-  \\\n",
       "0    0.162021   1.000000        NaN        NaN      0.080460  0.552536   \n",
       "1    0.558824   1.000000        NaN        NaN      0.500000  0.745047   \n",
       "2    0.994118   0.734967        NaN        NaN      0.565000  0.997059   \n",
       "3    0.260000   1.000000        NaN        NaN      0.569120  0.648988   \n",
       "4    0.166667   1.000000        NaN        NaN      0.575163  0.460678   \n",
       "5    0.428571   1.000000        NaN        NaN      0.852459  0.556643   \n",
       "6    0.528302   0.634660        NaN        NaN      0.754717  0.808589   \n",
       "7    0.647059   0.995000        NaN        NaN      0.941176  0.433735   \n",
       "8    0.454545   0.700000        NaN        NaN      0.823529  0.785045   \n",
       "9    0.444444   1.000000        NaN        NaN      0.578947  0.733550   \n",
       "10   0.545455   1.000000        NaN        NaN      0.839161  0.740909   \n",
       "11   0.585366   1.000000        NaN        NaN      0.620690  0.826557   \n",
       "12   0.509091   1.000000        NaN        NaN      0.360000  0.519160   \n",
       "13   0.363636   1.000000        NaN        NaN      0.375000  0.554545   \n",
       "14   0.257576   1.000000        NaN        NaN      0.386364  0.631673   \n",
       "15   0.717391   1.000000        NaN        NaN      0.607143  0.788983   \n",
       "16   0.325000   0.863636        NaN        NaN      0.413793  0.686647   \n",
       "17   0.562500   1.000000        NaN        NaN      0.620690  0.784827   \n",
       "\n",
       "    J(3-4)-  J(1-34)-                      llm  \n",
       "0       NaN  0.500857              llama3.1:8b  \n",
       "1       NaN  0.809004              gpt-oss:20b  \n",
       "2       NaN  0.727222  gpt-4.1-nano-2025-04-14  \n",
       "3       NaN  0.810597        mistral-small:24b  \n",
       "4       NaN  0.860901             llama3.1:70b  \n",
       "5       NaN  0.919794         gemini-2.0-flash  \n",
       "6       NaN  0.913953  gpt-4.1-mini-2025-04-14  \n",
       "7       NaN  0.978738                   gpt-4o  \n",
       "8       NaN  0.927341       gpt-4.1-2025-04-14  \n",
       "9       NaN  0.864616              grok-3-mini  \n",
       "10      NaN  0.918315            deepseek-chat  \n",
       "11      NaN  0.868610         gemini-2.5-flash  \n",
       "12      NaN  0.520666               gpt-5-nano  \n",
       "13      NaN  0.731975        deepseek-reasoner  \n",
       "14      NaN  0.600916           gemini-2.5-pro  \n",
       "15      NaN  0.770007               gpt-5-mini  \n",
       "16      NaN  0.716852                       o3  \n",
       "17      NaN  0.840407                    gpt-5  "
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "classification_df[consistency_sim_columns + ['llm']]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
