{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e42a0864",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib\n",
    "from matplotlib import pyplot as plt\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from copy import deepcopy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f38c90be",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>model</th>\n",
       "      <th>nan_values_in_cor</th>\n",
       "      <th>threshold</th>\n",
       "      <th>auc_score</th>\n",
       "      <th>aupr_score</th>\n",
       "      <th>daupr_score</th>\n",
       "      <th>correctness</th>\n",
       "      <th>uncertainty</th>\n",
       "      <th>status</th>\n",
       "      <th>cor_thr</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>BCB</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.489189</td>\n",
       "      <td>0.574416</td>\n",
       "      <td>-0.001023</td>\n",
       "      <td>bleu</td>\n",
       "      <td>predent</td>\n",
       "      <td>ok</td>\n",
       "      <td>bleu_0.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BCB</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0.449339</td>\n",
       "      <td>0.947835</td>\n",
       "      <td>-0.008305</td>\n",
       "      <td>bleu</td>\n",
       "      <td>predent</td>\n",
       "      <td>ok</td>\n",
       "      <td>bleu_0.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>BCB</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0.370763</td>\n",
       "      <td>0.984324</td>\n",
       "      <td>-0.007782</td>\n",
       "      <td>bleu</td>\n",
       "      <td>predent</td>\n",
       "      <td>ok</td>\n",
       "      <td>bleu_0.30000000000000004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>BCB</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0</td>\n",
       "      <td>0.4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>bleu</td>\n",
       "      <td>predent</td>\n",
       "      <td>bad</td>\n",
       "      <td>bleu_0.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>BCB</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0</td>\n",
       "      <td>0.5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>bleu</td>\n",
       "      <td>predent</td>\n",
       "      <td>bad</td>\n",
       "      <td>bleu_0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38551</th>\n",
       "      <td>SQUAD</td>\n",
       "      <td>llama3_70b_i_1.</td>\n",
       "      <td>5563</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.646481</td>\n",
       "      <td>0.175649</td>\n",
       "      <td>0.102439</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>GNLL</td>\n",
       "      <td>ok</td>\n",
       "      <td>bma_judge_0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38552</th>\n",
       "      <td>SQUAD</td>\n",
       "      <td>llama3_70b_i_1.</td>\n",
       "      <td>5563</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.634551</td>\n",
       "      <td>0.203064</td>\n",
       "      <td>0.113525</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>GNLL</td>\n",
       "      <td>ok</td>\n",
       "      <td>bma_judge_0.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38553</th>\n",
       "      <td>SQUAD</td>\n",
       "      <td>llama3_70b_i_1.</td>\n",
       "      <td>5563</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.602338</td>\n",
       "      <td>0.235710</td>\n",
       "      <td>0.103645</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>GNLL</td>\n",
       "      <td>ok</td>\n",
       "      <td>bma_judge_0.7000000000000001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38554</th>\n",
       "      <td>SQUAD</td>\n",
       "      <td>llama3_70b_i_1.</td>\n",
       "      <td>5563</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.602338</td>\n",
       "      <td>0.235710</td>\n",
       "      <td>0.103645</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>GNLL</td>\n",
       "      <td>ok</td>\n",
       "      <td>bma_judge_0.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38555</th>\n",
       "      <td>SQUAD</td>\n",
       "      <td>llama3_70b_i_1.</td>\n",
       "      <td>5563</td>\n",
       "      <td>0.9</td>\n",
       "      <td>0.576126</td>\n",
       "      <td>0.298291</td>\n",
       "      <td>0.095708</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>GNLL</td>\n",
       "      <td>ok</td>\n",
       "      <td>bma_judge_0.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>49853 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      dataset            model  nan_values_in_cor  threshold  auc_score  \\\n",
       "0         BCB  qwen2_32b_i_0.6                  0        0.1   0.489189   \n",
       "1         BCB  qwen2_32b_i_0.6                  0        0.2   0.449339   \n",
       "2         BCB  qwen2_32b_i_0.6                  0        0.3   0.370763   \n",
       "3         BCB  qwen2_32b_i_0.6                  0        0.4        NaN   \n",
       "4         BCB  qwen2_32b_i_0.6                  0        0.5        NaN   \n",
       "...       ...              ...                ...        ...        ...   \n",
       "38551   SQUAD  llama3_70b_i_1.               5563        0.5   0.646481   \n",
       "38552   SQUAD  llama3_70b_i_1.               5563        0.6   0.634551   \n",
       "38553   SQUAD  llama3_70b_i_1.               5563        0.7   0.602338   \n",
       "38554   SQUAD  llama3_70b_i_1.               5563        0.8   0.602338   \n",
       "38555   SQUAD  llama3_70b_i_1.               5563        0.9   0.576126   \n",
       "\n",
       "       aupr_score  daupr_score correctness uncertainty status  \\\n",
       "0        0.574416    -0.001023        bleu     predent     ok   \n",
       "1        0.947835    -0.008305        bleu     predent     ok   \n",
       "2        0.984324    -0.007782        bleu     predent     ok   \n",
       "3        1.000000     0.000000        bleu     predent    bad   \n",
       "4        1.000000     0.000000        bleu     predent    bad   \n",
       "...           ...          ...         ...         ...    ...   \n",
       "38551    0.175649     0.102439   bma_judge        GNLL     ok   \n",
       "38552    0.203064     0.113525   bma_judge        GNLL     ok   \n",
       "38553    0.235710     0.103645   bma_judge        GNLL     ok   \n",
       "38554    0.235710     0.103645   bma_judge        GNLL     ok   \n",
       "38555    0.298291     0.095708   bma_judge        GNLL     ok   \n",
       "\n",
       "                            cor_thr  \n",
       "0                          bleu_0.1  \n",
       "1                          bleu_0.2  \n",
       "2          bleu_0.30000000000000004  \n",
       "3                          bleu_0.4  \n",
       "4                          bleu_0.5  \n",
       "...                             ...  \n",
       "38551                 bma_judge_0.5  \n",
       "38552                 bma_judge_0.6  \n",
       "38553  bma_judge_0.7000000000000001  \n",
       "38554                 bma_judge_0.8  \n",
       "38555                 bma_judge_0.9  \n",
       "\n",
       "[49853 rows x 11 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selpred_df = [\n",
    "    pd.read_csv('csvs/uncert_methods_selpred_scores_05_15a.csv', index_col=0),\n",
    "    pd.read_csv('csvs/uncert_methods_selpred_scores_05_15b.csv', index_col=0)\n",
    "]\n",
    "selpred_df = pd.concat(selpred_df, axis=0)\n",
    "selpred_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "af9a19ad",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>model</th>\n",
       "      <th>uncertainty</th>\n",
       "      <th>auc_score</th>\n",
       "      <th>correctness</th>\n",
       "      <th>threshold</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>BCB</td>\n",
       "      <td>llama3_70b_i_0.6</td>\n",
       "      <td>GNLL</td>\n",
       "      <td>0.615947</td>\n",
       "      <td>spmoji</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BCB</td>\n",
       "      <td>llama3_70b_i_0.6</td>\n",
       "      <td>SAR</td>\n",
       "      <td>0.510249</td>\n",
       "      <td>spmoji</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>BCB</td>\n",
       "      <td>llama3_70b_i_0.6</td>\n",
       "      <td>SENT_SAR</td>\n",
       "      <td>0.545564</td>\n",
       "      <td>spmoji</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>BCB</td>\n",
       "      <td>llama3_70b_i_0.6</td>\n",
       "      <td>TOKEN_SAR</td>\n",
       "      <td>0.543931</td>\n",
       "      <td>spmoji</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>BCB</td>\n",
       "      <td>llama3_70b_i_0.6</td>\n",
       "      <td>len_norm_predent</td>\n",
       "      <td>0.494681</td>\n",
       "      <td>spmoji</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>352</th>\n",
       "      <td>TRIVIA</td>\n",
       "      <td>phi35_i_1.</td>\n",
       "      <td>predent</td>\n",
       "      <td>0.588493</td>\n",
       "      <td>spmoji</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>353</th>\n",
       "      <td>TRIVIA</td>\n",
       "      <td>phi35_i_1.</td>\n",
       "      <td>ptrue_neg_log_prob</td>\n",
       "      <td>0.602367</td>\n",
       "      <td>spmoji</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>354</th>\n",
       "      <td>TRIVIA</td>\n",
       "      <td>phi35_i_1.</td>\n",
       "      <td>sement</td>\n",
       "      <td>0.647283</td>\n",
       "      <td>spmoji</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>355</th>\n",
       "      <td>TRIVIA</td>\n",
       "      <td>phi35_i_1.</td>\n",
       "      <td>seqlen_correct</td>\n",
       "      <td>0.561139</td>\n",
       "      <td>spmoji</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>356</th>\n",
       "      <td>TRIVIA</td>\n",
       "      <td>phi35_i_1.</td>\n",
       "      <td>seqlen_sample</td>\n",
       "      <td>0.562039</td>\n",
       "      <td>spmoji</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>357 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    dataset             model         uncertainty  auc_score correctness  \\\n",
       "0       BCB  llama3_70b_i_0.6                GNLL   0.615947      spmoji   \n",
       "1       BCB  llama3_70b_i_0.6                 SAR   0.510249      spmoji   \n",
       "2       BCB  llama3_70b_i_0.6            SENT_SAR   0.545564      spmoji   \n",
       "3       BCB  llama3_70b_i_0.6           TOKEN_SAR   0.543931      spmoji   \n",
       "4       BCB  llama3_70b_i_0.6    len_norm_predent   0.494681      spmoji   \n",
       "..      ...               ...                 ...        ...         ...   \n",
       "352  TRIVIA        phi35_i_1.             predent   0.588493      spmoji   \n",
       "353  TRIVIA        phi35_i_1.  ptrue_neg_log_prob   0.602367      spmoji   \n",
       "354  TRIVIA        phi35_i_1.              sement   0.647283      spmoji   \n",
       "355  TRIVIA        phi35_i_1.      seqlen_correct   0.561139      spmoji   \n",
       "356  TRIVIA        phi35_i_1.       seqlen_sample   0.562039      spmoji   \n",
       "\n",
       "     threshold  \n",
       "0          0.5  \n",
       "1          0.5  \n",
       "2          0.5  \n",
       "3          0.5  \n",
       "4          0.5  \n",
       "..         ...  \n",
       "352        0.5  \n",
       "353        0.5  \n",
       "354        0.5  \n",
       "355        0.5  \n",
       "356        0.5  \n",
       "\n",
       "[357 rows x 6 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## compute sp moji here\n",
    "spmoji_stuff = selpred_df[selpred_df.correctness.str.contains('j_')&(~selpred_df.correctness.str.contains('8b'))&(selpred_df.threshold==0.5)].groupby(['dataset', 'model', 'uncertainty'])[['auc_score']].mean().reset_index()\n",
    "spmoji_stuff['correctness'] = 'spmoji'\n",
    "spmoji_stuff['threshold'] = 0.5\n",
    "spmoji_stuff"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "38a10ff3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>unc_name</th>\n",
       "      <th>dataset</th>\n",
       "      <th>model</th>\n",
       "      <th>nan_values_in_cor</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>predent</td>\n",
       "      <td>SQUAD</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0</td>\n",
       "      <td>0.764600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>len_norm_predent</td>\n",
       "      <td>SQUAD</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0</td>\n",
       "      <td>0.737589</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>seqlen_sample</td>\n",
       "      <td>SQUAD</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0</td>\n",
       "      <td>0.549445</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>seqlen_correct</td>\n",
       "      <td>SQUAD</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0</td>\n",
       "      <td>0.791079</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TOKEN_SAR</td>\n",
       "      <td>SQUAD</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0</td>\n",
       "      <td>0.688164</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>79</th>\n",
       "      <td>sement</td>\n",
       "      <td>SQUAD</td>\n",
       "      <td>llama3_70b_i_1.</td>\n",
       "      <td>0</td>\n",
       "      <td>0.660880</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>80</th>\n",
       "      <td>min_logprob</td>\n",
       "      <td>SQUAD</td>\n",
       "      <td>llama3_70b_i_1.</td>\n",
       "      <td>0</td>\n",
       "      <td>0.607500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>81</th>\n",
       "      <td>len_norm_sement</td>\n",
       "      <td>SQUAD</td>\n",
       "      <td>llama3_70b_i_1.</td>\n",
       "      <td>0</td>\n",
       "      <td>0.698005</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>82</th>\n",
       "      <td>ptrue_neg_log_prob</td>\n",
       "      <td>SQUAD</td>\n",
       "      <td>llama3_70b_i_1.</td>\n",
       "      <td>0</td>\n",
       "      <td>0.858251</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83</th>\n",
       "      <td>GNLL</td>\n",
       "      <td>SQUAD</td>\n",
       "      <td>llama3_70b_i_1.</td>\n",
       "      <td>0</td>\n",
       "      <td>0.676907</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>97 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              unc_name dataset            model  nan_values_in_cor     score\n",
       "0              predent   SQUAD  qwen2_32b_i_0.6                  0  0.764600\n",
       "1     len_norm_predent   SQUAD  qwen2_32b_i_0.6                  0  0.737589\n",
       "2        seqlen_sample   SQUAD  qwen2_32b_i_0.6                  0  0.549445\n",
       "3       seqlen_correct   SQUAD  qwen2_32b_i_0.6                  0  0.791079\n",
       "4            TOKEN_SAR   SQUAD  qwen2_32b_i_0.6                  0  0.688164\n",
       "..                 ...     ...              ...                ...       ...\n",
       "79              sement   SQUAD  llama3_70b_i_1.                  0  0.660880\n",
       "80         min_logprob   SQUAD  llama3_70b_i_1.                  0  0.607500\n",
       "81     len_norm_sement   SQUAD  llama3_70b_i_1.                  0  0.698005\n",
       "82  ptrue_neg_log_prob   SQUAD  llama3_70b_i_1.                  0  0.858251\n",
       "83                GNLL   SQUAD  llama3_70b_i_1.                  0  0.676907\n",
       "\n",
       "[97 rows x 5 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results_ood_df = [\n",
    "    pd.read_csv('csvs/ood_detection_05_15a.csv', index_col=0),\n",
    "    pd.read_csv('csvs/ood_detection_05_15b.csv', index_col=0)\n",
    "]\n",
    "results_ood_df = pd.concat(results_ood_df, axis=0)\n",
    "results_ood_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9812952e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>perturb_larger</th>\n",
       "      <th>dataset</th>\n",
       "      <th>model</th>\n",
       "      <th>uncertainty</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.564453</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_70b_i_0.6</td>\n",
       "      <td>predent</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.569336</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_70b_i_0.6</td>\n",
       "      <td>len_norm_predent</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.460938</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_70b_i_0.6</td>\n",
       "      <td>seqlen_sample</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.766602</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_70b_i_0.6</td>\n",
       "      <td>seqlen_correct</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.594727</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_70b_i_0.6</td>\n",
       "      <td>TOKEN_SAR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99</th>\n",
       "      <td>0.607422</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_8b_i_0.6</td>\n",
       "      <td>sement</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_8b_i_0.6</td>\n",
       "      <td>min_logprob</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>0.639648</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_8b_i_0.6</td>\n",
       "      <td>len_norm_sement</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>0.843750</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_8b_i_0.6</td>\n",
       "      <td>ptrue_neg_log_prob</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>0.645508</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_8b_i_0.6</td>\n",
       "      <td>GNLL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>104 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     perturb_larger    dataset             model         uncertainty\n",
       "0          0.564453  SQUADPERT  llama3_70b_i_0.6             predent\n",
       "1          0.569336  SQUADPERT  llama3_70b_i_0.6    len_norm_predent\n",
       "2          0.460938  SQUADPERT  llama3_70b_i_0.6       seqlen_sample\n",
       "3          0.766602  SQUADPERT  llama3_70b_i_0.6      seqlen_correct\n",
       "4          0.594727  SQUADPERT  llama3_70b_i_0.6           TOKEN_SAR\n",
       "..              ...        ...               ...                 ...\n",
       "99         0.607422  SQUADPERT   llama3_8b_i_0.6              sement\n",
       "100        0.000000  SQUADPERT   llama3_8b_i_0.6         min_logprob\n",
       "101        0.639648  SQUADPERT   llama3_8b_i_0.6     len_norm_sement\n",
       "102        0.843750  SQUADPERT   llama3_8b_i_0.6  ptrue_neg_log_prob\n",
       "103        0.645508  SQUADPERT   llama3_8b_i_0.6                GNLL\n",
       "\n",
       "[104 rows x 4 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "perturbation_df = pd.read_csv(f\"csvs/perturb_detection_05_15a.csv\", index_col=0)\n",
    "perturbation_df.model = perturbation_df.model.apply(lambda s: '_'.join(s.split('/')[-1].split('_')[1:]))\n",
    "perturbation_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "1e658b59-e25c-446e-a67d-81364c25eab7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_4671/2279883441.py:6: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  ood_df['correctness'] = 'ood_id'\n",
      "/tmp/ipykernel_4671/2279883441.py:7: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  ood_df['threshold'] = 0.5\n",
      "/tmp/ipykernel_4671/2279883441.py:20: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  exact_where_occurs.drop('status', axis=1, inplace=True)\n",
      "/tmp/ipykernel_4671/2279883441.py:21: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  exact_where_occurs.drop('cor_thr', axis=1, inplace=True)\n",
      "/tmp/ipykernel_4671/2279883441.py:22: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  exact_where_occurs.drop('aupr_score', axis=1, inplace=True)\n",
      "/tmp/ipykernel_4671/2279883441.py:23: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  exact_where_occurs.drop('daupr_score', axis=1, inplace=True)\n",
      "/tmp/ipykernel_4671/2279883441.py:24: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  exact_where_occurs.drop('nan_values_in_cor', axis=1, inplace=True)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>dataset</th>\n",
       "      <th>model</th>\n",
       "      <th>threshold</th>\n",
       "      <th>auc_score</th>\n",
       "      <th>correctness</th>\n",
       "      <th>uncertainty</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1638</td>\n",
       "      <td>BCB</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.533830</td>\n",
       "      <td>exact_correctness</td>\n",
       "      <td>predent</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1639</td>\n",
       "      <td>BCB</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.485413</td>\n",
       "      <td>exact_correctness</td>\n",
       "      <td>len_norm_predent</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1640</td>\n",
       "      <td>BCB</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.542166</td>\n",
       "      <td>exact_correctness</td>\n",
       "      <td>seqlen_sample</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1641</td>\n",
       "      <td>BCB</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.526072</td>\n",
       "      <td>exact_correctness</td>\n",
       "      <td>seqlen_correct</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1642</td>\n",
       "      <td>BCB</td>\n",
       "      <td>qwen2_32b_i_0.6</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.505271</td>\n",
       "      <td>exact_correctness</td>\n",
       "      <td>TOKEN_SAR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>744</th>\n",
       "      <td>99</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_8b_i_0.6</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.607422</td>\n",
       "      <td>perturb</td>\n",
       "      <td>sement</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>745</th>\n",
       "      <td>100</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_8b_i_0.6</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>perturb</td>\n",
       "      <td>min_logprob</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>746</th>\n",
       "      <td>101</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_8b_i_0.6</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.639648</td>\n",
       "      <td>perturb</td>\n",
       "      <td>len_norm_sement</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>747</th>\n",
       "      <td>102</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_8b_i_0.6</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.843750</td>\n",
       "      <td>perturb</td>\n",
       "      <td>ptrue_neg_log_prob</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>748</th>\n",
       "      <td>103</td>\n",
       "      <td>SQUADPERT</td>\n",
       "      <td>llama3_8b_i_0.6</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.645508</td>\n",
       "      <td>perturb</td>\n",
       "      <td>GNLL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>749 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     index    dataset            model  threshold  auc_score  \\\n",
       "0     1638        BCB  qwen2_32b_i_0.6        0.5   0.533830   \n",
       "1     1639        BCB  qwen2_32b_i_0.6        0.5   0.485413   \n",
       "2     1640        BCB  qwen2_32b_i_0.6        0.5   0.542166   \n",
       "3     1641        BCB  qwen2_32b_i_0.6        0.5   0.526072   \n",
       "4     1642        BCB  qwen2_32b_i_0.6        0.5   0.505271   \n",
       "..     ...        ...              ...        ...        ...   \n",
       "744     99  SQUADPERT  llama3_8b_i_0.6        0.0   0.607422   \n",
       "745    100  SQUADPERT  llama3_8b_i_0.6        0.0   0.000000   \n",
       "746    101  SQUADPERT  llama3_8b_i_0.6        0.0   0.639648   \n",
       "747    102  SQUADPERT  llama3_8b_i_0.6        0.0   0.843750   \n",
       "748    103  SQUADPERT  llama3_8b_i_0.6        0.0   0.645508   \n",
       "\n",
       "           correctness         uncertainty  \n",
       "0    exact_correctness             predent  \n",
       "1    exact_correctness    len_norm_predent  \n",
       "2    exact_correctness       seqlen_sample  \n",
       "3    exact_correctness      seqlen_correct  \n",
       "4    exact_correctness           TOKEN_SAR  \n",
       "..                 ...                 ...  \n",
       "744            perturb              sement  \n",
       "745            perturb         min_logprob  \n",
       "746            perturb     len_norm_sement  \n",
       "747            perturb  ptrue_neg_log_prob  \n",
       "748            perturb                GNLL  \n",
       "\n",
       "[749 rows x 7 columns]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load the correctness cross df\n",
    "selpred_df.head()\n",
    "\n",
    "# prepare results_ood_df\n",
    "ood_df = results_ood_df[['dataset', 'model']]\n",
    "ood_df['correctness'] = 'ood_id'\n",
    "ood_df['threshold'] = 0.5\n",
    "ood_df['auc_score'] = results_ood_df['score']\n",
    "ood_df['uncertainty'] = results_ood_df['unc_name']\n",
    "\n",
    "# prepare perturbation df\n",
    "p2_df = perturbation_df[['dataset', 'model', 'uncertainty']]\n",
    "p2_df['correctness'] = 'perturb'\n",
    "p2_df['threshold'] = 0.\n",
    "p2_df['auc_score'] = perturbation_df['perturb_larger']\n",
    "\n",
    "\n",
    "# prepare the SP-MOJI\n",
    "exact_where_occurs = selpred_df[selpred_df.correctness.str.contains('exact')]\n",
    "exact_where_occurs.drop('status', axis=1, inplace=True)\n",
    "exact_where_occurs.drop('cor_thr', axis=1, inplace=True)\n",
    "exact_where_occurs.drop('aupr_score', axis=1, inplace=True)\n",
    "exact_where_occurs.drop('daupr_score', axis=1, inplace=True)\n",
    "exact_where_occurs.drop('nan_values_in_cor', axis=1, inplace=True)\n",
    "\n",
    "# merge all four\n",
    "df = pd.concat([exact_where_occurs,spmoji_stuff, ood_df, p2_df]).reset_index()\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "3c88afde-121c-4e48-b4ad-523b0ccd26c9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['predent', 'len_norm_predent', 'seqlen_sample', 'seqlen_correct',\n",
       "       'TOKEN_SAR', 'SENT_SAR', 'SAR', 'log_pplx', 'sement',\n",
       "       'min_logprob', 'len_norm_sement', 'ptrue_neg_log_prob', 'GNLL',\n",
       "       'EigenScore', 'pplx'], dtype=object)"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.uncertainty.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "38eb89d5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['predent', 'len_norm_predent', 'seqlen_sample', 'seqlen_correct',\n",
       "       'TOKEN_SAR', 'SENT_SAR', 'SAR', 'log_pplx', 'sement',\n",
       "       'min_logprob', 'len_norm_sement', 'ptrue_neg_log_prob', 'GNLL',\n",
       "       'EigenScore'], dtype=object)"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preopt_df.uncertainty.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "705077e7-8860-403f-9f04-e2f39b89fae4",
   "metadata": {},
   "outputs": [],
   "source": [
    "selscore = 'auc_score'\n",
    "exact_only_datasets = ['BCB', 'COLLIE']\n",
    "qa_datasets = ['COQA', 'SQUAD', 'TRIVIA']\n",
    "# make sure that judges are not considered for the exact datasets\n",
    "preopt_df = df[~((df.dataset.isin(exact_only_datasets))&(df.correctness!='exact_correctness'))]\n",
    "# preopt_df = preopt_df[(preopt_df.correctness.isin(['exact_correctness', 'spmoji', 'ood_id', 'perturb']))&(preopt_df.threshold==0.5)&(~preopt_df.uncertainty.isin(['pplx']))]\n",
    "preopt_df = preopt_df[(preopt_df.correctness.isin(['exact_correctness', 'spmoji', 'ood_id', 'perturb']))&(~preopt_df.uncertainty.isin(['pplx']))]\n",
    "\n",
    "\n",
    "big_accumulator = {}\n",
    "# FIRST OPTION: ALL DATA RUNS\n",
    "# remove all but bma judge or exact where applicable\n",
    "all_subdfs = {}\n",
    "for k, ids in preopt_df.groupby(['dataset', 'model', 'correctness']).groups.items():\n",
    "    redf = preopt_df.loc[ids]\n",
    "    all_subdfs[k] = redf[['dataset', 'model', 'uncertainty','auc_score']] #, 'aupr_score', 'daupr_score']]\n",
    "big_accumulator['ALL TASKS'] = deepcopy(all_subdfs)\n",
    "\n",
    "\n",
    "# derive other options from here\n",
    "# splits by datasets\n",
    "big_accumulator['QA ONLY'] = {k: v for k,v in all_subdfs.items() if k[0] in qa_datasets and k[2] != 'ood_id'}\n",
    "big_accumulator['CODE ONLY'] = {k: v for k,v in all_subdfs.items() if k[0] == 'BCB'}\n",
    "big_accumulator['CONSTRAINED TEXT ONLY'] = {k: v for k,v in all_subdfs.items() if k[0] == 'COLLIE'}\n",
    "\n",
    "# splits by models\n",
    "big_accumulator['IFT ONLY'] = {k: v for k,v in all_subdfs.items() if 'i' in k[1] or k[1].startswith('phi35')}\n",
    "big_accumulator['PRETRAINED ONLY'] = {k: v for k,v in all_subdfs.items() if not ('i' in k[1] or k[1].startswith('phi35'))}\n",
    "\n",
    "# ood - special category\n",
    "big_accumulator['OOD ONLY'] = {k: v for k,v in all_subdfs.items() if k[2] == 'ood_id'}\n",
    "\n",
    "big_accumulator['PERTURB ONLY'] = {k: v for k,v in all_subdfs.items() if k[2] == 'perturb'}\n",
    "\n",
    "# big_accumulator[''] = {k: v for k,v in all_subdfs.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "14430654",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{('COQAPERT',\n",
       "  'llama3_70b_i_0.6',\n",
       "  'perturb'):       dataset             model         uncertainty  auc_score\n",
       " 723  COQAPERT  llama3_70b_i_0.6             predent   0.628906\n",
       " 724  COQAPERT  llama3_70b_i_0.6    len_norm_predent   0.618164\n",
       " 725  COQAPERT  llama3_70b_i_0.6       seqlen_sample   0.650391\n",
       " 726  COQAPERT  llama3_70b_i_0.6      seqlen_correct   0.782227\n",
       " 727  COQAPERT  llama3_70b_i_0.6           TOKEN_SAR   0.582031\n",
       " 728  COQAPERT  llama3_70b_i_0.6            SENT_SAR   0.617188\n",
       " 729  COQAPERT  llama3_70b_i_0.6                 SAR   0.604492\n",
       " 730  COQAPERT  llama3_70b_i_0.6            log_pplx   0.656250\n",
       " 731  COQAPERT  llama3_70b_i_0.6              sement   0.669922\n",
       " 732  COQAPERT  llama3_70b_i_0.6         min_logprob   0.000000\n",
       " 733  COQAPERT  llama3_70b_i_0.6     len_norm_sement   0.684570\n",
       " 734  COQAPERT  llama3_70b_i_0.6  ptrue_neg_log_prob   0.561523\n",
       " 735  COQAPERT  llama3_70b_i_0.6                GNLL   0.633789,\n",
       " ('COQAPERT',\n",
       "  'llama3_8b_i_0.6',\n",
       "  'perturb'):       dataset            model         uncertainty  auc_score\n",
       " 671  COQAPERT  llama3_8b_i_0.6             predent   0.556641\n",
       " 672  COQAPERT  llama3_8b_i_0.6    len_norm_predent   0.545898\n",
       " 673  COQAPERT  llama3_8b_i_0.6       seqlen_sample   0.460938\n",
       " 674  COQAPERT  llama3_8b_i_0.6      seqlen_correct   0.969727\n",
       " 675  COQAPERT  llama3_8b_i_0.6           TOKEN_SAR   0.597656\n",
       " 676  COQAPERT  llama3_8b_i_0.6            SENT_SAR   0.615234\n",
       " 677  COQAPERT  llama3_8b_i_0.6                 SAR   0.575195\n",
       " 678  COQAPERT  llama3_8b_i_0.6            log_pplx   0.701172\n",
       " 679  COQAPERT  llama3_8b_i_0.6              sement   0.574219\n",
       " 680  COQAPERT  llama3_8b_i_0.6         min_logprob   0.000000\n",
       " 681  COQAPERT  llama3_8b_i_0.6     len_norm_sement   0.587891\n",
       " 682  COQAPERT  llama3_8b_i_0.6  ptrue_neg_log_prob   0.656250\n",
       " 683  COQAPERT  llama3_8b_i_0.6                GNLL   0.580078,\n",
       " ('COQAPERT',\n",
       "  'qwen2_32b_i_0.6',\n",
       "  'perturb'):       dataset            model         uncertainty  auc_score\n",
       " 658  COQAPERT  qwen2_32b_i_0.6             predent   0.704102\n",
       " 659  COQAPERT  qwen2_32b_i_0.6    len_norm_predent   0.665039\n",
       " 660  COQAPERT  qwen2_32b_i_0.6       seqlen_sample   0.618164\n",
       " 661  COQAPERT  qwen2_32b_i_0.6      seqlen_correct   0.702148\n",
       " 662  COQAPERT  qwen2_32b_i_0.6           TOKEN_SAR   0.596680\n",
       " 663  COQAPERT  qwen2_32b_i_0.6            SENT_SAR   0.657227\n",
       " 664  COQAPERT  qwen2_32b_i_0.6                 SAR   0.602539\n",
       " 665  COQAPERT  qwen2_32b_i_0.6            log_pplx   0.708984\n",
       " 666  COQAPERT  qwen2_32b_i_0.6              sement   0.647461\n",
       " 667  COQAPERT  qwen2_32b_i_0.6         min_logprob   0.000000\n",
       " 668  COQAPERT  qwen2_32b_i_0.6     len_norm_sement   0.695312\n",
       " 669  COQAPERT  qwen2_32b_i_0.6  ptrue_neg_log_prob   0.423828\n",
       " 670  COQAPERT  qwen2_32b_i_0.6                GNLL   0.693359,\n",
       " ('COQAPERT',\n",
       "  'qwen2_7b_i_0.6',\n",
       "  'perturb'):       dataset           model         uncertainty  auc_score\n",
       " 710  COQAPERT  qwen2_7b_i_0.6             predent   0.646484\n",
       " 711  COQAPERT  qwen2_7b_i_0.6    len_norm_predent   0.595703\n",
       " 712  COQAPERT  qwen2_7b_i_0.6       seqlen_sample   0.581055\n",
       " 713  COQAPERT  qwen2_7b_i_0.6      seqlen_correct   0.663086\n",
       " 714  COQAPERT  qwen2_7b_i_0.6           TOKEN_SAR   0.595703\n",
       " 715  COQAPERT  qwen2_7b_i_0.6            SENT_SAR   0.625000\n",
       " 716  COQAPERT  qwen2_7b_i_0.6                 SAR   0.592773\n",
       " 717  COQAPERT  qwen2_7b_i_0.6            log_pplx   0.712891\n",
       " 718  COQAPERT  qwen2_7b_i_0.6              sement   0.609375\n",
       " 719  COQAPERT  qwen2_7b_i_0.6         min_logprob   0.000000\n",
       " 720  COQAPERT  qwen2_7b_i_0.6     len_norm_sement   0.664062\n",
       " 721  COQAPERT  qwen2_7b_i_0.6  ptrue_neg_log_prob   0.546875\n",
       " 722  COQAPERT  qwen2_7b_i_0.6                GNLL   0.684570,\n",
       " ('SQUADPERT',\n",
       "  'llama3_70b_i_0.6',\n",
       "  'perturb'):        dataset             model         uncertainty  auc_score\n",
       " 645  SQUADPERT  llama3_70b_i_0.6             predent   0.564453\n",
       " 646  SQUADPERT  llama3_70b_i_0.6    len_norm_predent   0.569336\n",
       " 647  SQUADPERT  llama3_70b_i_0.6       seqlen_sample   0.460938\n",
       " 648  SQUADPERT  llama3_70b_i_0.6      seqlen_correct   0.766602\n",
       " 649  SQUADPERT  llama3_70b_i_0.6           TOKEN_SAR   0.594727\n",
       " 650  SQUADPERT  llama3_70b_i_0.6            SENT_SAR   0.604492\n",
       " 651  SQUADPERT  llama3_70b_i_0.6                 SAR   0.622070\n",
       " 652  SQUADPERT  llama3_70b_i_0.6            log_pplx   0.649414\n",
       " 653  SQUADPERT  llama3_70b_i_0.6              sement   0.607422\n",
       " 654  SQUADPERT  llama3_70b_i_0.6         min_logprob   0.000000\n",
       " 655  SQUADPERT  llama3_70b_i_0.6     len_norm_sement   0.622070\n",
       " 656  SQUADPERT  llama3_70b_i_0.6  ptrue_neg_log_prob   0.602539\n",
       " 657  SQUADPERT  llama3_70b_i_0.6                GNLL   0.500977,\n",
       " ('SQUADPERT',\n",
       "  'llama3_8b_i_0.6',\n",
       "  'perturb'):        dataset            model         uncertainty  auc_score\n",
       " 736  SQUADPERT  llama3_8b_i_0.6             predent   0.603516\n",
       " 737  SQUADPERT  llama3_8b_i_0.6    len_norm_predent   0.571289\n",
       " 738  SQUADPERT  llama3_8b_i_0.6       seqlen_sample   0.565430\n",
       " 739  SQUADPERT  llama3_8b_i_0.6      seqlen_correct   0.811523\n",
       " 740  SQUADPERT  llama3_8b_i_0.6           TOKEN_SAR   0.574219\n",
       " 741  SQUADPERT  llama3_8b_i_0.6            SENT_SAR   0.620117\n",
       " 742  SQUADPERT  llama3_8b_i_0.6                 SAR   0.562500\n",
       " 743  SQUADPERT  llama3_8b_i_0.6            log_pplx   0.707031\n",
       " 744  SQUADPERT  llama3_8b_i_0.6              sement   0.607422\n",
       " 745  SQUADPERT  llama3_8b_i_0.6         min_logprob   0.000000\n",
       " 746  SQUADPERT  llama3_8b_i_0.6     len_norm_sement   0.639648\n",
       " 747  SQUADPERT  llama3_8b_i_0.6  ptrue_neg_log_prob   0.843750\n",
       " 748  SQUADPERT  llama3_8b_i_0.6                GNLL   0.645508,\n",
       " ('SQUADPERT',\n",
       "  'qwen2_32b_i_0.6',\n",
       "  'perturb'):        dataset            model         uncertainty  auc_score\n",
       " 697  SQUADPERT  qwen2_32b_i_0.6             predent   0.625977\n",
       " 698  SQUADPERT  qwen2_32b_i_0.6    len_norm_predent   0.595703\n",
       " 699  SQUADPERT  qwen2_32b_i_0.6       seqlen_sample   0.598633\n",
       " 700  SQUADPERT  qwen2_32b_i_0.6      seqlen_correct   0.622070\n",
       " 701  SQUADPERT  qwen2_32b_i_0.6           TOKEN_SAR   0.571289\n",
       " 702  SQUADPERT  qwen2_32b_i_0.6            SENT_SAR   0.551758\n",
       " 703  SQUADPERT  qwen2_32b_i_0.6                 SAR   0.561523\n",
       " 704  SQUADPERT  qwen2_32b_i_0.6            log_pplx   0.658203\n",
       " 705  SQUADPERT  qwen2_32b_i_0.6              sement   0.630859\n",
       " 706  SQUADPERT  qwen2_32b_i_0.6         min_logprob   0.000000\n",
       " 707  SQUADPERT  qwen2_32b_i_0.6     len_norm_sement   0.663086\n",
       " 708  SQUADPERT  qwen2_32b_i_0.6  ptrue_neg_log_prob   0.463867\n",
       " 709  SQUADPERT  qwen2_32b_i_0.6                GNLL   0.630859,\n",
       " ('SQUADPERT',\n",
       "  'qwen2_7b_i_0.6',\n",
       "  'perturb'):        dataset           model         uncertainty  auc_score\n",
       " 684  SQUADPERT  qwen2_7b_i_0.6             predent   0.630859\n",
       " 685  SQUADPERT  qwen2_7b_i_0.6    len_norm_predent   0.613281\n",
       " 686  SQUADPERT  qwen2_7b_i_0.6       seqlen_sample   0.619141\n",
       " 687  SQUADPERT  qwen2_7b_i_0.6      seqlen_correct   0.685547\n",
       " 688  SQUADPERT  qwen2_7b_i_0.6           TOKEN_SAR   0.600586\n",
       " 689  SQUADPERT  qwen2_7b_i_0.6            SENT_SAR   0.585938\n",
       " 690  SQUADPERT  qwen2_7b_i_0.6                 SAR   0.578125\n",
       " 691  SQUADPERT  qwen2_7b_i_0.6            log_pplx   0.670898\n",
       " 692  SQUADPERT  qwen2_7b_i_0.6              sement   0.634766\n",
       " 693  SQUADPERT  qwen2_7b_i_0.6         min_logprob   0.000977\n",
       " 694  SQUADPERT  qwen2_7b_i_0.6     len_norm_sement   0.655273\n",
       " 695  SQUADPERT  qwen2_7b_i_0.6  ptrue_neg_log_prob   0.330078\n",
       " 696  SQUADPERT  qwen2_7b_i_0.6                GNLL   0.649414}"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "big_accumulator['PERTURB ONLY']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b83ce162-1d30-4bb2-901b-15752301abdd",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ALL TASKS\n",
      "[('BCB', 'llama3_70b_i_0.6', 'exact_correctness'), ('BCB', 'llama3_8b_1.', 'exact_correctness'), ('BCB', 'llama3_8b_i_1.', 'exact_correctness'), ('BCB', 'qwen2_32b_i_0.6', 'exact_correctness'), ('BCB', 'qwen2_7b_i_0.6', 'exact_correctness'), ('COLLIE', 'falcon_mamba_1.', 'exact_correctness'), ('COLLIE', 'falcon_mamba_i_1.', 'exact_correctness'), ('COLLIE', 'llama3_70b_i_1.', 'exact_correctness'), ('COLLIE', 'llama3_8b_1.', 'exact_correctness'), ('COLLIE', 'llama3_8b_i_1.', 'exact_correctness'), ('COLLIE', 'phi35_1.', 'exact_correctness'), ('COLLIE', 'phi35_i_1.', 'exact_correctness'), ('COLLIE', 'qwen2_32b_i_0.6', 'exact_correctness'), ('COLLIE', 'qwen2_7b_i_0.6', 'exact_correctness'), ('COQA', 'llama3_70b_1.', 'spmoji'), ('COQA', 'llama3_8b_1.', 'spmoji'), ('COQA', 'llama3_8b_i_1.', 'spmoji'), ('COQA', 'phi35_i_1.', 'spmoji'), ('COQA', 'qwen2_32b_i_0.6', 'spmoji'), ('COQAPERT', 'llama3_70b_i_0.6', 'perturb'), ('COQAPERT', 'llama3_8b_i_0.6', 'perturb'), ('COQAPERT', 'qwen2_32b_i_0.6', 'perturb'), ('COQAPERT', 'qwen2_7b_i_0.6', 'perturb'), ('KUQ', 'llama3_8b_i_1.', 'ood_id'), ('KUQ', 'phi35_i_1.', 'ood_id'), ('SQUAD', 'llama3_70b_i_1.', 'ood_id'), ('SQUAD', 'llama3_70b_i_1.', 'spmoji'), ('SQUAD', 'llama3_8b_1.', 'ood_id'), ('SQUAD', 'llama3_8b_1.', 'spmoji'), ('SQUAD', 'llama3_8b_i_1.', 'ood_id'), ('SQUAD', 'llama3_8b_i_1.', 'spmoji'), ('SQUAD', 'phi35_i_1.', 'ood_id'), ('SQUAD', 'phi35_i_1.', 'spmoji'), ('SQUAD', 'qwen2_32b_i_0.6', 'ood_id'), ('SQUAD', 'qwen2_32b_i_0.6', 'spmoji'), ('SQUADPERT', 'llama3_70b_i_0.6', 'perturb'), ('SQUADPERT', 'llama3_8b_i_0.6', 'perturb'), ('SQUADPERT', 'qwen2_32b_i_0.6', 'perturb'), ('SQUADPERT', 'qwen2_7b_i_0.6', 'perturb'), ('TRIVIA', 'llama3_8b_i_1.', 'spmoji'), ('TRIVIA', 'phi35_i_1.', 'spmoji')]\n",
      "['random_baseline', 'predent', 'len_norm_predent', 'seqlen_sample', 'seqlen_correct', 'TOKEN_SAR', 'SENT_SAR', 'SAR', 'log_pplx', 'sement', 'min_logprob', 'len_norm_sement', 'ptrue_neg_log_prob', 'GNLL', 'EigenScore']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "623it [00:00, 6227.87it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.  999. 1001.\n",
      " 1000. 1000. 1000.]\n",
      "15000.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "10851it [00:01, 6345.05it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 668.3017006  1055.83963887  731.55959338 1020.76716774 1180.80538524\n",
      "  906.87812902 1137.58554593  936.79697341 1128.11753378  897.06939039\n",
      "  895.69832347 1156.71904446  960.03750734 1284.75878957 1039.0652768 ]\n",
      "14999.999999999996\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "21059it [00:03, 6334.18it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 590.38204991 1105.41981337  613.77342416  973.46260149 1204.23147473\n",
      "  909.61075384 1174.68782832  915.17396219 1125.60679213  872.46067997\n",
      "  884.28123912 1224.21629685  942.07727071 1367.89710911 1096.71870409]\n",
      "14999.99999999999\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "40789it [00:06, 6370.95it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 568.01140111 1092.30195005  616.88170143  977.57406267 1231.56820613\n",
      "  883.6164699  1142.75289969  929.61760095 1123.57545915  857.6072494\n",
      "  869.07024135 1241.74455144  946.92460511 1376.82325292 1141.93034871]\n",
      "14999.999999999995\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "50854it [00:08, 6238.76it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 540.65926354 1107.88299763  625.37949115  923.86205087 1207.65612573\n",
      "  898.65882212 1184.94986349  912.88427717 1168.27756596  905.73667272\n",
      "  836.82542676 1217.08517916  955.97427831 1369.29831077 1144.86967462]\n",
      "15000.000000000005\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "60945it [00:09, 6251.86it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 547.9859425  1064.35014138  573.69116294  939.59955592 1232.66401948\n",
      "  918.52507475 1171.23762856  920.8347113  1119.2104466   858.84394743\n",
      "  878.56260613 1262.22244536  946.55102744 1395.95180196 1169.76948826]\n",
      "15000.000000000005\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "71017it [00:11, 6286.44it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 541.62025875 1086.8748949   579.05764073  955.20669904 1227.28420823\n",
      "  910.89448936 1153.42310822  938.46378636 1113.72163822  882.40288306\n",
      "  828.5676155  1264.52593267  936.6571927  1408.39425713 1172.90539513]\n",
      "15000.000000000005\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "81158it [00:12, 6355.69it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 517.45862182 1094.34642328  590.64056235  908.07134871 1261.80465985\n",
      "  944.63955344 1168.22605944  948.69870491 1125.93596596  907.56708196\n",
      "  844.66322002 1213.75899775  944.68579423 1396.67087757 1132.83212872]\n",
      "15000.000000000005\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "90707it [00:14, 6233.56it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 544.10784592 1104.08708799  610.97812468  951.02369594 1226.70404691\n",
      "  862.30368575 1204.00004751  918.95411979 1120.32863592  920.7505145\n",
      "  823.1365835  1233.34556656  940.65805834 1384.16702842 1155.45495828]\n",
      "15000.000000000002\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100000it [00:15, 6292.72it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "QA ONLY\n",
      "[('COQA', 'llama3_70b_1.', 'spmoji'), ('COQA', 'llama3_8b_1.', 'spmoji'), ('COQA', 'llama3_8b_i_1.', 'spmoji'), ('COQA', 'phi35_i_1.', 'spmoji'), ('COQA', 'qwen2_32b_i_0.6', 'spmoji'), ('SQUAD', 'llama3_70b_i_1.', 'spmoji'), ('SQUAD', 'llama3_8b_1.', 'spmoji'), ('SQUAD', 'llama3_8b_i_1.', 'spmoji'), ('SQUAD', 'phi35_i_1.', 'spmoji'), ('SQUAD', 'qwen2_32b_i_0.6', 'spmoji'), ('TRIVIA', 'llama3_8b_i_1.', 'spmoji'), ('TRIVIA', 'phi35_i_1.', 'spmoji')]\n",
      "['random_baseline', 'predent', 'len_norm_predent', 'seqlen_sample', 'seqlen_correct', 'TOKEN_SAR', 'SENT_SAR', 'SAR', 'log_pplx', 'sement', 'min_logprob', 'len_norm_sement', 'ptrue_neg_log_prob', 'GNLL', 'EigenScore']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "616it [00:00, 6159.81it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.  999. 1001.\n",
      " 1000. 1000. 1000.]\n",
      "15000.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "10750it [00:01, 6359.14it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 561.84019707 1048.27119852  715.32624958  736.63586004  929.88077776\n",
      "  684.50404842 1302.34675518  893.17059477  994.66710637 1090.5938298\n",
      " 1127.47891783 1438.70272031  942.94406748 1350.86106168 1182.77661519]\n",
      "15000.000000000002\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "20921it [00:03, 6349.62it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 383.11903477 1070.63554504  623.50971894  628.16501865  948.31660473\n",
      "  611.24631286 1388.67385961  871.25389099 1021.57821484 1067.21421705\n",
      " 1171.14744862 1558.93560287  946.09291219 1474.7886859  1235.32293295]\n",
      "14999.999999999996\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "31088it [00:04, 6324.73it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 333.90407995 1078.03399465  606.71978836  586.36508526  951.74434336\n",
      "  563.20175739 1441.56303382  850.18350225 1012.90860864 1080.04516149\n",
      " 1197.40617909 1613.99445824  948.82538982 1496.69612819 1238.40848947]\n",
      "14999.99999999999\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "40642it [00:06, 6367.88it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 301.80120872 1057.93223087  564.51312279  556.17447876  955.4263178\n",
      "  542.70809869 1471.84670359  845.96933631  985.94007167 1089.00801364\n",
      " 1221.59098787 1663.43576598  929.21411274 1533.7135235  1280.72602708]\n",
      "14999.999999999996\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "50673it [00:08, 6134.65it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 283.367321   1098.53724821  568.40520618  558.30907877  911.76925865\n",
      "  513.46031361 1493.62843679  861.40251257 1000.66247605 1108.56273422\n",
      " 1193.8503129  1666.00390449  935.59140955 1552.66297864 1253.78680837]\n",
      "14999.999999999998\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "60718it [00:09, 6277.64it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 280.68829089 1082.15378838  554.89832266  566.05763271  913.2066061\n",
      "  529.15492392 1479.41070695  827.03031504  994.42095273 1083.97841374\n",
      " 1241.51149838 1698.21299285  942.62257347 1576.68608247 1229.96689972]\n",
      "14999.999999999998\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "70780it [00:11, 6271.45it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 263.02572755 1089.09493774  552.52879072  574.30371669  896.13313682\n",
      "  537.32462818 1515.79203351  836.19523154  982.1957247  1118.93684487\n",
      " 1200.71578318 1657.141758    962.57822312 1564.00093282 1250.03253056]\n",
      "14999.999999999993\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "80887it [00:12, 6294.31it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 293.20776986 1065.85499567  575.61936236  548.06930002  921.17549758\n",
      "  526.40267552 1508.53735773  830.02127325 1010.69082689 1107.427363\n",
      " 1206.36685884 1637.1826122   905.09899599 1592.18377451 1272.16133656]\n",
      "14999.999999999989\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "91014it [00:14, 6238.90it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 304.85771867 1073.56305215  583.52128223  551.77789189  914.52111063\n",
      "  506.65489533 1484.31372331  831.36286465 1012.91884741 1123.88595568\n",
      " 1202.01116106 1674.63936657  915.06316284 1557.27689621 1263.63207135]\n",
      "14999.999999999996\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100000it [00:15, 6278.45it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CODE ONLY\n",
      "[('BCB', 'llama3_70b_i_0.6', 'exact_correctness'), ('BCB', 'llama3_8b_1.', 'exact_correctness'), ('BCB', 'llama3_8b_i_1.', 'exact_correctness'), ('BCB', 'qwen2_32b_i_0.6', 'exact_correctness'), ('BCB', 'qwen2_7b_i_0.6', 'exact_correctness')]\n",
      "['random_baseline', 'predent', 'len_norm_predent', 'seqlen_sample', 'seqlen_correct', 'TOKEN_SAR', 'SENT_SAR', 'SAR', 'log_pplx', 'sement', 'min_logprob', 'len_norm_sement', 'ptrue_neg_log_prob', 'GNLL', 'EigenScore']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "630it [00:00, 6299.09it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1001.  999.\n",
      " 1000. 1000. 1000.]\n",
      "15000.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "10874it [00:01, 6354.98it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 804.01418409 1147.68395817  574.62265614 1246.76651603 1231.36106232\n",
      "  896.05240653  944.95506385  918.8114861  1076.09554474  734.24436665\n",
      " 1035.98814133  860.7791311  1009.47462573 1317.98442295 1201.16643427]\n",
      "14999.999999999998\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "21118it [00:03, 6381.77it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 771.00050134 1185.54789104  449.00326895 1329.08086795 1274.36424686\n",
      "  850.42163839  937.19612949  889.1573784  1110.09780836  652.55258474\n",
      "  957.55506339  800.90863135 1049.95677877 1410.85859777 1332.29861321]\n",
      "14999.999999999998\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "30780it [00:04, 6401.87it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 768.70252721 1188.28841676  424.26158142 1368.29735593 1276.75681737\n",
      "  830.60277183  929.08838582  876.35383195 1116.71123327  572.77827073\n",
      "  952.94344964  792.28325408 1075.4307104  1409.65161428 1417.84977929]\n",
      "15000.00000000001\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "41045it [00:06, 6386.15it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 748.48582999 1210.01690301  381.08688727 1382.14484614 1323.33932461\n",
      "  821.8098178   937.23329575  850.53050048 1144.7551334   580.45340033\n",
      "  950.8975735   781.24663043 1039.79705715 1395.27088871 1452.93191141]\n",
      "15000.000000000005\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "51252it [00:08, 6331.09it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 769.80911627 1187.52503655  368.81794302 1357.09776711 1323.12578032\n",
      "  824.28096671  924.44177818  883.84128276 1147.19740067  519.45876763\n",
      "  997.7539531   762.52131001 1039.92385817 1406.71681033 1487.48822919]\n",
      "15000.000000000007\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "60849it [00:09, 6392.79it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 740.52840405 1200.83036189  376.12204951 1380.30935459 1307.55015146\n",
      "  838.51169936  908.13639509  855.20649416 1137.67233755  570.22266688\n",
      "  953.68546452  782.37621857 1020.21291189 1443.1144671  1485.52102339]\n",
      "15000.000000000011\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "71051it [00:11, 6364.77it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 724.95035847 1165.58458888  409.82092083 1383.37698812 1293.84427637\n",
      "  854.01028468  932.08071319  883.21895404 1143.72788181  541.40641401\n",
      "  979.40323156  730.39248468 1050.31635362 1409.90745728 1497.95909247]\n",
      "15000.00000000001\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "81255it [00:12, 6362.10it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 716.98001302 1194.75363123  366.90632807 1413.43792476 1313.69385822\n",
      "  869.67275207  925.66245594  878.12292482 1111.63345562  553.9884932\n",
      "  995.84049665  748.22353204  990.20276421 1410.21565615 1510.66571403]\n",
      "15000.000000000007\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "90852it [00:14, 6294.64it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 753.99470051 1180.61107387  341.80946564 1408.47134213 1318.38707609\n",
      "  840.52994022  917.89146231  844.41034779 1091.12857875  574.67076025\n",
      "  959.52426084  756.33038312 1011.95259824 1469.51473567 1530.77327458]\n",
      "15000.000000000002\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100000it [00:15, 6345.07it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CONSTRAINED TEXT ONLY\n",
      "[('COLLIE', 'falcon_mamba_1.', 'exact_correctness'), ('COLLIE', 'falcon_mamba_i_1.', 'exact_correctness'), ('COLLIE', 'llama3_70b_i_1.', 'exact_correctness'), ('COLLIE', 'llama3_8b_1.', 'exact_correctness'), ('COLLIE', 'llama3_8b_i_1.', 'exact_correctness'), ('COLLIE', 'phi35_1.', 'exact_correctness'), ('COLLIE', 'phi35_i_1.', 'exact_correctness'), ('COLLIE', 'qwen2_32b_i_0.6', 'exact_correctness'), ('COLLIE', 'qwen2_7b_i_0.6', 'exact_correctness')]\n",
      "['random_baseline', 'predent', 'len_norm_predent', 'seqlen_sample', 'seqlen_correct', 'TOKEN_SAR', 'SENT_SAR', 'SAR', 'log_pplx', 'sement', 'min_logprob', 'len_norm_sement', 'ptrue_neg_log_prob', 'GNLL', 'EigenScore']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "610it [00:00, 6090.18it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.  999. 1001.\n",
      " 1000. 1000. 1000.]\n",
      "15000.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "10713it [00:01, 6274.44it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 905.04217698 1029.16561409  699.74753708 1229.28551941 1225.20795254\n",
      " 1245.31597936 1139.05158767 1116.92392647  959.58252803  653.6709178\n",
      " 1004.27671572  784.89248961 1029.43600156 1140.12381366  838.27724001]\n",
      "15000.000000000007\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "20814it [00:03, 6354.81it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 872.51570325 1026.67911342  634.39221903 1294.1731761  1267.95948791\n",
      " 1317.10575027 1200.56047535 1157.66122359  970.5003826   575.27753299\n",
      "  981.47436022  736.65620351 1010.34114104 1141.59232636  813.11090436]\n",
      "15000.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "31010it [00:04, 6333.14it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 863.77115264 1022.96643661  603.21038675 1310.99744723 1290.03549113\n",
      " 1350.97360037 1187.70104613 1127.0881127   962.86211643  581.14968187\n",
      "  981.9799082   716.44308229 1017.77809865 1205.82554859  777.21789041]\n",
      "15000.000000000002\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "41172it [00:06, 6305.10it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 873.69275412 1040.48762889  550.10553542 1283.77373002 1328.44454788\n",
      " 1351.44615044 1169.48641803 1149.81215393  949.52354486  525.8361263\n",
      "  996.12886846  745.91909269 1047.58290214 1216.32746559  771.43308123]\n",
      "14999.999999999993\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "51251it [00:08, 6251.79it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 898.08813068 1062.94406822  561.91994656 1298.860959   1303.66640112\n",
      " 1352.83870933 1205.28949579 1137.89799552  958.10772807  534.61263806\n",
      "  961.17121971  711.38566519 1049.87312694 1204.57314949  758.77076631]\n",
      "15000.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "61298it [00:09, 6249.18it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 890.10242966 1036.1292424   559.4494819  1320.37649811 1314.33765353\n",
      " 1368.62353705 1222.63612332 1135.71244509  927.35160944  517.47898435\n",
      "  971.7566211   726.59812269 1061.57290271 1175.92091166  771.95343697]\n",
      "14999.999999999995\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "70678it [00:11, 6216.31it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 879.98449278 1043.97098623  568.49912442 1285.40286283 1336.06886693\n",
      " 1370.23575176 1191.11465707 1172.22356435  938.23933073  525.8775335\n",
      "  923.53506316  721.5455596  1055.92150845 1223.93814669  763.44255149]\n",
      "14999.999999999996\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "80731it [00:12, 6305.25it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 880.31632601 1028.64228843  582.21286067 1231.11279134 1337.5847339\n",
      " 1392.91037893 1178.22671427 1167.99710165  926.6329877   563.27336864\n",
      "  949.86675256  697.45220079 1034.07176451 1229.60581626  800.09391432]\n",
      "14999.99999999999\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "90858it [00:14, 6236.81it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 899.93474235 1059.38740085  597.42531281 1269.26015175 1304.48278499\n",
      " 1363.53425985 1173.06758086 1168.55764378  928.88734116  540.49817824\n",
      "  997.22413578  722.53796438 1030.03603336 1192.58309712  752.58337271]\n",
      "14999.999999999996\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100000it [00:15, 6263.78it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "IFT ONLY\n",
      "[('BCB', 'llama3_70b_i_0.6', 'exact_correctness'), ('BCB', 'llama3_8b_i_1.', 'exact_correctness'), ('BCB', 'qwen2_32b_i_0.6', 'exact_correctness'), ('BCB', 'qwen2_7b_i_0.6', 'exact_correctness'), ('COLLIE', 'falcon_mamba_i_1.', 'exact_correctness'), ('COLLIE', 'llama3_70b_i_1.', 'exact_correctness'), ('COLLIE', 'llama3_8b_i_1.', 'exact_correctness'), ('COLLIE', 'phi35_1.', 'exact_correctness'), ('COLLIE', 'phi35_i_1.', 'exact_correctness'), ('COLLIE', 'qwen2_32b_i_0.6', 'exact_correctness'), ('COLLIE', 'qwen2_7b_i_0.6', 'exact_correctness'), ('COQA', 'llama3_8b_i_1.', 'spmoji'), ('COQA', 'phi35_i_1.', 'spmoji'), ('COQA', 'qwen2_32b_i_0.6', 'spmoji'), ('COQAPERT', 'llama3_70b_i_0.6', 'perturb'), ('COQAPERT', 'llama3_8b_i_0.6', 'perturb'), ('COQAPERT', 'qwen2_32b_i_0.6', 'perturb'), ('COQAPERT', 'qwen2_7b_i_0.6', 'perturb'), ('KUQ', 'llama3_8b_i_1.', 'ood_id'), ('KUQ', 'phi35_i_1.', 'ood_id'), ('SQUAD', 'llama3_70b_i_1.', 'ood_id'), ('SQUAD', 'llama3_70b_i_1.', 'spmoji'), ('SQUAD', 'llama3_8b_i_1.', 'ood_id'), ('SQUAD', 'llama3_8b_i_1.', 'spmoji'), ('SQUAD', 'phi35_i_1.', 'ood_id'), ('SQUAD', 'phi35_i_1.', 'spmoji'), ('SQUAD', 'qwen2_32b_i_0.6', 'ood_id'), ('SQUAD', 'qwen2_32b_i_0.6', 'spmoji'), ('SQUADPERT', 'llama3_70b_i_0.6', 'perturb'), ('SQUADPERT', 'llama3_8b_i_0.6', 'perturb'), ('SQUADPERT', 'qwen2_32b_i_0.6', 'perturb'), ('SQUADPERT', 'qwen2_7b_i_0.6', 'perturb'), ('TRIVIA', 'llama3_8b_i_1.', 'spmoji'), ('TRIVIA', 'phi35_i_1.', 'spmoji')]\n",
      "['random_baseline', 'predent', 'len_norm_predent', 'seqlen_sample', 'seqlen_correct', 'TOKEN_SAR', 'SENT_SAR', 'SAR', 'log_pplx', 'sement', 'min_logprob', 'len_norm_sement', 'ptrue_neg_log_prob', 'GNLL', 'EigenScore']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "613it [00:00, 6129.77it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.  999. 1001.\n",
      " 1000. 1000. 1000.]\n",
      "15000.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "10723it [00:01, 6299.84it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 608.33215518 1111.38027034  756.83531454  976.2868158  1080.06998244\n",
      "  927.43827804 1140.33890514  943.32792619 1192.64124337  949.77175938\n",
      "  802.59171959 1173.40526987  963.22639105 1253.72548809 1120.62848099]\n",
      "14999.999999999993\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "20876it [00:03, 6316.50it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 499.79467774 1157.59880109  676.36102806  973.58932516 1148.97346991\n",
      "  903.66636423 1111.19465538  886.47416137 1275.34521666  909.36706768\n",
      "  757.19560303 1259.06604318  928.26082945 1343.33897791 1169.77377916]\n",
      "14999.999999999998\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "31078it [00:04, 6387.50it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 439.85631591 1173.29966726  661.89996192  961.43304945 1122.81948999\n",
      "  889.24363751 1124.11563777  910.99396159 1294.16246506  928.39068246\n",
      "  754.97825322 1240.62276997  961.82819162 1345.04276126 1191.31315502]\n",
      "14999.999999999989\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "41251it [00:06, 6262.44it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 419.83607073 1158.22678456  660.88009427  916.88770121 1169.47903222\n",
      "  858.41079396 1116.51223446  877.41249395 1288.48496419  931.09044678\n",
      "  748.02219955 1276.89329296  970.2592234  1389.90540658 1217.6992612 ]\n",
      "14999.999999999987\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "50710it [00:08, 6256.38it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 429.35653237 1176.05532726  625.44433258  926.36028154 1171.68790036\n",
      "  851.08890882 1149.13620152  871.86285153 1297.75971633  936.99902886\n",
      "  736.924593   1257.89658939  993.56374846 1352.73405417 1223.12993382]\n",
      "14999.999999999987\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "60849it [00:09, 6302.96it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 405.27547301 1190.20251233  607.60164956  929.95214002 1159.64525961\n",
      "  871.65421204 1153.09277736  893.54167952 1293.56080785  919.28541259\n",
      "  759.87859268 1281.52894479  923.45472185 1384.92616321 1226.39965358]\n",
      "14999.999999999985\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "70919it [00:11, 6240.17it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 369.77859778 1181.34909529  628.9209835   917.35153187 1129.53854311\n",
      "  873.07532543 1179.88082096  928.49922208 1275.50566786  932.0961581\n",
      "  717.43817176 1272.60382137  938.57945677 1402.69408479 1252.68851932]\n",
      "14999.999999999993\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "81039it [00:12, 6348.15it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 393.04368525 1180.51514516  631.73065115  899.28775357 1162.63012171\n",
      "  902.60129819 1139.74144295  912.31444724 1304.70340414  945.82897228\n",
      "  729.54182438 1249.87945307  916.28957592 1398.42170233 1233.47052266]\n",
      "14999.999999999996\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "91218it [00:14, 6216.27it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 433.21559294 1197.76804261  666.71708955  881.58406637 1151.25486752\n",
      "  827.88377769 1130.38139907  890.24376366 1281.39472601  948.07992473\n",
      "  732.6248337  1300.41879411  935.24091965 1383.44184224 1239.75036013]\n",
      "14999.999999999984\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100000it [00:15, 6286.54it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PRETRAINED ONLY\n",
      "[('BCB', 'llama3_8b_1.', 'exact_correctness'), ('COLLIE', 'falcon_mamba_1.', 'exact_correctness'), ('COLLIE', 'llama3_8b_1.', 'exact_correctness'), ('COQA', 'llama3_70b_1.', 'spmoji'), ('COQA', 'llama3_8b_1.', 'spmoji'), ('SQUAD', 'llama3_8b_1.', 'ood_id'), ('SQUAD', 'llama3_8b_1.', 'spmoji')]\n",
      "['random_baseline', 'predent', 'len_norm_predent', 'seqlen_sample', 'seqlen_correct', 'TOKEN_SAR', 'SENT_SAR', 'SAR', 'log_pplx', 'sement', 'min_logprob', 'len_norm_sement', 'ptrue_neg_log_prob', 'GNLL', 'EigenScore']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "609it [00:00, 6083.07it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.  999. 1001.\n",
      " 1000. 1000. 1000.]\n",
      "15000.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "10641it [00:01, 6261.24it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 957.54580758  875.16318726  568.70514882 1035.58893704 1508.11181924\n",
      " 1000.82722175 1256.66439246  998.41188997  579.6636465   731.62962763\n",
      " 1272.765671   1014.81780319 1003.01105037 1291.92328989  905.1705073 ]\n",
      "14999.999999999996\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "20671it [00:03, 6238.12it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 956.07823397  823.36334179  419.24513311 1058.11447628 1696.36279939\n",
      "  990.66821785 1365.38886579  966.90040364  411.01755441  656.27618925\n",
      " 1340.14933277 1063.74780015  986.69414712 1357.41766955  908.57583493]\n",
      "14999.999999999993\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "30705it [00:04, 6189.14it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 942.79835267  810.95371677  369.68879664 1040.33102906 1757.48919082\n",
      "  973.49091386 1421.74054171 1004.31030816  358.35565379  635.72077304\n",
      " 1372.84097622 1003.06790756  959.37999494 1432.32708325  917.50476151]\n",
      "14999.999999999996\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "40715it [00:06, 6250.44it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 938.7452892   818.34187839  337.95442342 1041.07698255 1802.52011822\n",
      " 1004.46395491 1423.64199527  995.7339317   327.23696759  580.87491753\n",
      " 1367.10413711 1016.29165544  953.28141533 1452.30122939  940.43110395]\n",
      "14999.999999999998\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "50715it [00:08, 6219.67it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 936.68240359  787.85727164  297.39455943 1040.81531752 1804.94245625\n",
      "  991.23046075 1433.81339679  979.08206234  307.74158851  640.27191085\n",
      " 1375.90524849 1051.14009617  967.16229111 1457.43261843  928.52831811]\n",
      "14999.999999999998\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "60717it [00:09, 6231.67it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 929.71288267  831.43063565  295.53419959 1040.04855414 1817.1751787\n",
      "  965.16608972 1424.1115703   963.64652325  298.37913112  636.54110454\n",
      " 1350.2893456  1059.5111211  1024.09536724 1448.38460299  915.97369338]\n",
      "14999.999999999995\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "70742it [00:11, 6255.83it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 966.80155976  800.78639595  294.66881654 1045.82089589 1825.42129781\n",
      "  967.00513227 1439.61562001  994.0717861   287.22111839  637.13418801\n",
      " 1367.6293192  1037.09297956 1002.31792691 1450.60645901  883.80650459]\n",
      "14999.999999999984\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "80777it [00:12, 6263.23it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 949.69834756  792.67238054  272.67469649 1024.91462196 1823.32788028\n",
      " 1012.13446457 1420.1327354   943.18649137  274.44019286  676.16514299\n",
      " 1371.65311753 1061.3091232  1012.86442692 1472.06324342  892.7631349 ]\n",
      "14999.999999999995\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "90881it [00:14, 6206.20it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 979.04476473  770.49545973  296.434556    980.99188604 1840.76587417\n",
      " 1002.0703837  1409.93591643  968.69356856  300.12799608  631.40194859\n",
      " 1405.00932822 1046.2375662  1046.63320753 1462.70741662  859.45012738]\n",
      "14999.999999999984\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100000it [00:16, 6230.00it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "OOD ONLY\n",
      "[('KUQ', 'llama3_8b_i_1.', 'ood_id'), ('KUQ', 'phi35_i_1.', 'ood_id'), ('SQUAD', 'llama3_70b_i_1.', 'ood_id'), ('SQUAD', 'llama3_8b_1.', 'ood_id'), ('SQUAD', 'llama3_8b_i_1.', 'ood_id'), ('SQUAD', 'phi35_i_1.', 'ood_id'), ('SQUAD', 'qwen2_32b_i_0.6', 'ood_id')]\n",
      "['random_baseline', 'predent', 'len_norm_predent', 'seqlen_sample', 'seqlen_correct', 'TOKEN_SAR', 'SENT_SAR', 'SAR', 'log_pplx', 'sement', 'min_logprob', 'len_norm_sement', 'ptrue_neg_log_prob', 'GNLL', 'EigenScore']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "609it [00:00, 6086.70it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.  999. 1001.\n",
      " 1000. 1000. 1000.]\n",
      "15000.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "10724it [00:01, 6304.83it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 553.40861565 1107.29173192  635.18539738 1015.89304311 1150.79578879\n",
      "  947.70458219 1100.21041796  943.88648592  979.61752696  805.23939404\n",
      "  834.96813972 1303.12522433  974.83310077 1404.05190983 1243.78864141]\n",
      "15000.000000000007\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "20813it [00:03, 6296.47it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 413.01770018 1131.19703398  500.68409719  997.95182184 1181.61247869\n",
      "  945.86420368 1128.37543642  994.60622905  969.39940167  736.43892197\n",
      "  819.42491426 1348.78020839  969.85665157 1521.39614333 1341.39475778]\n",
      "15000.000000000011\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "30876it [00:04, 6247.08it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 344.27292727 1177.10684893  447.24590606  979.94713575 1192.94404435\n",
      "  973.01741742 1137.80942267  979.31799642  974.7347885   739.15542748\n",
      "  795.52588459 1367.2781039   948.56561509 1581.6238439  1361.45463767]\n",
      "15000.000000000015\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "40934it [00:06, 6231.73it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 317.04543998 1168.11848124  432.53441196  986.39437059 1213.54509563\n",
      "  936.24550558 1173.98106909  937.02577009  973.03792758  755.73443914\n",
      "  784.73450186 1379.31616258  941.69093654 1628.51704463 1372.07884351]\n",
      "15000.00000000002\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "50936it [00:08, 6222.37it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 306.56768978 1159.63057556  415.72183742  995.86913702 1190.90102578\n",
      "  910.75022656 1181.98109408  946.04364677  950.77324731  761.60894278\n",
      "  748.90049314 1425.81482463  955.98822654 1664.73850835 1384.71052428]\n",
      "15000.000000000027\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "61003it [00:09, 6261.34it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 316.94407977 1172.38595271  423.96028169 1001.19448954 1201.06736147\n",
      "  944.84539704 1138.39922617  967.91349864  938.28961283  716.52515542\n",
      "  755.44226658 1400.22287514  979.8526099  1689.57620784 1353.38098525]\n",
      "15000.000000000027\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "71023it [00:11, 6218.10it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 310.65607714 1166.88347993  411.21627318  993.49288954 1233.70676548\n",
      "  976.95012453 1155.67082207  964.24957165  969.69324922  709.6792587\n",
      "  752.89807378 1388.88697442  941.6190067  1670.5569077  1353.84052594]\n",
      "15000.00000000003\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "81053it [00:12, 6244.38it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 328.73443671 1150.29059046  407.5646678   971.77692013 1237.9667247\n",
      "  949.0037893  1187.18957117  958.09085199  946.18124031  727.42803458\n",
      "  754.04279658 1414.64671379  918.80542675 1688.54437523 1359.7338605 ]\n",
      "15000.000000000035\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "91125it [00:14, 6188.60it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 307.37654143 1131.62438484  438.24687958  943.08815397 1232.81561973\n",
      "  935.54457002 1183.12610188  965.2804166   952.9881468   712.01685254\n",
      "  769.37211076 1407.17176172  948.94096969 1721.1354261  1351.27206432]\n",
      "15000.000000000025\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100000it [00:16, 6242.75it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PERTURB ONLY\n",
      "[('COQAPERT', 'llama3_70b_i_0.6', 'perturb'), ('COQAPERT', 'llama3_8b_i_0.6', 'perturb'), ('COQAPERT', 'qwen2_32b_i_0.6', 'perturb'), ('COQAPERT', 'qwen2_7b_i_0.6', 'perturb'), ('SQUADPERT', 'llama3_70b_i_0.6', 'perturb'), ('SQUADPERT', 'llama3_8b_i_0.6', 'perturb'), ('SQUADPERT', 'qwen2_32b_i_0.6', 'perturb'), ('SQUADPERT', 'qwen2_7b_i_0.6', 'perturb')]\n",
      "['random_baseline', 'predent', 'len_norm_predent', 'seqlen_sample', 'seqlen_correct', 'TOKEN_SAR', 'SENT_SAR', 'SAR', 'log_pplx', 'sement', 'min_logprob', 'len_norm_sement', 'ptrue_neg_log_prob', 'GNLL', 'EigenScore']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "633it [00:00, 6328.62it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.  999. 1001.\n",
      " 1000. 1000. 1000.]\n",
      "15000.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "10995it [00:01, 6436.55it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 530.78895218 1045.25804382  894.52122791  819.21728954 1508.01570286\n",
      "  868.61716962  989.76978226  854.81712523 1539.00496163 1148.4107747\n",
      "  365.36731296 1395.90101659  832.11019692 1208.2004438  1000.        ]\n",
      "14999.999999999998\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "20728it [00:03, 6482.23it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 333.74107311 1094.41923793  866.00802482  743.45841651 1704.72610321\n",
      "  840.59788411  999.51028832  763.52248168 1759.36855312 1194.81715693\n",
      "   47.61125278 1549.56162712  801.70897335 1300.94892701 1000.        ]\n",
      "14999.999999999996\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "40985it [00:06, 6505.62it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 155.21436358 1131.37713736  839.25566053  676.98186716 1941.54230618\n",
      "  829.96337459 1011.04608703  701.09752777 1969.23141189 1251.99678342\n",
      " -301.55060876 1688.82501206  734.90871461 1370.11036258 1000.        ]\n",
      "14999.999999999998\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "50691it [00:07, 6410.79it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 128.82903242 1154.37798805  800.19233433  649.18160537 1977.67023717\n",
      "  817.79379965 1047.96640699  721.91339053 2021.47312197 1251.07641239\n",
      " -411.42533187 1740.95516019  713.34091705 1386.65492575 1000.        ]\n",
      "14999.999999999996\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "61084it [00:09, 6471.06it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[  70.38277219 1153.30020663  812.41752408  650.17150368 2012.18474943\n",
      "  798.47641856 1041.69413951  712.45434415 2075.28043841 1273.47439599\n",
      " -502.92348177 1790.34284344  716.74928512 1395.99486058 1000.        ]\n",
      "14999.999999999995\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "70793it [00:10, 6441.97it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[  14.13898852 1161.39987099  808.15476125  653.95965689 2086.07770308\n",
      "  796.06195038 1021.58668808  696.27797108 2114.19766922 1276.99831284\n",
      " -577.15408478 1813.32364416  737.73681037 1397.24005793 1000.        ]\n",
      "14999.99999999999\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "81138it [00:12, 6468.99it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[  -7.4725692  1158.69723905  826.7777092   628.13219601 2103.37820435\n",
      "  815.18318482 1037.11520686  684.95821591 2155.35949711 1331.75967564\n",
      " -648.41678816 1786.52932886  703.38434995 1424.61454961 1000.        ]\n",
      "15000.000000000005\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "90900it [00:14, 6360.12it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ -10.75223195 1159.24961063  855.56564834  627.04469397 2086.65223402\n",
      "  769.79176759 1044.12587052  699.74579847 2193.31796557 1325.78093673\n",
      " -707.31790808 1811.27133565  692.28187126 1453.24240728 1000.        ]\n",
      "15000.000000000007\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100000it [00:15, 6442.00it/s]\n"
     ]
    }
   ],
   "source": [
    "# COMPUTE ELO SCORES FOR EACH CATEGORY FROM ABOVE\n",
    "from tqdm import tqdm\n",
    "\n",
    "def sigmoid(Z, scale=400):\n",
    "    A=1/(1+(np.exp((-Z/400))))\n",
    "    return A\n",
    "\n",
    "\n",
    "selscore = 'auc_score'\n",
    "n_games = 100000\n",
    "K_const = 2\n",
    "\n",
    "elo_packs = {}\n",
    "\n",
    "for grand_identifier, all_subdfs in big_accumulator.items():\n",
    "    # technical\n",
    "    print(grand_identifier)\n",
    "    np.random.seed(228)\n",
    "    ds_model_pairs = list(all_subdfs.keys())\n",
    "    ds_model_probs = np.ones(len(ds_model_pairs))/len(ds_model_pairs)\n",
    "    \n",
    "    # initialize\n",
    "    print(ds_model_pairs)\n",
    "    \n",
    "    method_names = ['random_baseline',] + list(preopt_df.uncertainty.unique()) # list(all_subdfs[ds_model_pairs[0]].uncertainty.unique())\n",
    "    \n",
    "    method_elos = np.zeros(len(method_names))+1000.\n",
    "    elo_hist = []\n",
    "    # \n",
    "    print(method_names)\n",
    "    \n",
    "    game_choice = np.random.choice(len(ds_model_probs), size=n_games, replace=True, p=ds_model_probs) \n",
    "    method_a_choice = np.random.choice(len(method_names), size=n_games, replace=True) #, p=ds_model_probs) \n",
    "    method_b_choice = np.random.choice(len(method_names), size=n_games, replace=True) # p=ds_model_probs) \n",
    "    # the main loop\n",
    "    for step, (gi, met_a, met_b) in tqdm(enumerate(zip(game_choice, method_a_choice, method_b_choice))):\n",
    "        if met_a == met_b:\n",
    "            # no self play allowed, this is a christian manga :)\n",
    "            continue\n",
    "        # pick a problem according to the defined probability\n",
    "        subdf = all_subdfs[ds_model_pairs[gi]]\n",
    "    \n",
    "        try:\n",
    "            if met_a == 0:\n",
    "                col_a = 0.5 # change if using daupr for whatever reason\n",
    "            else:\n",
    "                col_a = subdf[subdf.uncertainty==method_names[met_a]][selscore].item()\n",
    "            if met_b == 0:\n",
    "                col_b = 0.5 # change if using daupr for whatever reason\n",
    "            else:\n",
    "                col_b = subdf[subdf.uncertainty==method_names[met_b]][selscore].item()\n",
    "        except:\n",
    "            continue\n",
    "\n",
    "        S_A = 1. if col_a > col_b else 0.\n",
    "        S_B = 1. - S_A\n",
    "    \n",
    "        # compute the elo updates\n",
    "        r_a = method_elos[met_a]\n",
    "        r_b = method_elos[met_b]\n",
    "        sigma_r_ab = sigmoid(r_a - r_b)\n",
    "        sigma_r_ba = 1- sigma_r_ab # sigmoid(r_b - r_a)\n",
    "    \n",
    "        r_a_u = r_a + K_const * (S_A - sigma_r_ab)\n",
    "        r_b_u = r_b + K_const * (S_B - sigma_r_ba)\n",
    "    \n",
    "        method_elos[met_a] = r_a_u\n",
    "        method_elos[met_b] = r_b_u\n",
    "\n",
    "        # if this game results in negative Elo, roll back, no negatives allowed due to possible infinite drain\n",
    "        if method_elos[met_a] < 0. or method_elos[met_b] < 0.:\n",
    "            method_elos[met_b] = r_b\n",
    "            method_elos[met_a] = r_a\n",
    "            \n",
    "\n",
    "        # print(col_a, col_b, S_A, S_B, r_a, r_b, sigma_r_ab, sigma_r_ba, r_a_u, r_b_u)\n",
    "        elo_hist.append(np.copy(method_elos))\n",
    "    \n",
    "        if step%10000==0:\n",
    "            print(method_elos)\n",
    "            print(method_elos.sum())\n",
    "    \n",
    "    # save to the dict\n",
    "    elo_packs[grand_identifier] = {\n",
    "        'elo_history': elo_hist,\n",
    "        'method_names': method_names,\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "8148051c-f868-494f-b093-feee99acd6b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "with open('csvs/elo_scores_05_15p.pkl', 'wb') as f:\n",
    "    pickle.dump(elo_packs, f)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "ddd5b6b6-2d85-472e-9c60-fbb9677fa598",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "with open('csvs/elo_scores_05_15p.pkl', 'rb') as f:\n",
    "    elo_packs = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "d7d2323e-fc66-46bf-abcd-ed595dde1987",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ALL TASKS\n",
      "random_baseline 524.9519329399285 10.26222702599378\n",
      "predent 1105.3540880937928 27.54011710172134\n",
      "len_norm_predent 602.359353712436 66.62426288755438\n",
      "seqlen_sample 960.342189592219 10.83476716379451\n",
      "seqlen_correct 1218.6916885370522 48.023653930277355\n",
      "TOKEN_SAR 896.3971056781702 8.033277158728568\n",
      "SENT_SAR 1174.1497283846306 37.494162940634475\n",
      "SAR 916.7492737146486 17.439864448124762\n",
      "log_pplx 1137.7858048326284 11.138435673230562\n",
      "sement 890.0656809519802 22.32133607582665\n",
      "min_logprob 876.1224500117323 29.410481557765888\n",
      "len_norm_sement 1261.4087346089211 11.988108573667807\n",
      "ptrue_neg_log_prob 907.2394934296223 70.6965450964048\n",
      "GNLL 1386.3895051513846 12.703571357280788\n",
      "EigenScore 1141.9929703608277 11.648706538725516\n",
      "QA ONLY\n",
      "random_baseline 269.1602003702104 4.328483561258366\n",
      "predent 1044.5385461184642 9.748891933691786\n",
      "len_norm_predent 570.1775911583937 8.834531840825528\n",
      "seqlen_sample 577.674933437123 36.80987421629478\n",
      "seqlen_correct 935.7765436686266 4.578940439667584\n",
      "TOKEN_SAR 512.8565575063507 2.4556685945895556\n",
      "SENT_SAR 1506.2631868550466 13.946583036307878\n",
      "SAR 842.9464636620597 11.235363123392801\n",
      "log_pplx 1007.7114756092853 7.389745369995842\n",
      "sement 1094.0964223202675 9.527278828240412\n",
      "min_logprob 1187.866775402334 45.320133523249154\n",
      "len_norm_sement 1681.9182925751124 13.798540737012322\n",
      "ptrue_neg_log_prob 928.7222775229608 8.56159250909688\n",
      "GNLL 1590.1597470542342 12.70300755772845\n",
      "EigenScore 1250.130986739506 8.349511909090705\n",
      "CODE ONLY\n",
      "random_baseline 739.2070229717734 3.6898221898980945\n",
      "predent 1181.0389625522268 17.81941280794212\n",
      "len_norm_predent 378.80126872521873 2.7226394411283827\n",
      "seqlen_sample 1377.0580778341448 11.26526247048523\n",
      "seqlen_correct 1305.8561357317653 5.622777869812973\n",
      "TOKEN_SAR 825.1485796934475 5.802683565087657\n",
      "SENT_SAR 910.7211748145261 13.31601097547779\n",
      "SAR 856.7637151477091 14.720746110861691\n",
      "log_pplx 1095.6948792573276 24.690381819180626\n",
      "sement 579.6304583151476 30.74010822907375\n",
      "min_logprob 951.0079003362334 17.321096742745038\n",
      "len_norm_sement 756.9561158358827 9.630649105730365\n",
      "ptrue_neg_log_prob 1051.8847549898996 8.364885555054203\n",
      "GNLL 1437.1697100135568 9.85060870847218\n",
      "EigenScore 1553.0612437811365 2.6296646263224637\n",
      "CONSTRAINED TEXT ONLY\n",
      "random_baseline 864.4401816631869 5.171686975101058\n",
      "predent 1023.850868604958 29.229508977692664\n",
      "len_norm_predent 570.2045300975642 3.984314748007711\n",
      "seqlen_sample 1292.3359954000716 27.022928873244464\n",
      "seqlen_correct 1308.3003246918415 60.42263696230505\n",
      "TOKEN_SAR 1359.5047162762742 39.906784791259355\n",
      "SENT_SAR 1188.8523292786797 4.767504063082425\n",
      "SAR 1148.8805823559105 12.060851453319993\n",
      "log_pplx 905.8854542458694 21.976583332972137\n",
      "sement 580.8327191457063 6.385421721283534\n",
      "min_logprob 1032.9575602007537 8.39772190340921\n",
      "len_norm_sement 725.5218006822349 7.0255231202659045\n",
      "ptrue_neg_log_prob 1032.3491448953853 12.851570151751153\n",
      "GNLL 1229.8182695459718 15.895853118378406\n",
      "EigenScore 736.265522915568 13.717318618879638\n",
      "IFT ONLY\n",
      "random_baseline 392.64145607903407 5.427621990330059\n",
      "predent 1156.7881825077475 6.456852394087215\n",
      "len_norm_predent 652.8691486781337 12.279043416282004\n",
      "seqlen_sample 928.6926232636868 9.779639987920083\n",
      "seqlen_correct 1154.8174440452078 106.44303973674099\n",
      "TOKEN_SAR 832.4452984682088 13.29242606909223\n",
      "SENT_SAR 1145.422989284868 11.89601789357824\n",
      "SAR 901.1537874425486 11.4042348159045\n",
      "log_pplx 1283.022749393578 14.76848694498816\n",
      "sement 921.4904907391657 17.02043873373071\n",
      "min_logprob 740.9441848793227 16.646637044947134\n",
      "len_norm_sement 1312.8274752899104 5.154926927944998\n",
      "ptrue_neg_log_prob 955.6197836091592 54.28885312979788\n",
      "GNLL 1372.0003576867498 39.06108960962633\n",
      "EigenScore 1249.264028632641 2.2811707417456653\n",
      "PRETRAINED ONLY\n",
      "random_baseline 1003.7409878314004 7.864204447036224\n",
      "predent 757.6779792566138 19.53442575258336\n",
      "len_norm_predent 288.8511741109344 5.520172612165164\n",
      "seqlen_sample 996.5589098746717 7.344245032822374\n",
      "seqlen_correct 1864.4949222721536 10.487187083726578\n",
      "TOKEN_SAR 986.6345474037448 17.550818995594206\n",
      "SENT_SAR 1387.6795293459506 11.15023260569064\n",
      "SAR 945.9694417853985 10.045879890701595\n",
      "log_pplx 309.20567030118553 43.923528151627025\n",
      "sement 639.7815252593051 7.330419749261719\n",
      "min_logprob 1392.6067162429413 21.373236382930877\n",
      "len_norm_sement 1055.1984927604015 17.96696526257884\n",
      "ptrue_neg_log_prob 979.4566452707504 8.80033850381979\n",
      "GNLL 1481.461124018691 61.45238579761964\n",
      "EigenScore 910.6823342658541 10.039928385863186\n",
      "OOD ONLY\n",
      "random_baseline 280.20847369251294 7.652767586351738\n",
      "predent 1145.8942369871509 2.9946793696635043\n",
      "len_norm_predent 408.3318546822898 11.442998710775491\n",
      "seqlen_sample 989.9325019030081 38.89638369125584\n",
      "seqlen_correct 1209.2006459751249 8.125626083079792\n",
      "TOKEN_SAR 892.0661494616375 21.13631915120509\n",
      "SENT_SAR 1172.2275050596174 10.497076186322284\n",
      "SAR 993.4819659207125 17.83552007554784\n",
      "log_pplx 969.3571198907928 16.025855961264448\n",
      "sement 715.9456373094985 21.374006599926993\n",
      "min_logprob 779.9737694836267 11.971806053683368\n",
      "len_norm_sement 1413.8679348866392 60.82007967940123\n",
      "ptrue_neg_log_prob 949.5459166487603 23.289363365851607\n",
      "GNLL 1707.3588590959394 2.5427158977200692\n",
      "EigenScore 1372.6074290027125 10.207612259395068\n",
      "PERTURB ONLY\n",
      "random_baseline -21.967859611288283 4.225784829585881\n",
      "predent 1181.3231366162197 4.5122717647690465\n",
      "len_norm_predent 807.7678922292706 4.902630052294459\n",
      "seqlen_sample 645.3296883659034 8.625199472011253\n",
      "seqlen_correct 2118.5191178055043 2.851407955687894\n",
      "TOKEN_SAR 776.0021680391513 1.957531662371936\n",
      "SENT_SAR 1035.6684935867 6.888159672914498\n",
      "SAR 700.9538035787384 7.152892212529104\n",
      "log_pplx 2221.3449760892177 6.3809508201346805\n",
      "sement 1337.734718089769 5.3331021630212065\n",
      "min_logprob -760.2625963080187 2.612360705229262\n",
      "len_norm_sement 1841.9124065385704 2.4518885142450997\n",
      "ptrue_neg_log_prob 693.2152176116572 7.166452849953206\n",
      "GNLL 1422.4588373685929 2.756443567409625\n",
      "EigenScore 1000.0 0.0\n"
     ]
    }
   ],
   "source": [
    "for mode, elos in elo_packs.items():\n",
    "    elo_hist = elos['elo_history']\n",
    "    method_names = elos['method_names']\n",
    "    print(mode)\n",
    "\n",
    "    elo_hist_concat = np.stack(elo_hist)\n",
    "    # print(elo_hist_concat.shape)\n",
    "    elo_means = elo_hist_concat[-1000:].mean(0)\n",
    "    elo_vars = elo_hist_concat[-1000:].var(0)\n",
    "    \n",
    "    for n,elo_mean,elo_var in zip(method_names,elo_means,elo_vars):\n",
    "        print(n, elo_mean, elo_var)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a98098ca",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
