{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0b05c516-5540-4061-8590-0a62cebb58ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib\n",
    "from matplotlib import pyplot as plt\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ad22c6a0-cc8b-493a-8a41-784459460689",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the correctness cross df\n",
    "df = pd.read_csv('csvs/uncert_methods_selpred_scores_28_01.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "67c30afd-d6a5-41d8-bd43-b018e74fc49b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# we will try to find correctness threshold to maximize the performance of the word count uncertainty\n",
    "\n",
    "def seqlen_top_rank(\n",
    "    cordf, \n",
    "    correctness, \n",
    "    threshold,\n",
    "    use_score='daupr_score', #'auc_score', 'aupr_score', 'daupr_score'\n",
    "    top_rank=1\n",
    "):\n",
    "    specdf = df[(df.correctness==correctness)&(df.threshold==threshold)]\n",
    "    seqlen_sample_rank = 0\n",
    "    seqlen_correct_rank = 0\n",
    "    total = 0\n",
    "    for g, ids in specdf.groupby(['dataset', 'model']).groups.items():\n",
    "        # print(g, ids)\n",
    "        subdf = -specdf.loc[ids][['uncertainty', use_score]].set_index('uncertainty')\n",
    "        # print(subdf)\n",
    "        subdf = subdf.rank()\n",
    "        # print(subdf)\n",
    "        # do not consider identical ranks\n",
    "        # try:\n",
    "        seqlen_sample_rank += 1 if subdf.loc['seqlen_sample'].item()<=top_rank else 0\n",
    "        seqlen_correct_rank += 1 if subdf.loc['seqlen_correct'].item()<=top_rank else 0\n",
    "        total += 1\n",
    "        # except:\n",
    "        #     print(subdf)\n",
    "        #     print(subdf.loc['seqlen_correct'].item())\n",
    "\n",
    "    return {\n",
    "        'TOTAL': total,\n",
    "        'SEQLEN_SAMPLE_TOP': seqlen_sample_rank,\n",
    "        'SEQLEN_CORRECT_TOP': seqlen_correct_rank,\n",
    "        'COR': correctness,\n",
    "        'SCORE': use_score,\n",
    "        'THRESH': threshold,\n",
    "    }\n",
    "\n",
    "\n",
    "from collections import defaultdict\n",
    "\n",
    "def any_method_top_rank(\n",
    "    cordf, \n",
    "    correctness, \n",
    "    threshold,\n",
    "    use_score='daupr_score', #'auc_score', 'aupr_score', 'daupr_score'\n",
    "    top_rank=1\n",
    "):\n",
    "    specdf = df[(df.correctness==correctness)&(df.threshold==threshold)]\n",
    "    methods = defaultdict(int)\n",
    "    total = 0\n",
    "    for g, ids in specdf.groupby(['dataset', 'model']).groups.items():\n",
    "        # print(g, ids)\n",
    "        # subdf = specdf.loc[ids][['uncertainty', use_score]].set_index('uncertainty').rank()\n",
    "        subdf = -specdf.loc[ids][['uncertainty', use_score]].set_index('uncertainty')\n",
    "        # print(subdf)\n",
    "        subdf = subdf.rank()\n",
    "        # print(subdf)\n",
    "        # do not consider identical ranks\n",
    "        # try:\n",
    "        for i, el in zip(subdf.index, subdf.to_numpy()):\n",
    "            methods[i] += 1 if el.item()<=top_rank else 0\n",
    "        total += 1\n",
    "\n",
    "    return {\n",
    "        **methods,\n",
    "        'total': total,\n",
    "        'COR': correctness,\n",
    "        'SCORE': use_score,\n",
    "        'THRESH': threshold,\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbf7b094-9569-46e8-a5a1-720b8b69a02c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "7c9fd78f-b8b2-4098-8b7a-20e0f18353df",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>TOTAL</th>\n",
       "      <th>SEQLEN_SAMPLE_TOP</th>\n",
       "      <th>SEQLEN_CORRECT_TOP</th>\n",
       "      <th>COR</th>\n",
       "      <th>SCORE</th>\n",
       "      <th>THRESH</th>\n",
       "      <th>frac</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.6</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>115</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.8</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>114</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.7</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.6</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.9</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.8</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.7</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.4</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.2</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.4</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.2</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.9</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>146</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>148</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>150</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>151</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>152</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.9</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>144</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>81</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>82</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.4</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>86</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     TOTAL  SEQLEN_SAMPLE_TOP  SEQLEN_CORRECT_TOP                    COR  \\\n",
       "50       2                  2                   2  j_llama70b_gen_16_0.5   \n",
       "115      2                  2                   2   j_llama8b_gen_16_0.5   \n",
       "114      2                  2                   2   j_llama8b_gen_16_0.5   \n",
       "113      2                  2                   2   j_llama8b_gen_16_0.5   \n",
       "53       2                  2                   2  j_llama70b_gen_16_0.5   \n",
       "52       2                  2                   2  j_llama70b_gen_16_0.5   \n",
       "51       2                  2                   2  j_llama70b_gen_16_0.5   \n",
       "112      2                  2                   2   j_llama8b_gen_16_0.5   \n",
       "49       2                  2                   2  j_llama70b_gen_16_0.5   \n",
       "48       2                  2                   2  j_llama70b_gen_16_0.5   \n",
       "47       2                  2                   2  j_llama70b_gen_16_0.5   \n",
       "46       2                  2                   2  j_llama70b_gen_16_0.5   \n",
       "45       2                  2                   2  j_llama70b_gen_16_0.5   \n",
       "111      2                  2                   2   j_llama8b_gen_16_0.5   \n",
       "110      2                  2                   2   j_llama8b_gen_16_0.5   \n",
       "109      2                  2                   2   j_llama8b_gen_16_0.5   \n",
       "108      2                  2                   2   j_llama8b_gen_16_0.5   \n",
       "116      2                  2                   2   j_llama8b_gen_16_0.5   \n",
       "146      2                  1                   2    j_llama8b_qa_16_0.5   \n",
       "148      2                  1                   2    j_llama8b_qa_16_0.5   \n",
       "149      2                  1                   2    j_llama8b_qa_16_0.5   \n",
       "150      2                  1                   2    j_llama8b_qa_16_0.5   \n",
       "151      2                  1                   2    j_llama8b_qa_16_0.5   \n",
       "152      2                  1                   2    j_llama8b_qa_16_0.5   \n",
       "144      2                  1                   2    j_llama8b_qa_16_0.5   \n",
       "81       2                  1                   2   j_llama70b_qa_16_0.5   \n",
       "82       2                  1                   2   j_llama70b_qa_16_0.5   \n",
       "83       2                  1                   2   j_llama70b_qa_16_0.5   \n",
       "84       2                  1                   2   j_llama70b_qa_16_0.5   \n",
       "85       2                  1                   2   j_llama70b_qa_16_0.5   \n",
       "86       2                  1                   2   j_llama70b_qa_16_0.5   \n",
       "87       2                  1                   2   j_llama70b_qa_16_0.5   \n",
       "\n",
       "         SCORE  THRESH  frac  \n",
       "50   auc_score     0.6   1.0  \n",
       "115  auc_score     0.8   1.0  \n",
       "114  auc_score     0.7   1.0  \n",
       "113  auc_score     0.6   1.0  \n",
       "53   auc_score     0.9   1.0  \n",
       "52   auc_score     0.8   1.0  \n",
       "51   auc_score     0.7   1.0  \n",
       "112  auc_score     0.5   1.0  \n",
       "49   auc_score     0.5   1.0  \n",
       "48   auc_score     0.4   1.0  \n",
       "47   auc_score     0.3   1.0  \n",
       "46   auc_score     0.2   1.0  \n",
       "45   auc_score     0.1   1.0  \n",
       "111  auc_score     0.4   1.0  \n",
       "110  auc_score     0.3   1.0  \n",
       "109  auc_score     0.2   1.0  \n",
       "108  auc_score     0.1   1.0  \n",
       "116  auc_score     0.9   1.0  \n",
       "146  auc_score     0.3   0.5  \n",
       "148  auc_score     0.5   0.5  \n",
       "149  auc_score     0.6   0.5  \n",
       "150  auc_score     0.7   0.5  \n",
       "151  auc_score     0.8   0.5  \n",
       "152  auc_score     0.9   0.5  \n",
       "144  auc_score     0.1   0.5  \n",
       "81   auc_score     0.1   0.5  \n",
       "82   auc_score     0.2   0.5  \n",
       "83   auc_score     0.3   0.5  \n",
       "84   auc_score     0.4   0.5  \n",
       "85   auc_score     0.5   0.5  \n",
       "86   auc_score     0.6   0.5  \n",
       "87   auc_score     0.7   0.5  "
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "opt_df = df[(df.dataset.isin(['TRIVIA','COQA','SQUAD'])&(~df.correctness.str.contains('to'))&(~df.correctness.str.contains('from')))]\n",
    "reslist = []\n",
    "for (cor, thr), _ in opt_df.groupby(['correctness', 'threshold']).groups.items():\n",
    "    reslist.append(seqlen_top_rank(opt_df, cor, thr, use_score='auc_score', top_rank=3))\n",
    "resdf = pd.DataFrame.from_records(reslist)\n",
    "resdf['frac'] = resdf['SEQLEN_SAMPLE_TOP']/resdf['TOTAL']\n",
    "resdf.sort_values('frac', ascending=False).head(32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "0159151c-1dbb-48ac-a8ce-7b0131295de0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>TOTAL</th>\n",
       "      <th>SEQLEN_SAMPLE_TOP</th>\n",
       "      <th>SEQLEN_CORRECT_TOP</th>\n",
       "      <th>COR</th>\n",
       "      <th>SCORE</th>\n",
       "      <th>THRESH</th>\n",
       "      <th>frac</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>16</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>16</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>16</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>16</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.4</td>\n",
       "      <td>0.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>16</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>16</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>16</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>16</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>16</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.9</td>\n",
       "      <td>0.25</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    TOTAL  SEQLEN_SAMPLE_TOP  SEQLEN_CORRECT_TOP        COR      SCORE  \\\n",
       "18     16                  4                   5  bma_judge  auc_score   \n",
       "19     16                  4                   5  bma_judge  auc_score   \n",
       "20     16                  4                   5  bma_judge  auc_score   \n",
       "21     16                  4                   5  bma_judge  auc_score   \n",
       "22     16                  4                   5  bma_judge  auc_score   \n",
       "23     16                  4                   5  bma_judge  auc_score   \n",
       "24     16                  4                   5  bma_judge  auc_score   \n",
       "25     16                  4                   4  bma_judge  auc_score   \n",
       "26     16                  4                   4  bma_judge  auc_score   \n",
       "\n",
       "    THRESH  frac  \n",
       "18     0.1  0.25  \n",
       "19     0.2  0.25  \n",
       "20     0.3  0.25  \n",
       "21     0.4  0.25  \n",
       "22     0.5  0.25  \n",
       "23     0.6  0.25  \n",
       "24     0.7  0.25  \n",
       "25     0.8  0.25  \n",
       "26     0.9  0.25  "
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "resdf[resdf.COR=='bma_judge']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "330d73f3-7e99-4beb-b454-45cdc3c9cc8a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>TOTAL</th>\n",
       "      <th>SEQLEN_SAMPLE_TOP</th>\n",
       "      <th>SEQLEN_CORRECT_TOP</th>\n",
       "      <th>COR</th>\n",
       "      <th>SCORE</th>\n",
       "      <th>THRESH</th>\n",
       "      <th>frac</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.9</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.4</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.9</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.4</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.4</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.4</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.9</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>115</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>114</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     TOTAL  SEQLEN_SAMPLE_TOP  SEQLEN_CORRECT_TOP                      COR  \\\n",
       "103      2                  0                   1   j_llama8b_gen_16.0_0.5   \n",
       "47       2                  0                   1    j_llama70b_gen_16_0.5   \n",
       "100      2                  0                   1   j_llama8b_gen_16.0_0.5   \n",
       "99       2                  0                   1   j_llama8b_gen_16.0_0.5   \n",
       "53       2                  0                   1    j_llama70b_gen_16_0.5   \n",
       "52       2                  0                   1    j_llama70b_gen_16_0.5   \n",
       "51       2                  0                   1    j_llama70b_gen_16_0.5   \n",
       "50       2                  0                   1    j_llama70b_gen_16_0.5   \n",
       "49       2                  0                   1    j_llama70b_gen_16_0.5   \n",
       "48       2                  0                   1    j_llama70b_gen_16_0.5   \n",
       "46       2                  0                   1    j_llama70b_gen_16_0.5   \n",
       "42       2                  0                   1  j_llama70b_gen_16.0_0.5   \n",
       "45       2                  0                   1    j_llama70b_gen_16_0.5   \n",
       "44       2                  0                   1  j_llama70b_gen_16.0_0.5   \n",
       "43       2                  0                   1  j_llama70b_gen_16.0_0.5   \n",
       "36       2                  0                   1  j_llama70b_gen_16.0_0.5   \n",
       "37       2                  0                   1  j_llama70b_gen_16.0_0.5   \n",
       "38       2                  0                   1  j_llama70b_gen_16.0_0.5   \n",
       "39       2                  0                   1  j_llama70b_gen_16.0_0.5   \n",
       "40       2                  0                   1  j_llama70b_gen_16.0_0.5   \n",
       "101      2                  0                   1   j_llama8b_gen_16.0_0.5   \n",
       "102      2                  0                   1   j_llama8b_gen_16.0_0.5   \n",
       "104      2                  0                   1   j_llama8b_gen_16.0_0.5   \n",
       "111      2                  1                   1     j_llama8b_gen_16_0.5   \n",
       "116      2                  1                   1     j_llama8b_gen_16_0.5   \n",
       "115      2                  1                   1     j_llama8b_gen_16_0.5   \n",
       "114      2                  1                   1     j_llama8b_gen_16_0.5   \n",
       "113      2                  1                   1     j_llama8b_gen_16_0.5   \n",
       "105      2                  0                   1   j_llama8b_gen_16.0_0.5   \n",
       "112      2                  1                   1     j_llama8b_gen_16_0.5   \n",
       "110      2                  1                   1     j_llama8b_gen_16_0.5   \n",
       "109      2                  1                   1     j_llama8b_gen_16_0.5   \n",
       "\n",
       "         SCORE  THRESH  frac  \n",
       "103  auc_score     0.5   0.5  \n",
       "47   auc_score     0.3   0.5  \n",
       "100  auc_score     0.2   0.5  \n",
       "99   auc_score     0.1   0.5  \n",
       "53   auc_score     0.9   0.5  \n",
       "52   auc_score     0.8   0.5  \n",
       "51   auc_score     0.7   0.5  \n",
       "50   auc_score     0.6   0.5  \n",
       "49   auc_score     0.5   0.5  \n",
       "48   auc_score     0.4   0.5  \n",
       "46   auc_score     0.2   0.5  \n",
       "42   auc_score     0.7   0.5  \n",
       "45   auc_score     0.1   0.5  \n",
       "44   auc_score     0.9   0.5  \n",
       "43   auc_score     0.8   0.5  \n",
       "36   auc_score     0.1   0.5  \n",
       "37   auc_score     0.2   0.5  \n",
       "38   auc_score     0.3   0.5  \n",
       "39   auc_score     0.4   0.5  \n",
       "40   auc_score     0.5   0.5  \n",
       "101  auc_score     0.3   0.5  \n",
       "102  auc_score     0.4   0.5  \n",
       "104  auc_score     0.6   0.5  \n",
       "111  auc_score     0.4   0.5  \n",
       "116  auc_score     0.9   0.5  \n",
       "115  auc_score     0.8   0.5  \n",
       "114  auc_score     0.7   0.5  \n",
       "113  auc_score     0.6   0.5  \n",
       "105  auc_score     0.7   0.5  \n",
       "112  auc_score     0.5   0.5  \n",
       "110  auc_score     0.3   0.5  \n",
       "109  auc_score     0.2   0.5  "
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the QA datasets and drop rows whose `correctness` value contains 'to' or 'from'.\n",
    "opt_df = df[\n",
    "    df.dataset.isin(['TRIVIA','COQA','SQUAD'])\n",
    "    & ~df.correctness.str.contains('to', regex=False)\n",
    "    & ~df.correctness.str.contains('from', regex=False)\n",
    "]\n",
    "\n",
    "# Score every (correctness, threshold) setting. Only the group keys are needed, so iterate\n",
    "# them directly instead of binding the unused row indices to `_`.\n",
    "reslist = []\n",
    "for cor, thr in opt_df.groupby(['correctness', 'threshold']).groups:\n",
    "    reslist.append(seqlen_top_rank(opt_df, cor, thr, use_score='auc_score'))\n",
    "resdf = pd.DataFrame.from_records(reslist)\n",
    "\n",
    "# Rank settings by SEQLEN_CORRECT_TOP / TOTAL and show the 32 highest.\n",
    "resdf['frac'] = resdf['SEQLEN_CORRECT_TOP']/resdf['TOTAL']\n",
    "resdf.sort_values('frac', ascending=False).head(32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "04093471-f783-4294-bf5f-9aaa50156c4a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>predent</th>\n",
       "      <th>len_norm_predent</th>\n",
       "      <th>seqlen_sample</th>\n",
       "      <th>seqlen_correct</th>\n",
       "      <th>EigenScore</th>\n",
       "      <th>TOKEN_SAR</th>\n",
       "      <th>SENT_SAR</th>\n",
       "      <th>SAR</th>\n",
       "      <th>log_pplx</th>\n",
       "      <th>sement</th>\n",
       "      <th>min_logprob</th>\n",
       "      <th>len_norm_sement</th>\n",
       "      <th>ptrue_neg_log_prob</th>\n",
       "      <th>GNLL</th>\n",
       "      <th>pplx</th>\n",
       "      <th>total</th>\n",
       "      <th>COR</th>\n",
       "      <th>SCORE</th>\n",
       "      <th>THRESH</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>10</td>\n",
       "      <td>6</td>\n",
       "      <td>4.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2.0</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>11</td>\n",
       "      <td>5</td>\n",
       "      <td>7.0</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>2.0</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>4</td>\n",
       "      <td>6.0</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>2.0</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>4</td>\n",
       "      <td>5.0</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>2.0</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>6.0</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>2.0</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>5.0</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>203</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>11</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>204</th>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>11</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>205</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>206</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>207 rows × 19 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     predent  len_norm_predent  seqlen_sample  seqlen_correct  EigenScore  \\\n",
       "0          2                 3              0               4           1   \n",
       "1          2                 1              0               4           1   \n",
       "2          2                 1              0               5           1   \n",
       "3          1                 0              0               5           1   \n",
       "4          3                 1              0               5           1   \n",
       "..       ...               ...            ...             ...         ...   \n",
       "202        2                 0              0               8           2   \n",
       "203        3                 0              0               8           3   \n",
       "204        4                 2              0               8           2   \n",
       "205        5                 0              0               9           2   \n",
       "206        4                 0              1              10           0   \n",
       "\n",
       "     TOKEN_SAR  SENT_SAR  SAR  log_pplx  sement  min_logprob  len_norm_sement  \\\n",
       "0            2        10    6       4.0       3            2                3   \n",
       "1            0        11    5       7.0       3            3                2   \n",
       "2            0         9    4       6.0       2            6                3   \n",
       "3            0         8    4       5.0       3            5                3   \n",
       "4            0         6    4       6.0       1            4                3   \n",
       "..         ...       ...  ...       ...     ...          ...              ...   \n",
       "202          0         7    1       5.0       3            7                2   \n",
       "203          0         5    0       4.0       3            6                1   \n",
       "204          0         3    1       4.0       3            6                1   \n",
       "205          0         2    1       3.0       1            5                2   \n",
       "206          0         1    0       1.0       0            6                0   \n",
       "\n",
       "     ptrue_neg_log_prob  GNLL  pplx  total        COR      SCORE  THRESH  \n",
       "0                     2     4   2.0     16       bleu  auc_score     0.1  \n",
       "1                     2     5   2.0     16       bleu  auc_score     0.2  \n",
       "2                     1     6   2.0     16       bleu  auc_score     0.3  \n",
       "3                     2     6   2.0     16       bleu  auc_score     0.4  \n",
       "4                     1     8   2.0     16       bleu  auc_score     0.5  \n",
       "..                  ...   ...   ...    ...        ...        ...     ...  \n",
       "202                   1    10   0.0     16  rougeLsum  auc_score     0.5  \n",
       "203                   1    11   0.0     16  rougeLsum  auc_score     0.6  \n",
       "204                   0    11   0.0     16  rougeLsum  auc_score     0.7  \n",
       "205                   0    12   0.0     16  rougeLsum  auc_score     0.8  \n",
       "206                   0    10   0.0     16  rougeLsum  auc_score     0.9  \n",
       "\n",
       "[207 rows x 19 columns]"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the QA datasets and drop rows whose `correctness` value contains 'to' or 'from'.\n",
    "opt_df = df[\n",
    "    df.dataset.isin(['TRIVIA','COQA','SQUAD'])\n",
    "    & ~df.correctness.str.contains('to', regex=False)\n",
    "    & ~df.correctness.str.contains('from', regex=False)\n",
    "]\n",
    "\n",
    "# Collect one any_method_top_rank record (AUC score, top_rank=3) per (correctness, threshold)\n",
    "# setting. Only the group keys are needed, so iterate them directly.\n",
    "reslist = []\n",
    "for cor, thr in opt_df.groupby(['correctness', 'threshold']).groups:\n",
    "    reslist.append(any_method_top_rank(opt_df, cor, thr, use_score='auc_score', top_rank=3))\n",
    "resdf = pd.DataFrame.from_records(reslist)\n",
    "resdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "26da42e7-1776-4fe1-a049-11a070b5f0ee",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2518232/1713339829.py:5: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  _resdf[c] = _resdf[c]/_resdf['total']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>predent</th>\n",
       "      <th>len_norm_predent</th>\n",
       "      <th>seqlen_sample</th>\n",
       "      <th>seqlen_correct</th>\n",
       "      <th>EigenScore</th>\n",
       "      <th>TOKEN_SAR</th>\n",
       "      <th>SENT_SAR</th>\n",
       "      <th>SAR</th>\n",
       "      <th>log_pplx</th>\n",
       "      <th>sement</th>\n",
       "      <th>min_logprob</th>\n",
       "      <th>len_norm_sement</th>\n",
       "      <th>ptrue_neg_log_prob</th>\n",
       "      <th>GNLL</th>\n",
       "      <th>pplx</th>\n",
       "      <th>total</th>\n",
       "      <th>COR</th>\n",
       "      <th>SCORE</th>\n",
       "      <th>THRESH</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>164</th>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>ood_label</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>166</th>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>ood_label</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>155</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.857143</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.571429</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>7</td>\n",
       "      <td>j_llama8b_qa_1_1.</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16.0_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>121</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16.0_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>128</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>5</td>\n",
       "      <td>j_llama8b_qa_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>130</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>5</td>\n",
       "      <td>j_llama8b_qa_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>137</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_qa_16_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>139</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_qa_16_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>146</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>148</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>157</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.857143</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.571429</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>7</td>\n",
       "      <td>j_llama8b_qa_1_1.</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>173</th>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rouge1</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>175</th>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rouge1</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182</th>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>rouge2</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>184</th>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>16</td>\n",
       "      <td>rouge2</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>191</th>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeL</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>193</th>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeL</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>200</th>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>56</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16.0_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu_adapt</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.687500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu_adapt</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bma_judge_w8</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bma_judge_w8</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16.0_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>5</td>\n",
       "      <td>j_llama70b_qa_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>5</td>\n",
       "      <td>j_llama70b_qa_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_qa_16_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_qa_16_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.222222</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>9</td>\n",
       "      <td>j_llama70b_qa_1_1.</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.222222</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>9</td>\n",
       "      <td>j_llama70b_qa_1_1.</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202</th>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      predent  len_norm_predent  seqlen_sample  seqlen_correct  EigenScore  \\\n",
       "164  0.333333          0.666667       0.333333        0.333333    0.000000   \n",
       "166  0.333333          0.666667       0.333333        0.333333    0.000000   \n",
       "20   0.000000          0.000000       0.250000        0.312500    0.125000   \n",
       "22   0.000000          0.000000       0.250000        0.312500    0.125000   \n",
       "155  0.000000          0.000000       0.285714        0.142857    0.142857   \n",
       "119  0.000000          0.000000       0.000000        0.000000    0.000000   \n",
       "121  0.000000          0.000000       0.000000        0.000000    0.000000   \n",
       "128  0.000000          0.000000       0.000000        0.200000    0.200000   \n",
       "130  0.000000          0.000000       0.000000        0.200000    0.200000   \n",
       "137  0.000000          0.000000       0.000000        0.000000    1.000000   \n",
       "139  0.000000          0.000000       0.000000        0.000000    1.000000   \n",
       "146  0.000000          0.000000       0.500000        1.000000    0.000000   \n",
       "148  0.000000          0.000000       0.500000        1.000000    0.000000   \n",
       "2    0.125000          0.062500       0.000000        0.312500    0.062500   \n",
       "157  0.000000          0.000000       0.285714        0.142857    0.142857   \n",
       "110  0.000000          0.000000       1.000000        1.000000    0.000000   \n",
       "173  0.187500          0.062500       0.000000        0.562500    0.250000   \n",
       "175  0.250000          0.062500       0.000000        0.500000    0.125000   \n",
       "182  0.125000          0.000000       0.000000        0.312500    0.125000   \n",
       "184  0.125000          0.000000       0.000000        0.312500    0.187500   \n",
       "191  0.187500          0.062500       0.000000        0.562500    0.250000   \n",
       "193  0.125000          0.000000       0.000000        0.500000    0.125000   \n",
       "200  0.187500          0.125000       0.000000        0.562500    0.187500   \n",
       "112  0.000000          0.000000       1.000000        1.000000    0.000000   \n",
       "103  0.000000          0.000000       0.000000        1.000000    0.000000   \n",
       "4    0.187500          0.062500       0.000000        0.312500    0.062500   \n",
       "56   0.000000          0.000000       0.000000        0.000000    0.500000   \n",
       "11   0.187500          0.062500       0.000000        0.500000    0.125000   \n",
       "13   0.187500          0.062500       0.062500        0.500000    0.125000   \n",
       "29   0.000000          0.000000       0.312500        0.375000    0.000000   \n",
       "31   0.000000          0.000000       0.312500        0.375000    0.062500   \n",
       "38   0.000000          0.000000       0.000000        1.000000    0.000000   \n",
       "40   0.000000          0.000000       0.000000        1.000000    0.000000   \n",
       "47   0.000000          0.000000       1.000000        1.000000    0.000000   \n",
       "49   0.000000          0.000000       1.000000        1.000000    0.000000   \n",
       "58   0.000000          0.000000       0.000000        0.000000    0.500000   \n",
       "101  0.000000          0.000000       0.000000        1.000000    0.000000   \n",
       "65   0.000000          0.000000       0.000000        0.200000    0.200000   \n",
       "67   0.000000          0.000000       0.000000        0.200000    0.200000   \n",
       "74   0.000000          0.000000       0.000000        0.000000    0.000000   \n",
       "76   0.000000          0.000000       0.000000        0.000000    0.000000   \n",
       "83   0.000000          0.000000       0.500000        1.000000    0.000000   \n",
       "85   0.000000          0.000000       0.500000        1.000000    0.000000   \n",
       "92   0.000000          0.000000       0.111111        0.111111    0.111111   \n",
       "94   0.000000          0.000000       0.111111        0.111111    0.111111   \n",
       "202  0.125000          0.000000       0.000000        0.500000    0.125000   \n",
       "\n",
       "     TOKEN_SAR  SENT_SAR       SAR  log_pplx    sement  min_logprob  \\\n",
       "164   0.333333  0.000000  0.333333  0.000000  0.333333     0.000000   \n",
       "166   0.333333  0.000000  0.333333  0.000000  0.333333     0.000000   \n",
       "20    0.062500  0.437500  0.125000  0.125000  0.125000     0.125000   \n",
       "22    0.062500  0.437500  0.125000  0.125000  0.125000     0.125000   \n",
       "155   0.000000  0.142857  0.000000  0.285714  0.285714     0.142857   \n",
       "119   0.000000  1.000000  0.000000  0.500000  0.000000     0.500000   \n",
       "121   0.000000  1.000000  0.000000  0.500000  0.000000     0.500000   \n",
       "128   0.000000  0.400000  0.000000  0.000000  0.200000     0.400000   \n",
       "130   0.000000  0.400000  0.000000  0.000000  0.200000     0.400000   \n",
       "137   0.000000  0.000000  0.000000       NaN  0.000000     0.000000   \n",
       "139   0.000000  0.000000  0.000000       NaN  0.000000     0.000000   \n",
       "146   0.000000  0.500000  0.500000  0.000000  0.000000     0.000000   \n",
       "148   0.000000  0.500000  0.500000  0.000000  0.000000     0.000000   \n",
       "2     0.000000  0.562500  0.250000  0.375000  0.125000     0.375000   \n",
       "157   0.000000  0.142857  0.000000  0.285714  0.285714     0.142857   \n",
       "110   0.000000  0.000000  0.500000  0.000000  0.000000     0.000000   \n",
       "173   0.000000  0.375000  0.062500  0.125000  0.062500     0.375000   \n",
       "175   0.000000  0.375000  0.062500  0.250000  0.125000     0.437500   \n",
       "182   0.000000  0.625000  0.125000  0.312500  0.187500     0.312500   \n",
       "184   0.000000  0.562500  0.062500  0.250000  0.187500     0.250000   \n",
       "191   0.000000  0.375000  0.062500  0.125000  0.125000     0.375000   \n",
       "193   0.000000  0.437500  0.062500  0.312500  0.187500     0.375000   \n",
       "200   0.000000  0.375000  0.062500  0.125000  0.125000     0.375000   \n",
       "112   0.000000  0.000000  0.500000  0.000000  0.000000     0.000000   \n",
       "103   0.000000  0.500000  0.000000  0.000000  0.500000     0.000000   \n",
       "4     0.000000  0.375000  0.250000  0.375000  0.062500     0.250000   \n",
       "56    0.000000  0.000000  0.000000  0.000000  1.000000     0.000000   \n",
       "11    0.000000  0.312500  0.062500  0.187500  0.125000     0.500000   \n",
       "13    0.000000  0.187500  0.062500  0.250000  0.062500     0.437500   \n",
       "29    0.000000  0.437500  0.187500  0.125000  0.187500     0.125000   \n",
       "31    0.000000  0.500000  0.062500  0.125000  0.187500     0.125000   \n",
       "38    0.000000  0.500000  0.000000  0.000000  0.000000     0.500000   \n",
       "40    0.000000  0.500000  0.000000  0.000000  0.000000     0.500000   \n",
       "47    0.000000  0.000000  0.500000  0.000000  0.000000     0.000000   \n",
       "49    0.000000  0.000000  0.500000  0.000000  0.000000     0.000000   \n",
       "58    0.000000  0.000000  0.000000  0.000000  1.000000     0.000000   \n",
       "101   0.000000  0.500000  0.000000  0.000000  0.500000     0.000000   \n",
       "65    0.000000  0.400000  0.000000  0.000000  0.400000     0.200000   \n",
       "67    0.000000  0.400000  0.000000  0.000000  0.400000     0.200000   \n",
       "74    0.000000  1.000000  0.000000       NaN  0.000000     0.000000   \n",
       "76    0.000000  1.000000  0.000000       NaN  0.000000     0.000000   \n",
       "83    0.000000  0.500000  0.000000  0.000000  0.000000     0.000000   \n",
       "85    0.000000  0.500000  0.000000  0.000000  0.000000     0.000000   \n",
       "92    0.000000  0.555556  0.000000  0.444444  0.333333     0.000000   \n",
       "94    0.000000  0.555556  0.000000  0.444444  0.333333     0.000000   \n",
       "202   0.000000  0.437500  0.062500  0.312500  0.187500     0.437500   \n",
       "\n",
       "     len_norm_sement  ptrue_neg_log_prob      GNLL      pplx  total  \\\n",
       "164         0.000000            0.333333  0.000000       NaN      3   \n",
       "166         0.000000            0.333333  0.000000       NaN      3   \n",
       "20          0.562500            0.250000  0.375000  0.125000     16   \n",
       "22          0.562500            0.250000  0.375000  0.125000     16   \n",
       "155         0.857143            0.142857  0.571429  0.000000      7   \n",
       "119         0.000000            0.500000  0.500000       NaN      2   \n",
       "121         0.000000            0.500000  0.500000       NaN      2   \n",
       "128         0.600000            0.400000  0.600000  0.000000      5   \n",
       "130         0.600000            0.400000  0.600000  0.000000      5   \n",
       "137         1.000000            0.000000  1.000000  0.000000      1   \n",
       "139         1.000000            0.000000  1.000000  0.000000      1   \n",
       "146         0.000000            0.000000  0.500000       NaN      2   \n",
       "148         0.000000            0.000000  0.500000       NaN      2   \n",
       "2           0.187500            0.062500  0.375000  0.125000     16   \n",
       "157         0.857143            0.142857  0.571429  0.000000      7   \n",
       "110         0.000000            0.000000  0.500000       NaN      2   \n",
       "173         0.125000            0.187500  0.625000  0.000000     16   \n",
       "175         0.125000            0.125000  0.562500  0.000000     16   \n",
       "182         0.187500            0.062500  0.500000  0.125000     16   \n",
       "184         0.125000            0.062500  0.625000  0.062500     16   \n",
       "191         0.125000            0.125000  0.625000  0.000000     16   \n",
       "193         0.125000            0.125000  0.625000  0.000000     16   \n",
       "200         0.125000            0.125000  0.625000  0.000000     16   \n",
       "112         0.000000            0.000000  0.500000       NaN      2   \n",
       "103         0.000000            0.500000  0.500000       NaN      2   \n",
       "4           0.187500            0.062500  0.500000  0.125000     16   \n",
       "56          1.000000            0.500000  0.000000       NaN      2   \n",
       "11          0.250000            0.062500  0.625000  0.000000     16   \n",
       "13          0.125000            0.062500  0.687500  0.000000     16   \n",
       "29          0.562500            0.375000  0.187500  0.125000     16   \n",
       "31          0.562500            0.375000  0.187500  0.125000     16   \n",
       "38          0.000000            0.500000  0.500000       NaN      2   \n",
       "40          0.000000            0.500000  0.500000       NaN      2   \n",
       "47          0.000000            0.000000  0.500000       NaN      2   \n",
       "49          0.000000            0.000000  0.500000       NaN      2   \n",
       "58          1.000000            0.500000  0.000000       NaN      2   \n",
       "101         0.000000            0.500000  0.500000       NaN      2   \n",
       "65          0.800000            0.400000  0.200000  0.200000      5   \n",
       "67          0.800000            0.400000  0.200000  0.200000      5   \n",
       "74          1.000000            0.000000  0.000000  1.000000      1   \n",
       "76          1.000000            0.000000  0.000000  1.000000      1   \n",
       "83          0.000000            0.000000  1.000000       NaN      2   \n",
       "85          0.000000            0.000000  1.000000       NaN      2   \n",
       "92          0.666667            0.222222  0.333333  0.111111      9   \n",
       "94          0.666667            0.222222  0.333333  0.111111      9   \n",
       "202         0.125000            0.062500  0.625000  0.000000     16   \n",
       "\n",
       "                         COR      SCORE  THRESH  \n",
       "164                ood_label  auc_score     0.3  \n",
       "166                ood_label  auc_score     0.5  \n",
       "20                 bma_judge  auc_score     0.3  \n",
       "22                 bma_judge  auc_score     0.5  \n",
       "155        j_llama8b_qa_1_1.  auc_score     0.3  \n",
       "119   j_llama8b_qa_16.0_0.49  auc_score     0.3  \n",
       "121   j_llama8b_qa_16.0_0.49  auc_score     0.5  \n",
       "128    j_llama8b_qa_16.0_0.5  auc_score     0.3  \n",
       "130    j_llama8b_qa_16.0_0.5  auc_score     0.5  \n",
       "137     j_llama8b_qa_16_0.49  auc_score     0.3  \n",
       "139     j_llama8b_qa_16_0.49  auc_score     0.5  \n",
       "146      j_llama8b_qa_16_0.5  auc_score     0.3  \n",
       "148      j_llama8b_qa_16_0.5  auc_score     0.5  \n",
       "2                       bleu  auc_score     0.3  \n",
       "157        j_llama8b_qa_1_1.  auc_score     0.5  \n",
       "110     j_llama8b_gen_16_0.5  auc_score     0.3  \n",
       "173                   rouge1  auc_score     0.3  \n",
       "175                   rouge1  auc_score     0.5  \n",
       "182                   rouge2  auc_score     0.3  \n",
       "184                   rouge2  auc_score     0.5  \n",
       "191                   rougeL  auc_score     0.3  \n",
       "193                   rougeL  auc_score     0.5  \n",
       "200                rougeLsum  auc_score     0.3  \n",
       "112     j_llama8b_gen_16_0.5  auc_score     0.5  \n",
       "103   j_llama8b_gen_16.0_0.5  auc_score     0.5  \n",
       "4                       bleu  auc_score     0.5  \n",
       "56   j_llama70b_qa_16.0_0.49  auc_score     0.3  \n",
       "11                bleu_adapt  auc_score     0.3  \n",
       "13                bleu_adapt  auc_score     0.5  \n",
       "29              bma_judge_w8  auc_score     0.3  \n",
       "31              bma_judge_w8  auc_score     0.5  \n",
       "38   j_llama70b_gen_16.0_0.5  auc_score     0.3  \n",
       "40   j_llama70b_gen_16.0_0.5  auc_score     0.5  \n",
       "47     j_llama70b_gen_16_0.5  auc_score     0.3  \n",
       "49     j_llama70b_gen_16_0.5  auc_score     0.5  \n",
       "58   j_llama70b_qa_16.0_0.49  auc_score     0.5  \n",
       "101   j_llama8b_gen_16.0_0.5  auc_score     0.3  \n",
       "65    j_llama70b_qa_16.0_0.5  auc_score     0.3  \n",
       "67    j_llama70b_qa_16.0_0.5  auc_score     0.5  \n",
       "74     j_llama70b_qa_16_0.49  auc_score     0.3  \n",
       "76     j_llama70b_qa_16_0.49  auc_score     0.5  \n",
       "83      j_llama70b_qa_16_0.5  auc_score     0.3  \n",
       "85      j_llama70b_qa_16_0.5  auc_score     0.5  \n",
       "92        j_llama70b_qa_1_1.  auc_score     0.3  \n",
       "94        j_llama70b_qa_1_1.  auc_score     0.5  \n",
       "202                rougeLsum  auc_score     0.5  "
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the rows whose THRESH is 0.3 or 0.5. The explicit .copy() makes\n",
    "# _resdf an independent frame, so the column assignments below cannot raise\n",
    "# SettingWithCopyWarning and never depend on whether the filter returned a view.\n",
    "_resdf = resdf[resdf['THRESH'].isin([0.3, 0.5])].copy()\n",
    "\n",
    "# Divide every column except the bookkeeping ones by `total`, so each value\n",
    "# becomes a fraction of that row's `total`.\n",
    "for c in _resdf.columns:\n",
    "    if c not in ['total', 'COR', 'SCORE', 'THRESH']:\n",
    "        _resdf[c] = _resdf[c] / _resdf['total']\n",
    "\n",
    "# Display the normalised table, highest TOKEN_SAR fraction first.\n",
    "_resdf.sort_values('TOKEN_SAR', ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "79f05f9e-b800-45b7-ae81-944d470a5616",
   "metadata": {},
   "outputs": [],
   "source": [
    "uncert_methods = ['predent', 'len_norm_predent', 'seqlen_sample', 'seqlen_correct', 'EigenScore', 'TOKEN_SAR', 'SENT_SAR', 'SAR', 'log_pplx', 'sement', 'min_logprob', 'len_norm_sement', 'ptrue_neg_log_prob', 'GNLL', 'pplx']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "cff8cc49-6cec-48a0-a9be-1064bb02ee1d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>predent</th>\n",
       "      <th>len_norm_predent</th>\n",
       "      <th>seqlen_sample</th>\n",
       "      <th>seqlen_correct</th>\n",
       "      <th>EigenScore</th>\n",
       "      <th>TOKEN_SAR</th>\n",
       "      <th>SENT_SAR</th>\n",
       "      <th>SAR</th>\n",
       "      <th>log_pplx</th>\n",
       "      <th>sement</th>\n",
       "      <th>min_logprob</th>\n",
       "      <th>len_norm_sement</th>\n",
       "      <th>ptrue_neg_log_prob</th>\n",
       "      <th>GNLL</th>\n",
       "      <th>pplx</th>\n",
       "      <th>total</th>\n",
       "      <th>COR</th>\n",
       "      <th>SCORE</th>\n",
       "      <th>THRESH</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu_adapt</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.687500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu_adapt</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bma_judge_w8</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bma_judge_w8</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>56</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16.0_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16.0_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>5</td>\n",
       "      <td>j_llama70b_qa_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>5</td>\n",
       "      <td>j_llama70b_qa_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_qa_16_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama70b_qa_16_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama70b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.222222</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>9</td>\n",
       "      <td>j_llama70b_qa_1_1.</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.222222</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>9</td>\n",
       "      <td>j_llama70b_qa_1_1.</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_gen_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16.0_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>121</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16.0_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>128</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>5</td>\n",
       "      <td>j_llama8b_qa_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>130</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>5</td>\n",
       "      <td>j_llama8b_qa_16.0_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>137</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_qa_16_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>139</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>j_llama8b_qa_16_0.49</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>146</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>148</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>j_llama8b_qa_16_0.5</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>155</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.857143</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.571429</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>7</td>\n",
       "      <td>j_llama8b_qa_1_1.</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>157</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.857143</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.571429</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>7</td>\n",
       "      <td>j_llama8b_qa_1_1.</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>164</th>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>ood_label</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>166</th>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>ood_label</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>173</th>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rouge1</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>175</th>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rouge1</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182</th>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>rouge2</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>184</th>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>16</td>\n",
       "      <td>rouge2</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>191</th>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeL</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>193</th>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeL</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>200</th>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202</th>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      predent  len_norm_predent  seqlen_sample  seqlen_correct  EigenScore  \\\n",
       "2    0.125000          0.062500       0.000000        0.312500    0.062500   \n",
       "4    0.187500          0.062500       0.000000        0.312500    0.062500   \n",
       "11   0.187500          0.062500       0.000000        0.500000    0.125000   \n",
       "13   0.187500          0.062500       0.062500        0.500000    0.125000   \n",
       "20   0.000000          0.000000       0.250000        0.312500    0.125000   \n",
       "22   0.000000          0.000000       0.250000        0.312500    0.125000   \n",
       "29   0.000000          0.000000       0.312500        0.375000    0.000000   \n",
       "31   0.000000          0.000000       0.312500        0.375000    0.062500   \n",
       "38   0.000000          0.000000       0.000000        1.000000    0.000000   \n",
       "40   0.000000          0.000000       0.000000        1.000000    0.000000   \n",
       "47   0.000000          0.000000       1.000000        1.000000    0.000000   \n",
       "49   0.000000          0.000000       1.000000        1.000000    0.000000   \n",
       "56   0.000000          0.000000       0.000000        0.000000    0.500000   \n",
       "58   0.000000          0.000000       0.000000        0.000000    0.500000   \n",
       "65   0.000000          0.000000       0.000000        0.200000    0.200000   \n",
       "67   0.000000          0.000000       0.000000        0.200000    0.200000   \n",
       "74   0.000000          0.000000       0.000000        0.000000    0.000000   \n",
       "76   0.000000          0.000000       0.000000        0.000000    0.000000   \n",
       "83   0.000000          0.000000       0.500000        1.000000    0.000000   \n",
       "85   0.000000          0.000000       0.500000        1.000000    0.000000   \n",
       "92   0.000000          0.000000       0.111111        0.111111    0.111111   \n",
       "94   0.000000          0.000000       0.111111        0.111111    0.111111   \n",
       "101  0.000000          0.000000       0.000000        1.000000    0.000000   \n",
       "103  0.000000          0.000000       0.000000        1.000000    0.000000   \n",
       "110  0.000000          0.000000       1.000000        1.000000    0.000000   \n",
       "112  0.000000          0.000000       1.000000        1.000000    0.000000   \n",
       "119  0.000000          0.000000       0.000000        0.000000    0.000000   \n",
       "121  0.000000          0.000000       0.000000        0.000000    0.000000   \n",
       "128  0.000000          0.000000       0.000000        0.200000    0.200000   \n",
       "130  0.000000          0.000000       0.000000        0.200000    0.200000   \n",
       "137  0.000000          0.000000       0.000000        0.000000    1.000000   \n",
       "139  0.000000          0.000000       0.000000        0.000000    1.000000   \n",
       "146  0.000000          0.000000       0.500000        1.000000    0.000000   \n",
       "148  0.000000          0.000000       0.500000        1.000000    0.000000   \n",
       "155  0.000000          0.000000       0.285714        0.142857    0.142857   \n",
       "157  0.000000          0.000000       0.285714        0.142857    0.142857   \n",
       "164  0.333333          0.666667       0.333333        0.333333    0.000000   \n",
       "166  0.333333          0.666667       0.333333        0.333333    0.000000   \n",
       "173  0.187500          0.062500       0.000000        0.562500    0.250000   \n",
       "175  0.250000          0.062500       0.000000        0.500000    0.125000   \n",
       "182  0.125000          0.000000       0.000000        0.312500    0.125000   \n",
       "184  0.125000          0.000000       0.000000        0.312500    0.187500   \n",
       "191  0.187500          0.062500       0.000000        0.562500    0.250000   \n",
       "193  0.125000          0.000000       0.000000        0.500000    0.125000   \n",
       "200  0.187500          0.125000       0.000000        0.562500    0.187500   \n",
       "202  0.125000          0.000000       0.000000        0.500000    0.125000   \n",
       "\n",
       "     TOKEN_SAR  SENT_SAR       SAR  log_pplx    sement  min_logprob  \\\n",
       "2     0.000000  0.562500  0.250000  0.375000  0.125000     0.375000   \n",
       "4     0.000000  0.375000  0.250000  0.375000  0.062500     0.250000   \n",
       "11    0.000000  0.312500  0.062500  0.187500  0.125000     0.500000   \n",
       "13    0.000000  0.187500  0.062500  0.250000  0.062500     0.437500   \n",
       "20    0.062500  0.437500  0.125000  0.125000  0.125000     0.125000   \n",
       "22    0.062500  0.437500  0.125000  0.125000  0.125000     0.125000   \n",
       "29    0.000000  0.437500  0.187500  0.125000  0.187500     0.125000   \n",
       "31    0.000000  0.500000  0.062500  0.125000  0.187500     0.125000   \n",
       "38    0.000000  0.500000  0.000000  0.000000  0.000000     0.500000   \n",
       "40    0.000000  0.500000  0.000000  0.000000  0.000000     0.500000   \n",
       "47    0.000000  0.000000  0.500000  0.000000  0.000000     0.000000   \n",
       "49    0.000000  0.000000  0.500000  0.000000  0.000000     0.000000   \n",
       "56    0.000000  0.000000  0.000000  0.000000  1.000000     0.000000   \n",
       "58    0.000000  0.000000  0.000000  0.000000  1.000000     0.000000   \n",
       "65    0.000000  0.400000  0.000000  0.000000  0.400000     0.200000   \n",
       "67    0.000000  0.400000  0.000000  0.000000  0.400000     0.200000   \n",
       "74    0.000000  1.000000  0.000000       NaN  0.000000     0.000000   \n",
       "76    0.000000  1.000000  0.000000       NaN  0.000000     0.000000   \n",
       "83    0.000000  0.500000  0.000000  0.000000  0.000000     0.000000   \n",
       "85    0.000000  0.500000  0.000000  0.000000  0.000000     0.000000   \n",
       "92    0.000000  0.555556  0.000000  0.444444  0.333333     0.000000   \n",
       "94    0.000000  0.555556  0.000000  0.444444  0.333333     0.000000   \n",
       "101   0.000000  0.500000  0.000000  0.000000  0.500000     0.000000   \n",
       "103   0.000000  0.500000  0.000000  0.000000  0.500000     0.000000   \n",
       "110   0.000000  0.000000  0.500000  0.000000  0.000000     0.000000   \n",
       "112   0.000000  0.000000  0.500000  0.000000  0.000000     0.000000   \n",
       "119   0.000000  1.000000  0.000000  0.500000  0.000000     0.500000   \n",
       "121   0.000000  1.000000  0.000000  0.500000  0.000000     0.500000   \n",
       "128   0.000000  0.400000  0.000000  0.000000  0.200000     0.400000   \n",
       "130   0.000000  0.400000  0.000000  0.000000  0.200000     0.400000   \n",
       "137   0.000000  0.000000  0.000000       NaN  0.000000     0.000000   \n",
       "139   0.000000  0.000000  0.000000       NaN  0.000000     0.000000   \n",
       "146   0.000000  0.500000  0.500000  0.000000  0.000000     0.000000   \n",
       "148   0.000000  0.500000  0.500000  0.000000  0.000000     0.000000   \n",
       "155   0.000000  0.142857  0.000000  0.285714  0.285714     0.142857   \n",
       "157   0.000000  0.142857  0.000000  0.285714  0.285714     0.142857   \n",
       "164   0.333333  0.000000  0.333333  0.000000  0.333333     0.000000   \n",
       "166   0.333333  0.000000  0.333333  0.000000  0.333333     0.000000   \n",
       "173   0.000000  0.375000  0.062500  0.125000  0.062500     0.375000   \n",
       "175   0.000000  0.375000  0.062500  0.250000  0.125000     0.437500   \n",
       "182   0.000000  0.625000  0.125000  0.312500  0.187500     0.312500   \n",
       "184   0.000000  0.562500  0.062500  0.250000  0.187500     0.250000   \n",
       "191   0.000000  0.375000  0.062500  0.125000  0.125000     0.375000   \n",
       "193   0.000000  0.437500  0.062500  0.312500  0.187500     0.375000   \n",
       "200   0.000000  0.375000  0.062500  0.125000  0.125000     0.375000   \n",
       "202   0.000000  0.437500  0.062500  0.312500  0.187500     0.437500   \n",
       "\n",
       "     len_norm_sement  ptrue_neg_log_prob      GNLL      pplx  total  \\\n",
       "2           0.187500            0.062500  0.375000  0.125000     16   \n",
       "4           0.187500            0.062500  0.500000  0.125000     16   \n",
       "11          0.250000            0.062500  0.625000  0.000000     16   \n",
       "13          0.125000            0.062500  0.687500  0.000000     16   \n",
       "20          0.562500            0.250000  0.375000  0.125000     16   \n",
       "22          0.562500            0.250000  0.375000  0.125000     16   \n",
       "29          0.562500            0.375000  0.187500  0.125000     16   \n",
       "31          0.562500            0.375000  0.187500  0.125000     16   \n",
       "38          0.000000            0.500000  0.500000       NaN      2   \n",
       "40          0.000000            0.500000  0.500000       NaN      2   \n",
       "47          0.000000            0.000000  0.500000       NaN      2   \n",
       "49          0.000000            0.000000  0.500000       NaN      2   \n",
       "56          1.000000            0.500000  0.000000       NaN      2   \n",
       "58          1.000000            0.500000  0.000000       NaN      2   \n",
       "65          0.800000            0.400000  0.200000  0.200000      5   \n",
       "67          0.800000            0.400000  0.200000  0.200000      5   \n",
       "74          1.000000            0.000000  0.000000  1.000000      1   \n",
       "76          1.000000            0.000000  0.000000  1.000000      1   \n",
       "83          0.000000            0.000000  1.000000       NaN      2   \n",
       "85          0.000000            0.000000  1.000000       NaN      2   \n",
       "92          0.666667            0.222222  0.333333  0.111111      9   \n",
       "94          0.666667            0.222222  0.333333  0.111111      9   \n",
       "101         0.000000            0.500000  0.500000       NaN      2   \n",
       "103         0.000000            0.500000  0.500000       NaN      2   \n",
       "110         0.000000            0.000000  0.500000       NaN      2   \n",
       "112         0.000000            0.000000  0.500000       NaN      2   \n",
       "119         0.000000            0.500000  0.500000       NaN      2   \n",
       "121         0.000000            0.500000  0.500000       NaN      2   \n",
       "128         0.600000            0.400000  0.600000  0.000000      5   \n",
       "130         0.600000            0.400000  0.600000  0.000000      5   \n",
       "137         1.000000            0.000000  1.000000  0.000000      1   \n",
       "139         1.000000            0.000000  1.000000  0.000000      1   \n",
       "146         0.000000            0.000000  0.500000       NaN      2   \n",
       "148         0.000000            0.000000  0.500000       NaN      2   \n",
       "155         0.857143            0.142857  0.571429  0.000000      7   \n",
       "157         0.857143            0.142857  0.571429  0.000000      7   \n",
       "164         0.000000            0.333333  0.000000       NaN      3   \n",
       "166         0.000000            0.333333  0.000000       NaN      3   \n",
       "173         0.125000            0.187500  0.625000  0.000000     16   \n",
       "175         0.125000            0.125000  0.562500  0.000000     16   \n",
       "182         0.187500            0.062500  0.500000  0.125000     16   \n",
       "184         0.125000            0.062500  0.625000  0.062500     16   \n",
       "191         0.125000            0.125000  0.625000  0.000000     16   \n",
       "193         0.125000            0.125000  0.625000  0.000000     16   \n",
       "200         0.125000            0.125000  0.625000  0.000000     16   \n",
       "202         0.125000            0.062500  0.625000  0.000000     16   \n",
       "\n",
       "                         COR      SCORE  THRESH  \n",
       "2                       bleu  auc_score     0.3  \n",
       "4                       bleu  auc_score     0.5  \n",
       "11                bleu_adapt  auc_score     0.3  \n",
       "13                bleu_adapt  auc_score     0.5  \n",
       "20                 bma_judge  auc_score     0.3  \n",
       "22                 bma_judge  auc_score     0.5  \n",
       "29              bma_judge_w8  auc_score     0.3  \n",
       "31              bma_judge_w8  auc_score     0.5  \n",
       "38   j_llama70b_gen_16.0_0.5  auc_score     0.3  \n",
       "40   j_llama70b_gen_16.0_0.5  auc_score     0.5  \n",
       "47     j_llama70b_gen_16_0.5  auc_score     0.3  \n",
       "49     j_llama70b_gen_16_0.5  auc_score     0.5  \n",
       "56   j_llama70b_qa_16.0_0.49  auc_score     0.3  \n",
       "58   j_llama70b_qa_16.0_0.49  auc_score     0.5  \n",
       "65    j_llama70b_qa_16.0_0.5  auc_score     0.3  \n",
       "67    j_llama70b_qa_16.0_0.5  auc_score     0.5  \n",
       "74     j_llama70b_qa_16_0.49  auc_score     0.3  \n",
       "76     j_llama70b_qa_16_0.49  auc_score     0.5  \n",
       "83      j_llama70b_qa_16_0.5  auc_score     0.3  \n",
       "85      j_llama70b_qa_16_0.5  auc_score     0.5  \n",
       "92        j_llama70b_qa_1_1.  auc_score     0.3  \n",
       "94        j_llama70b_qa_1_1.  auc_score     0.5  \n",
       "101   j_llama8b_gen_16.0_0.5  auc_score     0.3  \n",
       "103   j_llama8b_gen_16.0_0.5  auc_score     0.5  \n",
       "110     j_llama8b_gen_16_0.5  auc_score     0.3  \n",
       "112     j_llama8b_gen_16_0.5  auc_score     0.5  \n",
       "119   j_llama8b_qa_16.0_0.49  auc_score     0.3  \n",
       "121   j_llama8b_qa_16.0_0.49  auc_score     0.5  \n",
       "128    j_llama8b_qa_16.0_0.5  auc_score     0.3  \n",
       "130    j_llama8b_qa_16.0_0.5  auc_score     0.5  \n",
       "137     j_llama8b_qa_16_0.49  auc_score     0.3  \n",
       "139     j_llama8b_qa_16_0.49  auc_score     0.5  \n",
       "146      j_llama8b_qa_16_0.5  auc_score     0.3  \n",
       "148      j_llama8b_qa_16_0.5  auc_score     0.5  \n",
       "155        j_llama8b_qa_1_1.  auc_score     0.3  \n",
       "157        j_llama8b_qa_1_1.  auc_score     0.5  \n",
       "164                ood_label  auc_score     0.3  \n",
       "166                ood_label  auc_score     0.5  \n",
       "173                   rouge1  auc_score     0.3  \n",
       "175                   rouge1  auc_score     0.5  \n",
       "182                   rouge2  auc_score     0.3  \n",
       "184                   rouge2  auc_score     0.5  \n",
       "191                   rougeL  auc_score     0.3  \n",
       "193                   rougeL  auc_score     0.5  \n",
       "200                rougeLsum  auc_score     0.3  \n",
       "202                rougeLsum  auc_score     0.5  "
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "_resdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "ab3f8ec6-f333-41fc-a0a5-cdb7af02ca48",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>predent</th>\n",
       "      <th>len_norm_predent</th>\n",
       "      <th>seqlen_sample</th>\n",
       "      <th>seqlen_correct</th>\n",
       "      <th>EigenScore</th>\n",
       "      <th>TOKEN_SAR</th>\n",
       "      <th>SENT_SAR</th>\n",
       "      <th>SAR</th>\n",
       "      <th>log_pplx</th>\n",
       "      <th>sement</th>\n",
       "      <th>min_logprob</th>\n",
       "      <th>len_norm_sement</th>\n",
       "      <th>ptrue_neg_log_prob</th>\n",
       "      <th>GNLL</th>\n",
       "      <th>pplx</th>\n",
       "      <th>total</th>\n",
       "      <th>COR</th>\n",
       "      <th>SCORE</th>\n",
       "      <th>THRESH</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>0.1875</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.5000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu_adapt</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>0.1875</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.4375</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.687500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>bleu_adapt</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.1250</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.1250</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.1250</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.1250</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bma_judge</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.1875</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.1250</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bma_judge_w8</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.1250</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>16</td>\n",
       "      <td>bma_judge_w8</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.222222</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>9</td>\n",
       "      <td>j_llama70b_qa_1_1.</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94</th>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.222222</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>9</td>\n",
       "      <td>j_llama70b_qa_1_1.</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>173</th>\n",
       "      <td>0.1875</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.3750</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rouge1</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>175</th>\n",
       "      <td>0.2500</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.4375</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rouge1</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>191</th>\n",
       "      <td>0.1875</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.3750</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeL</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>193</th>\n",
       "      <td>0.1250</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.3750</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeL</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>200</th>\n",
       "      <td>0.1875</td>\n",
       "      <td>0.1250</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.3750</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202</th>\n",
       "      <td>0.1250</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.187500</td>\n",
       "      <td>0.4375</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>16</td>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>auc_score</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     predent  len_norm_predent  seqlen_sample  seqlen_correct  EigenScore  \\\n",
       "11    0.1875            0.0625       0.000000        0.500000    0.125000   \n",
       "13    0.1875            0.0625       0.062500        0.500000    0.125000   \n",
       "20    0.0000            0.0000       0.250000        0.312500    0.125000   \n",
       "22    0.0000            0.0000       0.250000        0.312500    0.125000   \n",
       "29    0.0000            0.0000       0.312500        0.375000    0.000000   \n",
       "31    0.0000            0.0000       0.312500        0.375000    0.062500   \n",
       "92    0.0000            0.0000       0.111111        0.111111    0.111111   \n",
       "94    0.0000            0.0000       0.111111        0.111111    0.111111   \n",
       "173   0.1875            0.0625       0.000000        0.562500    0.250000   \n",
       "175   0.2500            0.0625       0.000000        0.500000    0.125000   \n",
       "191   0.1875            0.0625       0.000000        0.562500    0.250000   \n",
       "193   0.1250            0.0000       0.000000        0.500000    0.125000   \n",
       "200   0.1875            0.1250       0.000000        0.562500    0.187500   \n",
       "202   0.1250            0.0000       0.000000        0.500000    0.125000   \n",
       "\n",
       "     TOKEN_SAR  SENT_SAR     SAR  log_pplx    sement  min_logprob  \\\n",
       "11      0.0000  0.312500  0.0625  0.187500  0.125000       0.5000   \n",
       "13      0.0000  0.187500  0.0625  0.250000  0.062500       0.4375   \n",
       "20      0.0625  0.437500  0.1250  0.125000  0.125000       0.1250   \n",
       "22      0.0625  0.437500  0.1250  0.125000  0.125000       0.1250   \n",
       "29      0.0000  0.437500  0.1875  0.125000  0.187500       0.1250   \n",
       "31      0.0000  0.500000  0.0625  0.125000  0.187500       0.1250   \n",
       "92      0.0000  0.555556  0.0000  0.444444  0.333333       0.0000   \n",
       "94      0.0000  0.555556  0.0000  0.444444  0.333333       0.0000   \n",
       "173     0.0000  0.375000  0.0625  0.125000  0.062500       0.3750   \n",
       "175     0.0000  0.375000  0.0625  0.250000  0.125000       0.4375   \n",
       "191     0.0000  0.375000  0.0625  0.125000  0.125000       0.3750   \n",
       "193     0.0000  0.437500  0.0625  0.312500  0.187500       0.3750   \n",
       "200     0.0000  0.375000  0.0625  0.125000  0.125000       0.3750   \n",
       "202     0.0000  0.437500  0.0625  0.312500  0.187500       0.4375   \n",
       "\n",
       "     len_norm_sement  ptrue_neg_log_prob      GNLL      pplx  total  \\\n",
       "11          0.250000            0.062500  0.625000  0.000000     16   \n",
       "13          0.125000            0.062500  0.687500  0.000000     16   \n",
       "20          0.562500            0.250000  0.375000  0.125000     16   \n",
       "22          0.562500            0.250000  0.375000  0.125000     16   \n",
       "29          0.562500            0.375000  0.187500  0.125000     16   \n",
       "31          0.562500            0.375000  0.187500  0.125000     16   \n",
       "92          0.666667            0.222222  0.333333  0.111111      9   \n",
       "94          0.666667            0.222222  0.333333  0.111111      9   \n",
       "173         0.125000            0.187500  0.625000  0.000000     16   \n",
       "175         0.125000            0.125000  0.562500  0.000000     16   \n",
       "191         0.125000            0.125000  0.625000  0.000000     16   \n",
       "193         0.125000            0.125000  0.625000  0.000000     16   \n",
       "200         0.125000            0.125000  0.625000  0.000000     16   \n",
       "202         0.125000            0.062500  0.625000  0.000000     16   \n",
       "\n",
       "                    COR      SCORE  THRESH  \n",
       "11           bleu_adapt  auc_score     0.3  \n",
       "13           bleu_adapt  auc_score     0.5  \n",
       "20            bma_judge  auc_score     0.3  \n",
       "22            bma_judge  auc_score     0.5  \n",
       "29         bma_judge_w8  auc_score     0.3  \n",
       "31         bma_judge_w8  auc_score     0.5  \n",
       "92   j_llama70b_qa_1_1.  auc_score     0.3  \n",
       "94   j_llama70b_qa_1_1.  auc_score     0.5  \n",
       "173              rouge1  auc_score     0.3  \n",
       "175              rouge1  auc_score     0.5  \n",
       "191              rougeL  auc_score     0.3  \n",
       "193              rougeL  auc_score     0.5  \n",
       "200           rougeLsum  auc_score     0.3  \n",
       "202           rougeLsum  auc_score     0.5  "
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the current contents of `_resdf`.\n",
    "_resdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "68658f9c-b426-4c12-8f26-a4d9be3f999a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2518232/892723521.py:6: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  _resdf[c] = _resdf[c]/_resdf['total']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "predent                 0.1875\n",
       "len_norm_predent         0.125\n",
       "seqlen_sample           0.0625\n",
       "seqlen_correct            0.25\n",
       "EigenScore               0.125\n",
       "TOKEN_SAR                  0.0\n",
       "SENT_SAR              0.118056\n",
       "SAR                     0.0625\n",
       "log_pplx              0.319444\n",
       "sement                0.208333\n",
       "min_logprob             0.3125\n",
       "len_norm_sement       0.104167\n",
       "ptrue_neg_log_prob       0.125\n",
       "GNLL                      0.25\n",
       "pplx                       0.0\n",
       "dtype: object"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Alternative filter kept for reference (COR starting with 'j' or 'bma', names containing '8b' excluded):\n",
    "# _resdf = resdf[(resdf['THRESH'].isin([0.3,0.5]))&(resdf.total>=9)&(~resdf.COR.str.contains('8b'))&((resdf.COR.str.startswith('j'))|(resdf.COR.str.startswith('bma')))]\n",
    "#\n",
    "# Keep rows with THRESH 0.3 or 0.5 and total >= 9, minus the listed correctness metrics.\n",
    "# .copy() makes `_resdf` an independent frame, so the column assignments below do not\n",
    "# raise SettingWithCopyWarning and can never write through to `resdf`.\n",
    "_resdf = resdf[\n",
    "    resdf['THRESH'].isin([0.3, 0.5])\n",
    "    & (resdf.total >= 9)\n",
    "    & (~resdf.COR.isin(['bleu', 'rouge2', 'rouge1', 'ood_label', 'bleu_adapt']))\n",
    "].copy()\n",
    "# Divide every non-metadata column by that row's `total`.\n",
    "for c in _resdf.columns:\n",
    "    if c not in ['total', 'COR', 'SCORE', 'THRESH']:\n",
    "        _resdf[c] = _resdf[c] / _resdf['total']\n",
    "# Per-method gap: column-wise maximum over `_resdf` minus the first `bma_judge` row.\n",
    "_resdf.max(0)[uncert_methods] - _resdf[_resdf.COR=='bma_judge'].iloc[0][uncert_methods]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "f136980c-f3cf-4369-8faf-b6821708d8a9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(predent                  0.0\n",
       " len_norm_predent         0.0\n",
       " seqlen_sample           0.25\n",
       " seqlen_correct        0.3125\n",
       " EigenScore             0.125\n",
       " TOKEN_SAR             0.0625\n",
       " SENT_SAR              0.4375\n",
       " SAR                    0.125\n",
       " log_pplx               0.125\n",
       " sement                 0.125\n",
       " min_logprob            0.125\n",
       " len_norm_sement       0.5625\n",
       " ptrue_neg_log_prob      0.25\n",
       " GNLL                   0.375\n",
       " pplx                   0.125\n",
       " Name: 20, dtype: object,\n",
       " predent                 0.1875\n",
       " len_norm_predent         0.125\n",
       " seqlen_sample           0.3125\n",
       " seqlen_correct          0.5625\n",
       " EigenScore                0.25\n",
       " TOKEN_SAR               0.0625\n",
       " SENT_SAR              0.555556\n",
       " SAR                     0.1875\n",
       " log_pplx              0.444444\n",
       " sement                0.333333\n",
       " min_logprob             0.4375\n",
       " len_norm_sement       0.666667\n",
       " ptrue_neg_log_prob       0.375\n",
       " GNLL                     0.625\n",
       " pplx                     0.125\n",
       " dtype: object)"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Side by side: the first `bma_judge` row and the column-wise maximum of `_resdf`, both restricted to `uncert_methods`.\n",
    "_resdf[_resdf.COR=='bma_judge'].iloc[0][uncert_methods], _resdf.max(0)[uncert_methods]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "d077ab6d-3650-4589-aa39-13c29bf2a5d1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>COR</th>\n",
       "      <th>THRESH</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>bleu</td>\n",
       "      <td>0.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>bleu</td>\n",
       "      <td>0.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>bleu</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>bleu</td>\n",
       "      <td>0.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>bleu</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202</th>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>203</th>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>0.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>204</th>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>0.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>205</th>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>0.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>206</th>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>0.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>207 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           COR  THRESH\n",
       "0         bleu     0.1\n",
       "1         bleu     0.2\n",
       "2         bleu     0.3\n",
       "3         bleu     0.4\n",
       "4         bleu     0.5\n",
       "..         ...     ...\n",
       "202  rougeLsum     0.5\n",
       "203  rougeLsum     0.6\n",
       "204  rougeLsum     0.7\n",
       "205  rougeLsum     0.8\n",
       "206  rougeLsum     0.9\n",
       "\n",
       "[207 rows x 2 columns]"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the (COR, THRESH) pair of every row in `resdf`.\n",
    "resdf[['COR', 'THRESH']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "fb76b865-2ab5-4bac-9f75-f4a2434c2452",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>COR</th>\n",
       "      <th>THRESH</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>191</th>\n",
       "      <td>rougeL</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>200</th>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>bma_judge_w8</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>191</th>\n",
       "      <td>rougeL</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>191</th>\n",
       "      <td>rougeL</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>bma_judge</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>j_llama70b_qa_1_1.</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>bma_judge_w8</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>j_llama70b_qa_1_1.</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>j_llama70b_qa_1_1.</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202</th>\n",
       "      <td>rougeLsum</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>j_llama70b_qa_1_1.</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>bma_judge_w8</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>191</th>\n",
       "      <td>rougeL</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>bma_judge</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    COR  THRESH\n",
       "191              rougeL     0.3\n",
       "200           rougeLsum     0.3\n",
       "29         bma_judge_w8     0.3\n",
       "191              rougeL     0.3\n",
       "191              rougeL     0.3\n",
       "20            bma_judge     0.3\n",
       "92   j_llama70b_qa_1_1.     0.3\n",
       "29         bma_judge_w8     0.3\n",
       "92   j_llama70b_qa_1_1.     0.3\n",
       "92   j_llama70b_qa_1_1.     0.3\n",
       "202           rougeLsum     0.5\n",
       "92   j_llama70b_qa_1_1.     0.3\n",
       "29         bma_judge_w8     0.3\n",
       "191              rougeL     0.3\n",
       "20            bma_judge     0.3"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# For each method in `uncert_methods`, look up the (COR, THRESH) of the `_resdf` row where that method's column is maximal.\n",
    "resdf[['COR', 'THRESH']].loc[_resdf.idxmax(0)[uncert_methods].to_list()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "f183e4a3-2446-48a2-be24-5a098a865ea0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>20</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>predent</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.1875</td>\n",
       "      <td>0.1875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>len_norm_predent</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>seqlen_sample</th>\n",
       "      <td>0.25</td>\n",
       "      <td>0.3125</td>\n",
       "      <td>0.0625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>seqlen_correct</th>\n",
       "      <td>0.3125</td>\n",
       "      <td>0.5625</td>\n",
       "      <td>0.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>EigenScore</th>\n",
       "      <td>0.125</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TOKEN_SAR</th>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.0625</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SENT_SAR</th>\n",
       "      <td>0.4375</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0.118056</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SAR</th>\n",
       "      <td>0.125</td>\n",
       "      <td>0.1875</td>\n",
       "      <td>0.0625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>log_pplx</th>\n",
       "      <td>0.125</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.319444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sement</th>\n",
       "      <td>0.125</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.208333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min_logprob</th>\n",
       "      <td>0.125</td>\n",
       "      <td>0.4375</td>\n",
       "      <td>0.3125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>len_norm_sement</th>\n",
       "      <td>0.5625</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.104167</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ptrue_neg_log_prob</th>\n",
       "      <td>0.25</td>\n",
       "      <td>0.375</td>\n",
       "      <td>0.125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GNLL</th>\n",
       "      <td>0.375</td>\n",
       "      <td>0.625</td>\n",
       "      <td>0.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pplx</th>\n",
       "      <td>0.125</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        20        0         1 \n",
       "predent                0.0    0.1875    0.1875\n",
       "len_norm_predent       0.0     0.125     0.125\n",
       "seqlen_sample         0.25    0.3125    0.0625\n",
       "seqlen_correct      0.3125    0.5625      0.25\n",
       "EigenScore           0.125      0.25     0.125\n",
       "TOKEN_SAR           0.0625    0.0625       0.0\n",
       "SENT_SAR            0.4375  0.555556  0.118056\n",
       "SAR                  0.125    0.1875    0.0625\n",
       "log_pplx             0.125  0.444444  0.319444\n",
       "sement               0.125  0.333333  0.208333\n",
       "min_logprob          0.125    0.4375    0.3125\n",
       "len_norm_sement     0.5625  0.666667  0.104167\n",
       "ptrue_neg_log_prob    0.25     0.375     0.125\n",
       "GNLL                 0.375     0.625      0.25\n",
       "pplx                 0.125     0.125       0.0"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Baseline: per-method values taken from the first `bma_judge` row of `_resdf`.\n",
    "a = _resdf[_resdf.COR=='bma_judge'].iloc[0][uncert_methods]\n",
    "# Best: column-wise maximum over all rows of `_resdf`.\n",
    "b = _resdf.max(0)[uncert_methods]\n",
    "# Gap between best and baseline, per method.\n",
    "c = b - a\n",
    "\n",
    "# One row per method; columns in order: baseline, best, gap.\n",
    "full = pd.concat([a, b, c], axis=1)\n",
    "full"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "443be85b-8921-419a-b91d-250095e53b98",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Predent & 0.000 & 0.188 & +0.188 \\\\\n",
      "Predent (LN) & 0.000 & 0.125 & +0.125 \\\\\n",
      "Seq. Len. (sample) & 0.250 & 0.312 & +0.062 \\\\\n",
      "Seq. Len. (answer) & 0.312 & 0.562 & +0.250 \\\\\n",
      "EigenScore & 0.125 & 0.250 & +0.125 \\\\\n",
      "TOKEN SAR & 0.062 & 0.062 & +0.000 \\\\\n",
      "SENT SAR & 0.438 & 0.556 & +0.118 \\\\\n",
      "SAR & 0.125 & 0.188 & +0.062 \\\\\n",
      "Perplexity & 0.125 & 0.444 & +0.319 \\\\\n",
      "Semantic Ent. & 0.125 & 0.333 & +0.208 \\\\\n",
      "Min Tok. Log Prob. & 0.125 & 0.438 & +0.312 \\\\\n",
      "Semantic Ent. (LN) & 0.562 & 0.667 & +0.104 \\\\\n",
      "P(True) & 0.250 & 0.375 & +0.125 \\\\\n",
      "GNLL & 0.375 & 0.625 & +0.250 \\\\\n",
      " & 0.125 & 0.125 & +0.000 \\\\\n"
     ]
    }
   ],
   "source": [
    "# Display labels for the uncertainty methods, used as the first cell of each LaTeX table row.\n",
    "name_map = {\n",
    "    'predent': 'Predent',\n",
    "    'len_norm_predent': 'Predent (LN)',\n",
    "    'seqlen_sample': 'Seq. Len. (sample)',\n",
    "    'seqlen_correct': 'Seq. Len. (answer)',\n",
    "    'EigenScore': 'EigenScore',\n",
    "    'TOKEN_SAR': 'TOKEN SAR',\n",
    "    'SENT_SAR': 'SENT SAR',\n",
    "    'SAR': 'SAR',\n",
    "    'log_pplx': 'Perplexity',\n",
    "    'sement': 'Semantic Ent.',\n",
    "    'min_logprob': 'Min Tok. Log Prob.',\n",
    "    'len_norm_sement': 'Semantic Ent. (LN)',\n",
    "    'ptrue_neg_log_prob': 'P(True)',\n",
    "    'GNLL': 'GNLL',\n",
    "    # NOTE(review): empty label, so the `pplx` row is printed without a method name -- confirm this is intended.\n",
    "    'pplx': ''\n",
    "}\n",
    "\n",
    "# Emit one LaTeX row per method: label & column 0 & column 1 & column 2 of `full`.\n",
    "# The `+` format flag signs the last value itself, so a negative gap prints as `-x` instead of `+-x`.\n",
    "for idx, row in full.iterrows():\n",
    "    print(f\"{name_map[idx]} & {row.iloc[0]:0.3f} & {row.iloc[1]:0.3f} & {row.iloc[2]:+0.3f} \\\\\\\\\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61f5a985-7a7d-4a5c-8794-1f45b6be4d5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# LaTeX rows extended with the correctness metric and threshold of the row where each method peaks.\n",
    "# `full.iterrows()` already yields (label, row) pairs; unpacking a second (idx, row) out of each\n",
    "# 3-value row raised ValueError, so the best-row lookup is zipped alongside instead.\n",
    "# Both sequences follow the order of `uncert_methods`, so the pairs line up.\n",
    "best_rows = resdf[['COR', 'THRESH']].loc[_resdf.idxmax(0)[uncert_methods].to_list()]\n",
    "for (cor, thresh), (idx, row) in zip(best_rows.itertuples(index=False), full.iterrows()):\n",
    "    # Escape underscores so metric names are valid in LaTeX text mode.\n",
    "    cor_tex = cor.replace('_', '\\\\_')\n",
    "    print(f\"{name_map[idx]} & {row.iloc[0]:0.3f} & {row.iloc[1]:0.3f} & {row.iloc[2]:+0.3f} & {cor_tex} ({thresh}) \\\\\\\\\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
