{
 "cells": [
  {
   "cell_type": "code",
   "id": "6f1b99b6-273c-4b99-b42b-7e56f50ff191",
   "metadata": {},
   "source": [
    "# === Minimal PRR tables (single-model) — exact mask, display renaming, clean midrules, beam suffixes, MEAN col ===\n",
    "\n",
    "from lm_polygraph.estimators import *\n",
    "from lm_polygraph.estimators.eccentricity import *\n",
    "from typing import Sequence, Tuple, Dict, Callable, Optional\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from IPython.display import display, Markdown\n",
    "from lm_polygraph.ue_metrics import PredictionRejectionAreaNormalized\n",
    "\n",
    "# ---------------- exact mask (your preferred implementation) ----------------\n",
    "def _mask_for_dataset(man, filter_fn: Callable[[dict], bool]) -> np.ndarray:\n",
    "    stats = man['stats']\n",
    "    n = len(stats['input_texts'])\n",
    "    mask = np.empty(n, dtype=bool)\n",
    "    for i in range(n):\n",
    "        s = {}\n",
    "        for k in stats.keys():\n",
    "            try:\n",
    "                s[k] = stats[k][i]\n",
    "            except:\n",
    "                continue\n",
    "        mask[i] = bool(filter_fn(s))\n",
    "    return mask\n",
    "\n",
    "def _prr(man, method_name: str, rank_metric, quality_metric: str, mask: np.ndarray) -> float:\n",
    "    try:\n",
    "        uqs = np.asarray(man['estimations'][('sequence', method_name)])\n",
    "        tgt = 1 - np.asarray(man['gen_metrics'][('sequence', quality_metric)])\n",
    "        return float(rank_metric(uqs[mask], tgt[mask]))\n",
    "    except Exception:\n",
    "        return np.nan\n",
    "\n",
    "# ---------------- table builder ----------------\n",
    "def build_prr_single_model(\n",
    "    mans_by_dataset: Dict[str, Sequence],\n",
    "    dataset_order: Sequence[str],\n",
    "    model_index: int,\n",
    "    uq_methods: Sequence[Tuple[str, ...]],\n",
    "    *,\n",
    "    rank_metric = PredictionRejectionAreaNormalized(0.5),\n",
    "    quality_metric: str = 'AlignScore',\n",
    "    filters_by_dataset: Optional[Dict[str, Callable[[dict], bool]]] = None,\n",
    ") -> pd.DataFrame:\n",
    "    \"\"\"Index = all methods from uq_methods (flattened), columns = dataset_order, values = PRR.\"\"\"\n",
    "    masks: Dict[str, Optional[np.ndarray]] = {}\n",
    "    for ds in dataset_order:\n",
    "        mans = mans_by_dataset.get(ds)\n",
    "        man = mans[model_index] if mans and model_index < len(mans) else None\n",
    "        masks[ds] = _mask_for_dataset(man, (filters_by_dataset or {}).get(ds, lambda _: True)) if man else None\n",
    "\n",
    "    rows = [m for grp in uq_methods for m in grp]\n",
    "    data = []\n",
    "    for mname in rows:\n",
    "        vals = []\n",
    "        for ds in dataset_order:\n",
    "            mans = mans_by_dataset.get(ds)\n",
    "            man = mans[model_index] if mans and model_index < len(mans) else None\n",
    "            vals.append(_prr(man, mname, rank_metric, quality_metric, masks[ds]) if (man and masks[ds] is not None) else np.nan)\n",
    "        data.append(vals)\n",
    "    return pd.DataFrame(data, index=rows, columns=list(dataset_order))\n",
    "\n",
    "# ---------------- notebook display ----------------\n",
    "def show_prr(\n",
    "    df: pd.DataFrame,\n",
    "    uq_methods: Sequence[Tuple[str, ...]],\n",
    "    *,\n",
    "    title: str | None = None,\n",
    "    precision: int = 3,\n",
    "    method_rename: Callable[[str], str] = lambda x: x,\n",
    "    ds_rename: Callable[[str], str] = lambda x: x,\n",
    "    add_beam_suffix: bool = True,   # add \" + beamsearch\" (2nd) / \" + beamsearch + probs\" (3rd)\n",
    "    include_mean: bool = False,      # add a \"Mean\" column (row-wise mean over datasets)\n",
    "    mean_label: str = 'Mean',\n",
    ") -> None:\n",
    "    \"\"\"Per-column Greens; Top-1 bold, Top-2 underline; separators for multi-row groups; renamed labels + beam suffixes + optional Mean col.\"\"\"\n",
    "    ordered_rows = [m for grp in uq_methods for m in grp]\n",
    "    dfR = df.reindex(ordered_rows)\n",
    "\n",
    "    # add Mean column (over current dataset columns)\n",
    "    if include_mean:\n",
    "        dfR = dfR.copy()\n",
    "        dfR['Mean'] = dfR.mean(axis=1, skipna=True)\n",
    "\n",
    "    # build display index with optional suffixes by position in group\n",
    "    disp_index = []\n",
    "    for grp in uq_methods:\n",
    "        for j, m in enumerate(grp):\n",
    "            name = method_rename(m)\n",
    "            if add_beam_suffix:\n",
    "                if j == 1: name += ' + beamsearch'\n",
    "                elif j == 2: name += ' + beamsearch + probs'\n",
    "            disp_index.append(name)\n",
    "\n",
    "    disp = dfR.copy()\n",
    "    disp.index = disp_index\n",
    "    # rename datasets; keep mean as mean_label\n",
    "    disp.columns = [ds_rename(c) if c != 'Mean' else mean_label for c in disp.columns]\n",
    "\n",
    "    if title:\n",
    "        display(Markdown('## ' + title))\n",
    "\n",
    "    styled = disp.style.format(f'{{:.{precision}f}}')\n",
    "\n",
    "    # per-column gradient\n",
    "    for c in disp.columns:\n",
    "        col = pd.to_numeric(disp[c], errors='coerce').dropna()\n",
    "        if not col.empty:\n",
    "            vmin, vmax = float(col.min()), float(col.max())\n",
    "            if vmin == vmax: vmax = vmin + 1e-12\n",
    "            styled = styled.background_gradient(cmap='Greens', subset=[c], vmin=vmin, vmax=vmax)\n",
    "\n",
    "    # Top-1 bold, Top-2 underline\n",
    "    def _col_rank_styles(col: pd.Series):\n",
    "        s = pd.to_numeric(col, errors='coerce')\n",
    "        styles = pd.Series('', index=s.index)\n",
    "        ranked = s.dropna().sort_values(ascending=False, kind='mergesort')\n",
    "        if len(ranked) >= 1: styles.loc[ranked.index[0]] += 'font-weight:700;'\n",
    "        if len(ranked) >= 2: styles.loc[ranked.index[1]] += 'text-decoration: underline;'\n",
    "        return styles\n",
    "    styled = styled.apply(_col_rank_styles, axis=0)\n",
    "\n",
    "    # separators only around multi-row groups\n",
    "    start = 0\n",
    "    for grp in uq_methods:\n",
    "        if len(grp) >= 2:\n",
    "            end = start + len(grp) - 1\n",
    "            if start > 0:\n",
    "                styled = styled.set_properties(subset=pd.IndexSlice[disp.index[start], :], **{'border-top': '3px solid black'})\n",
    "            if end < len(disp) - 1:\n",
    "                styled = styled.set_properties(subset=pd.IndexSlice[disp.index[end], :], **{'border-bottom': '3px solid black'})\n",
    "        start += len(grp)\n",
    "\n",
    "    display(styled)\n",
    "\n",
    "# ---------------- LaTeX printer (clean midrules + beam suffixes + Mean col) ----------------\n",
    "def print_latex_prr(\n",
    "    df: pd.DataFrame,\n",
    "    uq_methods: Sequence[Tuple[str, ...]],\n",
    "    *,\n",
    "    precision: int = 3,\n",
    "    method_rename: Callable[[str], str] = lambda x: x,\n",
    "    ds_rename: Callable[[str], str] = lambda x: x,\n",
    "    save_path: Optional[str] = None,\n",
    "    up_arrow_tex: str = r'$\\uparrow$',\n",
    "    add_beam_suffix: bool = True,\n",
    "    include_mean: bool = False,\n",
    "    mean_label: str = 'Mean',\n",
    ") -> str:\n",
    "    \"\"\"\n",
    "    - rows in uq_methods order\n",
    "    - NO midrules for single-row groups; ONE \\midrule before each multi-row group (except first)\n",
    "    - bold Top-1, underline Top-2 per column\n",
    "    - ↑ if row > group's first row (per column)\n",
    "    - remove leading zeros\n",
    "    - optional Mean column (row-wise mean across datasets)\n",
    "    \"\"\"\n",
    "    rows_order = [m for grp in uq_methods for m in grp]\n",
    "    dfR = df.reindex(rows_order)\n",
    "    if include_mean:\n",
    "        dfR = dfR.copy()\n",
    "        dfR['Mean'] = dfR.mean(axis=1, skipna=True)\n",
    "\n",
    "    # ranking info (including Mean if present)\n",
    "    top1, top2 = {}, {}\n",
    "    for col in dfR.columns:\n",
    "        s = pd.to_numeric(dfR[col], errors='coerce').dropna().sort_values(ascending=False, kind='mergesort')\n",
    "        if len(s) >= 1: top1[col] = s.index[0]\n",
    "        if len(s) >= 2: top2[col] = s.index[1]\n",
    "\n",
    "    def _fmt_num(x: float) -> str:\n",
    "        if pd.isna(x): return ''\n",
    "        txt = f'{x:.{precision}f}'\n",
    "        if txt.startswith('0.'):  txt = txt[1:]\n",
    "        if txt.startswith('-0.'): txt = '-' + txt[2:]\n",
    "        return txt\n",
    "\n",
    "    def _fmt_cell(val, row_label, col_name, baseline_val, is_first_in_group):\n",
    "        txt = _fmt_num(val)\n",
    "        if txt == '': return txt\n",
    "        if col_name in top1 and row_label == top1[col_name]:\n",
    "            txt = r'\\textbf{' + txt + '}'\n",
    "        elif col_name in top2 and row_label == top2[col_name]:\n",
    "            txt = r'\\underline{' + txt + '}'\n",
    "        if (not is_first_in_group) and baseline_val is not None and not pd.isna(baseline_val):\n",
    "            try:\n",
    "                if float(val) > float(baseline_val):\n",
    "                    txt += up_arrow_tex\n",
    "            except Exception:\n",
    "                pass\n",
    "        return txt\n",
    "\n",
    "    cols = list(dfR.columns)\n",
    "    header = ['Method'] + [ds_rename(c) if c != 'Mean' else mean_label for c in cols]\n",
    "\n",
    "    lines = [\n",
    "        r'\\begin{tabular}{' + ('l' + 'c'*len(cols)) + '}',\n",
    "        r'\\toprule',\n",
    "        ' & '.join(header) + r' \\\\',\n",
    "        r'\\midrule'\n",
    "    ]\n",
    "\n",
    "    printed_any = False\n",
    "    for grp in uq_methods:\n",
    "        multi = (len(grp) >= 2)\n",
    "        if multi and printed_any:\n",
    "            lines.append(r'\\midrule')\n",
    "\n",
    "        baseline = dfR.loc[grp[0]] if grp[0] in dfR.index else None\n",
    "\n",
    "        for j, m in enumerate(grp):\n",
    "            row = dfR.loc[m] if m in dfR.index else pd.Series([np.nan]*len(cols), index=cols)\n",
    "            name = method_rename(m)\n",
    "            if add_beam_suffix:\n",
    "                if j == 1: name += ' + beamsearch'\n",
    "                elif j == 2: name += ' + beamsearch + probs'\n",
    "            cells = [name] + [\n",
    "                _fmt_cell(row[c], m, c, (None if (j==0 or baseline is None) else baseline[c]), j==0)\n",
    "                for c in cols\n",
    "            ]\n",
    "            lines.append(' & '.join(cells) + r' \\\\')\n",
    "            printed_any = True\n",
    "\n",
    "    lines.append(r'\\bottomrule')\n",
    "    lines.append(r'\\end{tabular}')\n",
    "\n",
    "    latex = '\\n'.join(lines)\n",
    "    if save_path:\n",
    "        with open(save_path, 'w') as f:\n",
    "            f.write(latex)\n",
    "    else:\n",
    "        print(latex)\n",
    "    return latex"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "99ea1e2e-46c1-4dc3-833b-0a3f0ed9c4f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "from updated_uq import *\n",
    "def update_uq(man, **process_probs_args):\n",
    "    update_uq_list = [\n",
    "        Dissimilarity(samples_source='sample'),\n",
    "        # DissimilarityP(samples_source='sample', **process_probs_args),\n",
    "        DissimilarityP(samples_source='beamsearch', **process_probs_args),\n",
    "\n",
    "        CocoaMSP(samples_source='sample'),\n",
    "        # CocoaMSPP(samples_source='sample', **process_probs_args),\n",
    "        CocoaMSPP(samples_source='beamsearch', **process_probs_args),\n",
    "\n",
    "        CocoaMTE(samples_source='sample'),\n",
    "        # CocoaMTEP(samples_source='sample', **process_probs_args),\n",
    "        CocoaMTEP(samples_source='beamsearch', **process_probs_args),\n",
    "\n",
    "        CocoaPPL(samples_source='sample'),\n",
    "        # CocoaPPLP(samples_source='sample', **process_probs_args),\n",
    "        CocoaPPLP(samples_source='beamsearch', **process_probs_args),\n",
    "\n",
    "        CocoaMSP(samples_source='sample'),\n",
    "        # CocoaMSPP(samples_source='sample', **process_probs_args),\n",
    "        CocoaMSPP(samples_source='beamsearch', **process_probs_args),\n",
    "        \n",
    "        EccentricityConf(samples_source='sample', formula='dist_to_mean'),\n",
    "        # EccentricityPConf(samples_source='sample', formula='dist_to_mean', **process_probs_args),\n",
    "        # EccentricityConf(samples_source='beamsearch', formula='dist_to_mean'),\n",
    "        EccentricityPConf(samples_source='beamsearch', formula='dist_to_mean', **process_probs_args),\n",
    "        \n",
    "        EccentricityConf(samples_source='sample', formula='mean_dist'),\n",
    "        # EccentricityPConf(samples_source='sample', formula='mean_dist', **process_probs_args),\n",
    "        # EccentricityConf(samples_source='beamsearch', formula='mean_dist'),\n",
    "        EccentricityPConf(samples_source='beamsearch', formula='mean_dist', **process_probs_args),\n",
    "    ]\n",
    "    for uq in tqdm(update_uq_list, total=len(update_uq_list), desc='Updating estimators', leave=False, position=2):\n",
    "        man['estimations'][uq.level, str(uq)] = uq(man['stats'])\n",
    "    return man"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "464d9196-f172-4356-8e5a-d6d32bbb04dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "def load_man(man_path: str):\n",
    "    return torch.load(man_path, weights_only=False)\n",
    "\n",
    "model_simple_titles = ['llama8b', 'llama8bit', 'gemma4b', 'gemma4bit', 'qwen8b', 'qwen8bit']\n",
    "model_titles = ['Llama 8B', 'Llama 8B Instruct', 'Gemma 4B', 'Gemma 4B Instruct', 'Qwen 8B Base', 'Qwen 8B Instruct']\n",
    "datasets = ['triviaqa', 'webq', 'coqa', 'hotpotqa', 'csqa', 'arcchallenge']\n",
    "filters = {\n",
    "    'webq': lambda s: len(s['greedy_tokens']) <= 5,\n",
    "    'arcchallenge': lambda s: len(s['greedy_tokens']) <= 5,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "217295b8-3740-49c8-80d9-ffc917e62756",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e26b23c63c2e4a24966d35ec2ed8e224",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Dataset:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Model:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Model:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Model:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Model:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Model:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Model:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Updating estimators:   0%|          | 0/14 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# del mans_by_dataset\n",
    "if 'mans_by_dataset' not in globals():\n",
    "    mans_by_dataset = {\n",
    "        dataset: [\n",
    "            update_uq(\n",
    "                load_man(f'data/{dataset}_{model}.man'),\n",
    "                min_p=0.0,\n",
    "                normalize_all=True,\n",
    "            )\n",
    "            for model in tqdm(model_simple_titles, position=1, leave=False, desc='Model')\n",
    "        ]\n",
    "        for dataset in tqdm(datasets, desc='Dataset')\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "cf868ae6-e99b-4518-89a4-51b3782bd9bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "uq_methods = [\n",
    "    ('MaximumSequenceProbability',),\n",
    "    ('MeanTokenEntropy',),\n",
    "    ('Perplexity',),\n",
    "    ('CCP',),\n",
    "    # ('TokenSAR',),\n",
    "    ('SAR',),\n",
    "    ('PTrue',),\n",
    "    \n",
    "    (\n",
    "        'SemanticEntropy',\n",
    "        # 'SemanticEntropy_beamsearch',\n",
    "    ),\n",
    "    # (\n",
    "    #     'SentenceSAR',\n",
    "    #     # 'SentenceSAR_beamsearch',\n",
    "    # ),\n",
    "    (\n",
    "        'LexicalSimilarity_rougeL',\n",
    "        # 'LexicalSimilarity_rougeL_beamsearch'),\n",
    "    ),\n",
    "    # (\n",
    "    #     'DegMat_NLI_score_entail',\n",
    "    #     # 'DegMat_beamsearch_NLI_score_entail',\n",
    "    # ),\n",
    "    (\n",
    "        'EigValLaplacian_NLI_score_entail',\n",
    "        # 'EigValLaplacian_beamsearch_NLI_score_entail',\n",
    "    ),\n",
    "    (\n",
    "        'NumSemSets',\n",
    "        # 'NumSemSets_beamsearch',\n",
    "    ),\n",
    "    \n",
    "    (\n",
    "        'Dissimilarity',\n",
    "        # 'Dissimilarity_beamsearch',\n",
    "        'DissimilarityP_beamsearch',\n",
    "    ),\n",
    "    (\n",
    "        'EccentricityConf_greedy+sample_NLI_score_entail',\n",
    "        # 'EccentricityConf_greedy+beamsearch_NLI_score_entail',\n",
    "        'EccentricityPConf_greedy+beamsearch_NLI_score_entail',\n",
    "    ),\n",
    "    (\n",
    "        'EigVecDissimilarity_greedy+sample_NLI_score_entail',\n",
    "        # 'EigVecDissimilarity_greedy+beamsearch_NLI_score_entail',\n",
    "        'EigVecDissimilarityP_greedy+beamsearch_NLI_score_entail',\n",
    "    ),\n",
    "    (\n",
    "        'CocoaMSP',\n",
    "        # 'CocoaMSP_beamsearch',\n",
    "        'CocoaMSPP_beamsearch',\n",
    "    ),\n",
    "    # (\n",
    "    #     'CocoaMTE',\n",
    "    #     # 'CocoaMTE_beamsearch',\n",
    "    #     'CocoaMTEP_beamsearch',\n",
    "    # ),\n",
    "    (\n",
    "        'CocoaPPL',\n",
    "        # 'CocoaPPL_beamsearch',\n",
    "        'CocoaPPLP_beamsearch',\n",
    "    ),\n",
    "]\n",
    "\n",
    "def uq_rename_fn(x):\n",
    "    for a, b in [\n",
    "        ('MaximumSequenceProbability', 'MSP'),\n",
    "        ('MeanTokenEntropy', 'MTE'),\n",
    "        ('PTrue', 'P(True)'),\n",
    "        ('DegMat_rougeL', 'DegMat'),\n",
    "        ('_NLI_score_entail', ''),\n",
    "        ('_greedy+sample', ''),\n",
    "        ('_greedy+beamsearch', ''),\n",
    "        ('EccentricityConf', 'Eccentricity'),\n",
    "        ('EccentricityPConf', 'Eccentricity'),\n",
    "        ('DissimilarityP', 'Dissimilarity'),\n",
    "        ('EigVecDissimilarityP', 'EigVecDissimilarity'),\n",
    "        ('CocoaMTEP', 'CocoaMTE'),\n",
    "        ('CocoaPPLP', 'CocoaPPL'),\n",
    "        ('CocoaMSPP', 'CocoaMSP'),\n",
    "        ('_beamsearch', ''),\n",
    "        ('LexicalSimilarity_rougeL', 'Lexical Similarity'),\n",
    "    ]:\n",
    "        x = x.replace(a, b)\n",
    "    return x\n",
    "\n",
    "def ds_rename_fn(x):\n",
    "    if x == 'triviaqa':\n",
    "        return 'TriviaQA'\n",
    "    if x == 'webq':\n",
    "        return r'\\multirowcell{Web\\\\Questions}'\n",
    "    if x == 'coqa':\n",
    "        return 'CoQA'\n",
    "    if x == 'hotpotqa':\n",
    "        return 'HotpotQA'\n",
    "    if x == 'csqa':\n",
    "        return r'\\multirowcell{Common\\\\senceQA}'\n",
    "    if x == 'arcchallenge':\n",
    "        return r'\\multirowcell{ARC-\\\\Challenge}'\n",
    "    return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "f7ac5547-9d5c-462e-8a4e-2ce2c80f165a",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "## Gemma 4B, PRR per dataset"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_43a5a_row0_col0 {\n",
       "  background-color: #19833e;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row0_col1 {\n",
       "  background-color: #29914a;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row0_col2 {\n",
       "  background-color: #76c578;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row0_col3 {\n",
       "  background-color: #b0dfaa;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row0_col4 {\n",
       "  background-color: #0b7734;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row0_col5, #T_43a5a_row2_col5, #T_43a5a_row3_col0 {\n",
       "  background-color: #0a7633;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row1_col0, #T_43a5a_row1_col1 {\n",
       "  background-color: #137d39;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row1_col2 {\n",
       "  background-color: #52b365;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row1_col3 {\n",
       "  background-color: #309950;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row1_col4 {\n",
       "  background-color: #62bb6d;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row1_col5 {\n",
       "  background-color: #66bd6f;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row2_col0 {\n",
       "  background-color: #208843;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row2_col1 {\n",
       "  background-color: #1e8741;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row2_col2 {\n",
       "  background-color: #4eb264;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row2_col3, #T_43a5a_row7_col1 {\n",
       "  background-color: #238b45;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row2_col4 {\n",
       "  background-color: #107a37;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row3_col1 {\n",
       "  background-color: #18823d;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row3_col2 {\n",
       "  background-color: #6dc072;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row3_col3 {\n",
       "  background-color: #8dd08a;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row3_col4 {\n",
       "  background-color: #0e7936;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row3_col5 {\n",
       "  background-color: #147e3a;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row4_col0 {\n",
       "  background-color: #1c8540;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row4_col1 {\n",
       "  background-color: #17813d;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row4_col2, #T_43a5a_row7_col2 {\n",
       "  background-color: #5db96b;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row4_col3 {\n",
       "  background-color: #a4da9e;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row4_col4 {\n",
       "  background-color: #d7efd1;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row4_col5, #T_43a5a_row9_col3 {\n",
       "  background-color: #b2e0ac;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row5_col0, #T_43a5a_row5_col1, #T_43a5a_row5_col2, #T_43a5a_row5_col3, #T_43a5a_row5_col4, #T_43a5a_row5_col5 {\n",
       "  background-color: #f7fcf5;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row6_col0 {\n",
       "  background-color: #2d954d;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row6_col1 {\n",
       "  background-color: #2f974e;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row6_col2 {\n",
       "  background-color: #7dc87e;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row6_col3 {\n",
       "  background-color: #e8f6e3;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row6_col4 {\n",
       "  background-color: #40aa5d;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row6_col5 {\n",
       "  background-color: #42ab5d;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row7_col0, #T_43a5a_row9_col5 {\n",
       "  background-color: #37a055;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row7_col3 {\n",
       "  background-color: #a9dca3;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row7_col4, #T_43a5a_row8_col4 {\n",
       "  background-color: #50b264;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row7_col5 {\n",
       "  background-color: #5eb96b;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row8_col0 {\n",
       "  background-color: #157f3b;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row8_col1 {\n",
       "  background-color: #1d8640;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row8_col2 {\n",
       "  background-color: #70c274;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row8_col3 {\n",
       "  background-color: #bee5b8;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row8_col5 {\n",
       "  background-color: #38a156;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row9_col0 {\n",
       "  background-color: #1a843f;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row9_col1 {\n",
       "  background-color: #248c46;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row9_col2 {\n",
       "  background-color: #99d595;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row9_col4 {\n",
       "  background-color: #78c679;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_43a5a_row10_col0 {\n",
       "  background-color: #004a1e;\n",
       "  color: #f1f1f1;\n",
       "  text-decoration: underline;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row10_col1 {\n",
       "  background-color: #00471c;\n",
       "  color: #f1f1f1;\n",
       "  text-decoration: underline;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row10_col2 {\n",
       "  background-color: #005020;\n",
       "  color: #f1f1f1;\n",
       "  text-decoration: underline;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row10_col3 {\n",
       "  background-color: #006227;\n",
       "  color: #f1f1f1;\n",
       "  text-decoration: underline;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row10_col4 {\n",
       "  background-color: #005924;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row10_col5 {\n",
       "  background-color: #0b7734;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row11_col0, #T_43a5a_row11_col1, #T_43a5a_row11_col2, #T_43a5a_row11_col4, #T_43a5a_row13_col5 {\n",
       "  background-color: #00441b;\n",
       "  color: #f1f1f1;\n",
       "  font-weight: 700;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row11_col3 {\n",
       "  background-color: #00692a;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row11_col5 {\n",
       "  background-color: #005622;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row12_col0 {\n",
       "  background-color: #006529;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row12_col1 {\n",
       "  background-color: #006328;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row12_col2 {\n",
       "  background-color: #228a44;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row12_col3 {\n",
       "  background-color: #42ab5d;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row12_col4, #T_43a5a_row18_col2, #T_43a5a_row18_col3 {\n",
       "  background-color: #006227;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row12_col5, #T_43a5a_row16_col2 {\n",
       "  background-color: #097532;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row13_col0, #T_43a5a_row17_col5 {\n",
       "  background-color: #005522;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row13_col1 {\n",
       "  background-color: #006c2c;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row13_col2 {\n",
       "  background-color: #0b7734;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row13_col3 {\n",
       "  background-color: #29914a;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row13_col4 {\n",
       "  background-color: #00471c;\n",
       "  color: #f1f1f1;\n",
       "  text-decoration: underline;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row14_col0, #T_43a5a_row16_col0 {\n",
       "  background-color: #005622;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row14_col1 {\n",
       "  background-color: #005f26;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row14_col2 {\n",
       "  background-color: #29914a;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row14_col3 {\n",
       "  background-color: #45ad5f;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row14_col4 {\n",
       "  background-color: #0a7633;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row14_col5 {\n",
       "  background-color: #026f2e;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row15_col0 {\n",
       "  background-color: #004c1e;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row15_col1 {\n",
       "  background-color: #005c25;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row15_col2 {\n",
       "  background-color: #107a37;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row15_col3 {\n",
       "  background-color: #339c52;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row15_col4 {\n",
       "  background-color: #005924;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row15_col5 {\n",
       "  background-color: #004c1e;\n",
       "  color: #f1f1f1;\n",
       "  text-decoration: underline;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row16_col1, #T_43a5a_row16_col5, #T_43a5a_row18_col5 {\n",
       "  background-color: #005c25;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row16_col3 {\n",
       "  background-color: #52b365;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row16_col4 {\n",
       "  background-color: #004a1e;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row17_col0 {\n",
       "  background-color: #005020;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row17_col1 {\n",
       "  background-color: #005723;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row17_col2 {\n",
       "  background-color: #006227;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row17_col3 {\n",
       "  background-color: #258d47;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row17_col4 {\n",
       "  background-color: #00491d;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row18_col0 {\n",
       "  background-color: #005522;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row18_col1 {\n",
       "  background-color: #005723;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row18_col4 {\n",
       "  background-color: #004d1f;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_43a5a_row19_col0, #T_43a5a_row19_col1 {\n",
       "  background-color: #005020;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row19_col2 {\n",
       "  background-color: #005120;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row19_col3 {\n",
       "  background-color: #00441b;\n",
       "  color: #f1f1f1;\n",
       "  font-weight: 700;\n",
       "}\n",
       "#T_43a5a_row19_col4 {\n",
       "  background-color: #004c1e;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_43a5a_row19_col5 {\n",
       "  background-color: #005321;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_43a5a\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_43a5a_level0_col0\" class=\"col_heading level0 col0\" >triviaqa</th>\n",
       "      <th id=\"T_43a5a_level0_col1\" class=\"col_heading level0 col1\" >webq</th>\n",
       "      <th id=\"T_43a5a_level0_col2\" class=\"col_heading level0 col2\" >coqa</th>\n",
       "      <th id=\"T_43a5a_level0_col3\" class=\"col_heading level0 col3\" >hotpotqa</th>\n",
       "      <th id=\"T_43a5a_level0_col4\" class=\"col_heading level0 col4\" >csqa</th>\n",
       "      <th id=\"T_43a5a_level0_col5\" class=\"col_heading level0 col5\" >arcchallenge</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row0\" class=\"row_heading level0 row0\" >MSP</th>\n",
       "      <td id=\"T_43a5a_row0_col0\" class=\"data row0 col0\" >0.659</td>\n",
       "      <td id=\"T_43a5a_row0_col1\" class=\"data row0 col1\" >0.521</td>\n",
       "      <td id=\"T_43a5a_row0_col2\" class=\"data row0 col2\" >0.312</td>\n",
       "      <td id=\"T_43a5a_row0_col3\" class=\"data row0 col3\" >0.274</td>\n",
       "      <td id=\"T_43a5a_row0_col4\" class=\"data row0 col4\" >0.511</td>\n",
       "      <td id=\"T_43a5a_row0_col5\" class=\"data row0 col5\" >0.548</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row1\" class=\"row_heading level0 row1\" >MTE</th>\n",
       "      <td id=\"T_43a5a_row1_col0\" class=\"data row1 col0\" >0.670</td>\n",
       "      <td id=\"T_43a5a_row1_col1\" class=\"data row1 col1\" >0.583</td>\n",
       "      <td id=\"T_43a5a_row1_col2\" class=\"data row1 col2\" >0.363</td>\n",
       "      <td id=\"T_43a5a_row1_col3\" class=\"data row1 col3\" >0.494</td>\n",
       "      <td id=\"T_43a5a_row1_col4\" class=\"data row1 col4\" >0.364</td>\n",
       "      <td id=\"T_43a5a_row1_col5\" class=\"data row1 col5\" >0.381</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row2\" class=\"row_heading level0 row2\" >Perplexity</th>\n",
       "      <td id=\"T_43a5a_row2_col0\" class=\"data row2 col0\" >0.647</td>\n",
       "      <td id=\"T_43a5a_row2_col1\" class=\"data row2 col1\" >0.553</td>\n",
       "      <td id=\"T_43a5a_row2_col2\" class=\"data row2 col2\" >0.369</td>\n",
       "      <td id=\"T_43a5a_row2_col3\" class=\"data row2 col3\" >0.527</td>\n",
       "      <td id=\"T_43a5a_row2_col4\" class=\"data row2 col4\" >0.503</td>\n",
       "      <td id=\"T_43a5a_row2_col5\" class=\"data row2 col5\" >0.547</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row3\" class=\"row_heading level0 row3\" >CCP</th>\n",
       "      <td id=\"T_43a5a_row3_col0\" class=\"data row3 col0\" >0.686</td>\n",
       "      <td id=\"T_43a5a_row3_col1\" class=\"data row3 col1\" >0.569</td>\n",
       "      <td id=\"T_43a5a_row3_col2\" class=\"data row3 col2\" >0.326</td>\n",
       "      <td id=\"T_43a5a_row3_col3\" class=\"data row3 col3\" >0.337</td>\n",
       "      <td id=\"T_43a5a_row3_col4\" class=\"data row3 col4\" >0.506</td>\n",
       "      <td id=\"T_43a5a_row3_col5\" class=\"data row3 col5\" >0.527</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row4\" class=\"row_heading level0 row4\" >SAR</th>\n",
       "      <td id=\"T_43a5a_row4_col0\" class=\"data row4 col0\" >0.656</td>\n",
       "      <td id=\"T_43a5a_row4_col1\" class=\"data row4 col1\" >0.571</td>\n",
       "      <td id=\"T_43a5a_row4_col2\" class=\"data row4 col2\" >0.347</td>\n",
       "      <td id=\"T_43a5a_row4_col3\" class=\"data row4 col3\" >0.296</td>\n",
       "      <td id=\"T_43a5a_row4_col4\" class=\"data row4 col4\" >0.183</td>\n",
       "      <td id=\"T_43a5a_row4_col5\" class=\"data row4 col5\" >0.264</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row5\" class=\"row_heading level0 row5\" >P(True)</th>\n",
       "      <td id=\"T_43a5a_row5_col0\" class=\"data row5 col0\" >0.272</td>\n",
       "      <td id=\"T_43a5a_row5_col1\" class=\"data row5 col1\" >-0.004</td>\n",
       "      <td id=\"T_43a5a_row5_col2\" class=\"data row5 col2\" >0.031</td>\n",
       "      <td id=\"T_43a5a_row5_col3\" class=\"data row5 col3\" >0.075</td>\n",
       "      <td id=\"T_43a5a_row5_col4\" class=\"data row5 col4\" >0.090</td>\n",
       "      <td id=\"T_43a5a_row5_col5\" class=\"data row5 col5\" >0.090</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row6\" class=\"row_heading level0 row6\" >SemanticEntropy</th>\n",
       "      <td id=\"T_43a5a_row6_col0\" class=\"data row6 col0\" >0.622</td>\n",
       "      <td id=\"T_43a5a_row6_col1\" class=\"data row6 col1\" >0.505</td>\n",
       "      <td id=\"T_43a5a_row6_col2\" class=\"data row6 col2\" >0.301</td>\n",
       "      <td id=\"T_43a5a_row6_col3\" class=\"data row6 col3\" >0.140</td>\n",
       "      <td id=\"T_43a5a_row6_col4\" class=\"data row6 col4\" >0.407</td>\n",
       "      <td id=\"T_43a5a_row6_col5\" class=\"data row6 col5\" >0.431</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row7\" class=\"row_heading level0 row7\" >Lexical Similarity</th>\n",
       "      <td id=\"T_43a5a_row7_col0\" class=\"data row7 col0\" >0.602</td>\n",
       "      <td id=\"T_43a5a_row7_col1\" class=\"data row7 col1\" >0.540</td>\n",
       "      <td id=\"T_43a5a_row7_col2\" class=\"data row7 col2\" >0.349</td>\n",
       "      <td id=\"T_43a5a_row7_col3\" class=\"data row7 col3\" >0.286</td>\n",
       "      <td id=\"T_43a5a_row7_col4\" class=\"data row7 col4\" >0.386</td>\n",
       "      <td id=\"T_43a5a_row7_col5\" class=\"data row7 col5\" >0.392</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row8\" class=\"row_heading level0 row8\" >EigValLaplacian</th>\n",
       "      <td id=\"T_43a5a_row8_col0\" class=\"data row8 col0\" >0.666</td>\n",
       "      <td id=\"T_43a5a_row8_col1\" class=\"data row8 col1\" >0.555</td>\n",
       "      <td id=\"T_43a5a_row8_col2\" class=\"data row8 col2\" >0.320</td>\n",
       "      <td id=\"T_43a5a_row8_col3\" class=\"data row8 col3\" >0.246</td>\n",
       "      <td id=\"T_43a5a_row8_col4\" class=\"data row8 col4\" >0.386</td>\n",
       "      <td id=\"T_43a5a_row8_col5\" class=\"data row8 col5\" >0.452</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row9\" class=\"row_heading level0 row9\" >NumSemSets</th>\n",
       "      <td id=\"T_43a5a_row9_col0\" class=\"data row9 col0\" >0.656</td>\n",
       "      <td id=\"T_43a5a_row9_col1\" class=\"data row9 col1\" >0.538</td>\n",
       "      <td id=\"T_43a5a_row9_col2\" class=\"data row9 col2\" >0.257</td>\n",
       "      <td id=\"T_43a5a_row9_col3\" class=\"data row9 col3\" >0.268</td>\n",
       "      <td id=\"T_43a5a_row9_col4\" class=\"data row9 col4\" >0.338</td>\n",
       "      <td id=\"T_43a5a_row9_col5\" class=\"data row9 col5\" >0.454</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row10\" class=\"row_heading level0 row10\" >Dissimilarity</th>\n",
       "      <td id=\"T_43a5a_row10_col0\" class=\"data row10 col0\" >0.755</td>\n",
       "      <td id=\"T_43a5a_row10_col1\" class=\"data row10 col1\" >0.715</td>\n",
       "      <td id=\"T_43a5a_row10_col2\" class=\"data row10 col2\" >0.578</td>\n",
       "      <td id=\"T_43a5a_row10_col3\" class=\"data row10 col3\" >0.626</td>\n",
       "      <td id=\"T_43a5a_row10_col4\" class=\"data row10 col4\" >0.561</td>\n",
       "      <td id=\"T_43a5a_row10_col5\" class=\"data row10 col5\" >0.545</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row11\" class=\"row_heading level0 row11\" >Dissimilarity + beamsearch</th>\n",
       "      <td id=\"T_43a5a_row11_col0\" class=\"data row11 col0\" >0.766</td>\n",
       "      <td id=\"T_43a5a_row11_col1\" class=\"data row11 col1\" >0.722</td>\n",
       "      <td id=\"T_43a5a_row11_col2\" class=\"data row11 col2\" >0.600</td>\n",
       "      <td id=\"T_43a5a_row11_col3\" class=\"data row11 col3\" >0.611</td>\n",
       "      <td id=\"T_43a5a_row11_col4\" class=\"data row11 col4\" >0.595</td>\n",
       "      <td id=\"T_43a5a_row11_col5\" class=\"data row11 col5\" >0.604</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row12\" class=\"row_heading level0 row12\" >Eccentricity</th>\n",
       "      <td id=\"T_43a5a_row12_col0\" class=\"data row12 col0\" >0.714</td>\n",
       "      <td id=\"T_43a5a_row12_col1\" class=\"data row12 col1\" >0.653</td>\n",
       "      <td id=\"T_43a5a_row12_col2\" class=\"data row12 col2\" >0.459</td>\n",
       "      <td id=\"T_43a5a_row12_col3\" class=\"data row12 col3\" >0.453</td>\n",
       "      <td id=\"T_43a5a_row12_col4\" class=\"data row12 col4\" >0.549</td>\n",
       "      <td id=\"T_43a5a_row12_col5\" class=\"data row12 col5\" >0.549</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row13\" class=\"row_heading level0 row13\" >Eccentricity + beamsearch</th>\n",
       "      <td id=\"T_43a5a_row13_col0\" class=\"data row13 col0\" >0.739</td>\n",
       "      <td id=\"T_43a5a_row13_col1\" class=\"data row13 col1\" >0.633</td>\n",
       "      <td id=\"T_43a5a_row13_col2\" class=\"data row13 col2\" >0.505</td>\n",
       "      <td id=\"T_43a5a_row13_col3\" class=\"data row13 col3\" >0.514</td>\n",
       "      <td id=\"T_43a5a_row13_col4\" class=\"data row13 col4\" >0.590</td>\n",
       "      <td id=\"T_43a5a_row13_col5\" class=\"data row13 col5\" >0.636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row14\" class=\"row_heading level0 row14\" >EigVecDissimilarity</th>\n",
       "      <td id=\"T_43a5a_row14_col0\" class=\"data row14 col0\" >0.738</td>\n",
       "      <td id=\"T_43a5a_row14_col1\" class=\"data row14 col1\" >0.661</td>\n",
       "      <td id=\"T_43a5a_row14_col2\" class=\"data row14 col2\" >0.443</td>\n",
       "      <td id=\"T_43a5a_row14_col3\" class=\"data row14 col3\" >0.448</td>\n",
       "      <td id=\"T_43a5a_row14_col4\" class=\"data row14 col4\" >0.512</td>\n",
       "      <td id=\"T_43a5a_row14_col5\" class=\"data row14 col5\" >0.562</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row15\" class=\"row_heading level0 row15\" >EigVecDissimilarity + beamsearch</th>\n",
       "      <td id=\"T_43a5a_row15_col0\" class=\"data row15 col0\" >0.753</td>\n",
       "      <td id=\"T_43a5a_row15_col1\" class=\"data row15 col1\" >0.668</td>\n",
       "      <td id=\"T_43a5a_row15_col2\" class=\"data row15 col2\" >0.497</td>\n",
       "      <td id=\"T_43a5a_row15_col3\" class=\"data row15 col3\" >0.487</td>\n",
       "      <td id=\"T_43a5a_row15_col4\" class=\"data row15 col4\" >0.562</td>\n",
       "      <td id=\"T_43a5a_row15_col5\" class=\"data row15 col5\" >0.621</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row16\" class=\"row_heading level0 row16\" >CocoaMSP</th>\n",
       "      <td id=\"T_43a5a_row16_col0\" class=\"data row16 col0\" >0.738</td>\n",
       "      <td id=\"T_43a5a_row16_col1\" class=\"data row16 col1\" >0.666</td>\n",
       "      <td id=\"T_43a5a_row16_col2\" class=\"data row16 col2\" >0.509</td>\n",
       "      <td id=\"T_43a5a_row16_col3\" class=\"data row16 col3\" >0.430</td>\n",
       "      <td id=\"T_43a5a_row16_col4\" class=\"data row16 col4\" >0.583</td>\n",
       "      <td id=\"T_43a5a_row16_col5\" class=\"data row16 col5\" >0.595</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row17\" class=\"row_heading level0 row17\" >CocoaMSP + beamsearch</th>\n",
       "      <td id=\"T_43a5a_row17_col0\" class=\"data row17 col0\" >0.747</td>\n",
       "      <td id=\"T_43a5a_row17_col1\" class=\"data row17 col1\" >0.679</td>\n",
       "      <td id=\"T_43a5a_row17_col2\" class=\"data row17 col2\" >0.548</td>\n",
       "      <td id=\"T_43a5a_row17_col3\" class=\"data row17 col3\" >0.523</td>\n",
       "      <td id=\"T_43a5a_row17_col4\" class=\"data row17 col4\" >0.586</td>\n",
       "      <td id=\"T_43a5a_row17_col5\" class=\"data row17 col5\" >0.606</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row18\" class=\"row_heading level0 row18\" >CocoaPPL</th>\n",
       "      <td id=\"T_43a5a_row18_col0\" class=\"data row18 col0\" >0.739</td>\n",
       "      <td id=\"T_43a5a_row18_col1\" class=\"data row18 col1\" >0.678</td>\n",
       "      <td id=\"T_43a5a_row18_col2\" class=\"data row18 col2\" >0.548</td>\n",
       "      <td id=\"T_43a5a_row18_col3\" class=\"data row18 col3\" >0.625</td>\n",
       "      <td id=\"T_43a5a_row18_col4\" class=\"data row18 col4\" >0.580</td>\n",
       "      <td id=\"T_43a5a_row18_col5\" class=\"data row18 col5\" >0.595</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_43a5a_level0_row19\" class=\"row_heading level0 row19\" >CocoaPPL + beamsearch</th>\n",
       "      <td id=\"T_43a5a_row19_col0\" class=\"data row19 col0\" >0.748</td>\n",
       "      <td id=\"T_43a5a_row19_col1\" class=\"data row19 col1\" >0.694</td>\n",
       "      <td id=\"T_43a5a_row19_col2\" class=\"data row19 col2\" >0.577</td>\n",
       "      <td id=\"T_43a5a_row19_col3\" class=\"data row19 col3\" >0.681</td>\n",
       "      <td id=\"T_43a5a_row19_col4\" class=\"data row19 col4\" >0.582</td>\n",
       "      <td id=\"T_43a5a_row19_col5\" class=\"data row19 col5\" >0.610</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x14924d496f90>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\begin{tabular}{lcccccc}\n",
      "\\toprule\n",
      "Method & TriviaQA & \\multirowcell{Web\\\\Questions} & CoQA & HotpotQA & \\multirowcell{Common\\\\senceQA} & \\multirowcell{ARC-\\\\Challenge} \\\\\n",
      "\\midrule\n",
      "MSP & .659 & .521 & .312 & .274 & .511 & .548 \\\\\n",
      "MTE & .670 & .583 & .363 & .494 & .364 & .381 \\\\\n",
      "Perplexity & .647 & .553 & .369 & .527 & .503 & .547 \\\\\n",
      "CCP & .686 & .569 & .326 & .337 & .506 & .527 \\\\\n",
      "SAR & .656 & .571 & .347 & .296 & .183 & .264 \\\\\n",
      "P(True) & .272 & -.004 & .031 & .075 & .090 & .090 \\\\\n",
      "SemanticEntropy & .622 & .505 & .301 & .140 & .407 & .431 \\\\\n",
      "Lexical Similarity & .602 & .540 & .349 & .286 & .386 & .392 \\\\\n",
      "EigValLaplacian & .666 & .555 & .320 & .246 & .386 & .452 \\\\\n",
      "NumSemSets & .656 & .538 & .257 & .268 & .338 & .454 \\\\\n",
      "\\midrule\n",
      "Dissimilarity & \\underline{.755} & \\underline{.715} & \\underline{.578} & \\underline{.626} & .561 & .545 \\\\\n",
      "Dissimilarity + beamsearch & \\textbf{.766}$\\uparrow$ & \\textbf{.722}$\\uparrow$ & \\textbf{.600}$\\uparrow$ & .611 & \\textbf{.595}$\\uparrow$ & .604$\\uparrow$ \\\\\n",
      "\\midrule\n",
      "Eccentricity & .714 & .653 & .459 & .453 & .549 & .549 \\\\\n",
      "Eccentricity + beamsearch & .739$\\uparrow$ & .633 & .505$\\uparrow$ & .514$\\uparrow$ & \\underline{.590}$\\uparrow$ & \\textbf{.636}$\\uparrow$ \\\\\n",
      "\\midrule\n",
      "EigVecDissimilarity & .738 & .661 & .443 & .448 & .512 & .562 \\\\\n",
      "EigVecDissimilarity + beamsearch & .753$\\uparrow$ & .668$\\uparrow$ & .497$\\uparrow$ & .487$\\uparrow$ & .562$\\uparrow$ & \\underline{.621}$\\uparrow$ \\\\\n",
      "\\midrule\n",
      "CocoaMSP & .738 & .666 & .509 & .430 & .583 & .595 \\\\\n",
      "CocoaMSP + beamsearch & .747$\\uparrow$ & .679$\\uparrow$ & .548$\\uparrow$ & .523$\\uparrow$ & .586$\\uparrow$ & .606$\\uparrow$ \\\\\n",
      "\\midrule\n",
      "CocoaPPL & .739 & .678 & .548 & .625 & .580 & .595 \\\\\n",
      "CocoaPPL + beamsearch & .748$\\uparrow$ & .694$\\uparrow$ & .577$\\uparrow$ & \\textbf{.681}$\\uparrow$ & .582$\\uparrow$ & .610$\\uparrow$ \\\\\n",
      "\\bottomrule\n",
      "\\end{tabular}\n"
     ]
    }
   ],
   "source": [
    "model_index = 2\n",
    "df = build_prr_single_model(\n",
    "    mans_by_dataset,\n",
    "    dataset_order=datasets,\n",
    "    model_index=model_index,\n",
    "    uq_methods=uq_methods,\n",
    "    rank_metric=PredictionRejectionAreaNormalized(0.5),\n",
    "    quality_metric='AlignScore',\n",
    "    filters_by_dataset=filters,\n",
    ")\n",
    "\n",
    "show_prr(df, uq_methods, title=f'{model_titles[model_index]}, PRR per dataset', method_rename=uq_rename_fn)\n",
    "_ = print_latex_prr(df, uq_methods, method_rename=uq_rename_fn, ds_rename=ds_rename_fn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "5144f258-47ad-4621-8f87-c3379bb523bd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "257aba978607464492cb757044cfe9b8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "for model_index in tqdm(range(len(model_titles)), total=len(model_titles)):\n",
    "    df = build_prr_single_model(\n",
    "        mans_by_dataset,\n",
    "        dataset_order=datasets,\n",
    "        model_index=model_index,\n",
    "        uq_methods=uq_methods,\n",
    "        rank_metric=PredictionRejectionAreaNormalized(0.5),\n",
    "        quality_metric='AlignScore',\n",
    "        filters_by_dataset=filters,\n",
    "    )\n",
    "    print_latex_prr(\n",
    "        df,\n",
    "        uq_methods,\n",
    "        method_rename=uq_rename_fn,\n",
    "        ds_rename=ds_rename_fn,\n",
    "        # save_path=f'latex_tables/qa_{model_simple_titles[model_index]}.tex',\n",
    "        save_path=f'latex_tables/qa_{model_simple_titles[model_index]}_minp0.tex',\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "882e4b09-e10c-492d-8fa9-ae790fdc972b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ---- Build table: columns = models, rows = methods; cell = mean PRR across datasets (PRR-only) ----\n",
    "from typing import Sequence, Tuple, Dict, Callable, Optional\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from lm_polygraph.ue_metrics import PredictionRejectionAreaNormalized\n",
    "\n",
    "def build_prr_models_means(\n",
    "    mans_by_dataset: Dict[str, Sequence],          # {'triviaqa': [man0..man5], 'webq': [man0..man5], ...}\n",
    "    dataset_order: Sequence[str],                  # e.g. ['triviaqa','webq','coqa','hotpotqa','csqa','arcchallenge']\n",
    "    model_titles: Sequence[str],                   # e.g. ['Llama 8B', 'Llama 8B Instruct', ...]\n",
    "    uq_methods: Sequence[Tuple[str, ...]],         # ordered method groups (tuples)\n",
    "    *,\n",
    "    quality_metric: str = 'AlignScore',            # supports 'AlignScore>0.5' too (your _prr handles it)\n",
    "    filters_by_dataset: Optional[Dict[str, Callable[[dict], bool]]] = None,\n",
    "    prr_threshold: float = 0.5,\n",
    ") -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Returns a DataFrame:\n",
    "      - index = methods (flattened in uq_methods order)\n",
    "      - columns = model_titles\n",
    "      - value = mean PRR across datasets in dataset_order\n",
    "    \"\"\"\n",
    "    n_models = len(model_titles)\n",
    "    prr_metric = PredictionRejectionAreaNormalized(prr_threshold)\n",
    "\n",
    "    # Precompute masks per (dataset, model)\n",
    "    masks: Dict[str, list] = {}\n",
    "    for ds in dataset_order:\n",
    "        mans = mans_by_dataset.get(ds, [])\n",
    "        filt = (filters_by_dataset or {}).get(ds, lambda _: True)\n",
    "        ds_masks = []\n",
    "        for mi in range(n_models):\n",
    "            man = mans[mi] if mi < len(mans) else None\n",
    "            ds_masks.append(_mask_for_dataset(man, filt) if man is not None else None)\n",
    "        masks[ds] = ds_masks\n",
    "\n",
    "    # Build table\n",
    "    rows = [m for grp in uq_methods for m in grp]\n",
    "    result = pd.DataFrame(index=rows, columns=list(model_titles), dtype=float)\n",
    "\n",
    "    for mi, model_name in enumerate(model_titles):\n",
    "        for mname in rows:\n",
    "            prrs = []\n",
    "            for ds in dataset_order:\n",
    "                mans = mans_by_dataset.get(ds, [])\n",
    "                man = mans[mi] if mi < len(mans) else None\n",
    "                mask = masks[ds][mi] if ds in masks else None\n",
    "                if man is None or mask is None:\n",
    "                    prrs.append(np.nan)\n",
    "                else:\n",
    "                    prrs.append(_prr(man, mname, prr_metric, quality_metric, mask))\n",
    "            result.loc[mname, model_name] = float(np.nanmean(prrs)) if len(prrs) else np.nan\n",
    "\n",
    "    return result\n",
    "\n",
    "def show_model_means(\n",
    "    df_means: pd.DataFrame,\n",
    "    uq_methods: Sequence[Tuple[str, ...]],\n",
    "    *,\n",
    "    title: str | None = None,\n",
    "    precision: int = 3,\n",
    "    method_rename: Callable[[str], str] = lambda x: x,\n",
    "    add_beam_suffix: bool = True,\n",
    "    lower_is_better: bool = False,    # set True for ECE\n",
    ") -> None:\n",
    "    \"\"\"\n",
    "    Notebook display for the means table (columns = models).\n",
    "      - per-column green scaling (direction-aware)\n",
    "      - Top-1 bold, Top-2 underline (direction-aware)\n",
    "      - group separators; optional ' + beamsearch' / ' + beamsearch + probs' suffixes\n",
    "    \"\"\"\n",
    "    ordered_rows = [m for grp in uq_methods for m in grp]\n",
    "    dfR = df_means.reindex(ordered_rows)\n",
    "\n",
    "    # display index with optional suffixes by position in group\n",
    "    disp_index = []\n",
    "    for grp in uq_methods:\n",
    "        for j, m in enumerate(grp):\n",
    "            name = method_rename(m)\n",
    "            if add_beam_suffix:\n",
    "                if j == 1: name += ' + beamsearch'\n",
    "                elif j == 2: name += ' + beamsearch + probs'\n",
    "            disp_index.append(name)\n",
    "\n",
    "    disp = dfR.copy()\n",
    "    disp.index = disp_index\n",
    "\n",
    "    if title:\n",
    "        display(Markdown('## ' + title))\n",
    "\n",
    "    styled = disp.style.format(f'{{:.{precision}f}}')\n",
    "\n",
    "    # per-column gradient (invert for lower-better)\n",
    "    for c in disp.columns:\n",
    "        col = pd.to_numeric(disp[c], errors='coerce')\n",
    "        nonnan = col.dropna()\n",
    "        if nonnan.empty:\n",
    "            continue\n",
    "        if lower_is_better:\n",
    "            vmax, vmin = float(nonnan.max()), float(nonnan.min())\n",
    "            rng = (vmax - vmin) if vmax > vmin else 1.0\n",
    "            gmap = (vmax - col) / rng\n",
    "            styled = styled.background_gradient(cmap='Greens', subset=[c], gmap=gmap)\n",
    "        else:\n",
    "            vmin, vmax = float(nonnan.min()), float(nonnan.max())\n",
    "            if vmin == vmax: vmax = vmin + 1e-12\n",
    "            styled = styled.background_gradient(cmap='Greens', subset=[c], vmin=vmin, vmax=vmax)\n",
    "\n",
    "    # Top-1 bold, Top-2 underline per column (direction-aware)\n",
    "    def _col_rank_styles(col: pd.Series):\n",
    "        s = pd.to_numeric(col, errors='coerce')\n",
    "        styles = pd.Series('', index=s.index)\n",
    "        ranked = s.dropna().sort_values(ascending=lower_is_better, kind='mergesort')\n",
    "        if len(ranked) >= 1: styles.loc[ranked.index[0]] += 'font-weight:700;'\n",
    "        if len(ranked) >= 2: styles.loc[ranked.index[1]] += 'text-decoration: underline;'\n",
    "        return styles\n",
    "    styled = styled.apply(_col_rank_styles, axis=0)\n",
    "\n",
    "    # separators only around multi-row groups\n",
    "    start = 0\n",
    "    for grp in uq_methods:\n",
    "        if len(grp) >= 2:\n",
    "            end = start + len(grp) - 1\n",
    "            if start > 0:\n",
    "                styled = styled.set_properties(subset=pd.IndexSlice[disp.index[start], :], **{'border-top': '3px solid black'})\n",
    "            if end < len(disp) - 1:\n",
    "                styled = styled.set_properties(subset=pd.IndexSlice[disp.index[end], :], **{'border-bottom': '3px solid black'})\n",
    "        start += len(grp)\n",
    "\n",
    "    display(styled)\n",
    "\n",
    "\n",
    "def print_latex_model_means(\n",
    "    df_means: pd.DataFrame,\n",
    "    uq_methods: Sequence[Tuple[str, ...]],\n",
    "    *,\n",
    "    precision: int = 3,\n",
    "    method_rename: Callable[[str], str] = lambda x: x,\n",
    "    add_beam_suffix: bool = True,\n",
    "    lower_is_better: bool = False,   # set True for ECE\n",
    "    up_arrow_tex: str = r'$\\uparrow$',\n",
    "    save_path: Optional[str] = None,\n",
    ") -> str:\n",
    "    \"\"\"\n",
    "    Booktabs LaTeX for the means table (columns = models):\n",
    "      - single header row: model names\n",
    "      - Top-1 bold / Top-2 underline (direction-aware)\n",
    "      - one \\midrule before each multi-row group (no midrules for singletons)\n",
    "      - ↑ in multi-row groups if a row beats the first method of the group in that column\n",
    "      - remove leading zeros\n",
    "    \"\"\"\n",
    "    ordered_rows = [m for grp in uq_methods for m in grp]\n",
    "    disp = df_means.reindex(ordered_rows)\n",
    "\n",
    "    # ranking info per column\n",
    "    top1, top2 = {}, {}\n",
    "    for c in disp.columns:\n",
    "        s = pd.to_numeric(disp[c], errors='coerce').dropna().sort_values(ascending=lower_is_better, kind='mergesort')\n",
    "        if len(s) >= 1: top1[c] = s.index[0]\n",
    "        if len(s) >= 2: top2[c] = s.index[1]\n",
    "\n",
    "    def _fmt_num(x: float) -> str:\n",
    "        if pd.isna(x): return ''\n",
    "        txt = f'{x:.{precision}f}'\n",
    "        if txt.startswith('0.'):  txt = txt[1:]\n",
    "        if txt.startswith('-0.'): txt = '-' + txt[2:]\n",
    "        return txt\n",
    "\n",
    "    def _fmt_cell(val, row_label, col_name, baseline_val, is_first_in_group):\n",
    "        txt = _fmt_num(val)\n",
    "        if txt == '': return txt\n",
    "        if col_name in top1 and row_label == top1[col_name]:\n",
    "            txt = r'\\textbf{' + txt + '}'\n",
    "        elif col_name in top2 and row_label == top2[col_name]:\n",
    "            txt = r'\\underline{' + txt + '}'\n",
    "        if (not is_first_in_group) and baseline_val is not None and not pd.isna(baseline_val):\n",
    "            try:\n",
    "                vb, v0 = float(val), float(baseline_val)\n",
    "                better = (vb < v0) if lower_is_better else (vb > v0)\n",
    "                if better:\n",
    "                    txt += up_arrow_tex\n",
    "            except Exception:\n",
    "                pass\n",
    "        return txt\n",
    "\n",
    "    cols = list(disp.columns)\n",
    "    lines = [\n",
    "        r'\\begin{tabular}{' + ('l' + 'c'*len(cols)) + '}',\n",
    "        r'\\toprule',\n",
    "        ' & '.join(['Method'] + cols) + r' \\\\',\n",
    "        r'\\midrule'\n",
    "    ]\n",
    "\n",
    "    printed_any = False\n",
    "    for grp in uq_methods:\n",
    "        multi = (len(grp) >= 2)\n",
    "        if multi and printed_any:\n",
    "            lines.append(r'\\midrule')\n",
    "\n",
    "        base_name = grp[0]\n",
    "        base_vals = disp.loc[base_name] if base_name in disp.index else None\n",
    "\n",
    "        for j, m in enumerate(grp):\n",
    "            row = disp.loc[m] if m in disp.index else pd.Series([np.nan]*len(cols), index=cols)\n",
    "            name = method_rename(m)\n",
    "            if add_beam_suffix:\n",
    "                if j == 1: name += ' + beamsearch'\n",
    "                elif j == 2: name += ' + beamsearch + probs'\n",
    "            cells = [name] + [\n",
    "                _fmt_cell(row[c], m, c, (None if (j==0 or base_vals is None) else base_vals[c]), j==0)\n",
    "                for c in cols\n",
    "            ]\n",
    "            lines.append(' & '.join(cells) + r' \\\\')\n",
    "            printed_any = True\n",
    "\n",
    "    lines.append(r'\\bottomrule')\n",
    "    lines.append(r'\\end{tabular}')\n",
    "\n",
    "    latex = '\\n'.join(lines)\n",
    "    if save_path:\n",
    "        with open(save_path, 'w') as f:\n",
    "            f.write(latex)\n",
    "    else:\n",
    "        print(latex)\n",
    "    return latex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "1467ba82-edbf-4072-8175-66dcab0e9157",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "## Mean PRR across datasets (columns = models)"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_00f36_row0_col0, #T_00f36_row7_col0, #T_00f36_row9_col5 {\n",
       "  background-color: #238b45;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row0_col1 {\n",
       "  background-color: #1e8741;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row0_col2 {\n",
       "  background-color: #349d53;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row0_col3, #T_00f36_row6_col3 {\n",
       "  background-color: #006b2b;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row0_col4 {\n",
       "  background-color: #29914a;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row0_col5 {\n",
       "  background-color: #0e7936;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row1_col0, #T_00f36_row8_col5 {\n",
       "  background-color: #1d8640;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row1_col1, #T_00f36_row2_col0, #T_00f36_row3_col1, #T_00f36_row3_col5 {\n",
       "  background-color: #0d7836;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row1_col2 {\n",
       "  background-color: #319a50;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row1_col3 {\n",
       "  background-color: #005924;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row1_col4 {\n",
       "  background-color: #137d39;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row1_col5 {\n",
       "  background-color: #0a7633;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row2_col1 {\n",
       "  background-color: #2d954d;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row2_col2, #T_00f36_row7_col3 {\n",
       "  background-color: #1c8540;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row2_col3 {\n",
       "  background-color: #016e2d;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row2_col4 {\n",
       "  background-color: #2c944c;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row2_col5 {\n",
       "  background-color: #16803c;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row3_col0 {\n",
       "  background-color: #278f48;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row3_col2, #T_00f36_row7_col5 {\n",
       "  background-color: #2b934b;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row3_col3 {\n",
       "  background-color: #004e1f;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row3_col4 {\n",
       "  background-color: #37a055;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row4_col0 {\n",
       "  background-color: #3ea75a;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row4_col1 {\n",
       "  background-color: #00692a;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row4_col2 {\n",
       "  background-color: #68be70;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_00f36_row4_col3 {\n",
       "  background-color: #208843;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row4_col4 {\n",
       "  background-color: #329b51;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row4_col5, #T_00f36_row7_col1 {\n",
       "  background-color: #0c7735;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row5_col0, #T_00f36_row5_col1, #T_00f36_row5_col2, #T_00f36_row5_col3, #T_00f36_row5_col4, #T_00f36_row5_col5 {\n",
       "  background-color: #f7fcf5;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_00f36_row6_col0, #T_00f36_row9_col3 {\n",
       "  background-color: #218944;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row6_col1, #T_00f36_row7_col4 {\n",
       "  background-color: #05712f;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row6_col2 {\n",
       "  background-color: #5eb96b;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row6_col4 {\n",
       "  background-color: #58b668;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row6_col5 {\n",
       "  background-color: #097532;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row7_col2 {\n",
       "  background-color: #4bb062;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row8_col0 {\n",
       "  background-color: #1a843f;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row8_col1 {\n",
       "  background-color: #087432;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row8_col2 {\n",
       "  background-color: #43ac5e;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row8_col3 {\n",
       "  background-color: #248c46;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row8_col4 {\n",
       "  background-color: #147e3a;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row9_col0 {\n",
       "  background-color: #2a924a;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row9_col1 {\n",
       "  background-color: #2f984f;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row9_col2 {\n",
       "  background-color: #52b365;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row9_col4 {\n",
       "  background-color: #309950;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row10_col0, #T_00f36_row16_col0 {\n",
       "  background-color: #005b25;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row10_col1 {\n",
       "  background-color: #026f2e;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row10_col2 {\n",
       "  background-color: #004e1f;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row10_col3 {\n",
       "  background-color: #329b51;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row10_col4 {\n",
       "  background-color: #00451c;\n",
       "  color: #f1f1f1;\n",
       "  text-decoration: underline;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row10_col5, #T_00f36_row18_col5 {\n",
       "  background-color: #006428;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row11_col0, #T_00f36_row11_col2, #T_00f36_row11_col4, #T_00f36_row15_col5, #T_00f36_row17_col1, #T_00f36_row17_col3 {\n",
       "  background-color: #00441b;\n",
       "  color: #f1f1f1;\n",
       "  font-weight: 700;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row11_col1 {\n",
       "  background-color: #004c1e;\n",
       "  color: #f1f1f1;\n",
       "  text-decoration: underline;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row11_col3 {\n",
       "  background-color: #18823d;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row11_col5 {\n",
       "  background-color: #005221;\n",
       "  color: #f1f1f1;\n",
       "  text-decoration: underline;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row12_col0 {\n",
       "  background-color: #0c7735;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row12_col1 {\n",
       "  background-color: #0b7734;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row12_col2, #T_00f36_row14_col0 {\n",
       "  background-color: #087432;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row12_col3 {\n",
       "  background-color: #258d47;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row12_col4 {\n",
       "  background-color: #1c8540;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row12_col5 {\n",
       "  background-color: #248c46;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row13_col0 {\n",
       "  background-color: #005b25;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row13_col1 {\n",
       "  background-color: #005e26;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row13_col2 {\n",
       "  background-color: #005f26;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row13_col3 {\n",
       "  background-color: #026f2e;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row13_col4 {\n",
       "  background-color: #117b38;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row13_col5, #T_00f36_row15_col0 {\n",
       "  background-color: #005924;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row14_col1, #T_00f36_row14_col2 {\n",
       "  background-color: #097532;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row14_col3 {\n",
       "  background-color: #228a44;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row14_col4 {\n",
       "  background-color: #05712f;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row14_col5 {\n",
       "  background-color: #218944;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row15_col1 {\n",
       "  background-color: #004e1f;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row15_col2 {\n",
       "  background-color: #006227;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row15_col3 {\n",
       "  background-color: #006428;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row15_col4 {\n",
       "  background-color: #005c25;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row16_col1 {\n",
       "  background-color: #005924;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row16_col2 {\n",
       "  background-color: #00682a;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row16_col3 {\n",
       "  background-color: #005a24;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row16_col4, #T_00f36_row18_col4 {\n",
       "  background-color: #005221;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row16_col5 {\n",
       "  background-color: #006027;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row17_col0 {\n",
       "  background-color: #005120;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row17_col2, #T_00f36_row17_col5 {\n",
       "  background-color: #005723;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row17_col4 {\n",
       "  background-color: #00481d;\n",
       "  color: #f1f1f1;\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_00f36_row18_col0 {\n",
       "  background-color: #005020;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row18_col1 {\n",
       "  background-color: #005f26;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row18_col2 {\n",
       "  background-color: #005120;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row18_col3 {\n",
       "  background-color: #005c25;\n",
       "  color: #f1f1f1;\n",
       "  border-top: 3px solid black;\n",
       "}\n",
       "#T_00f36_row19_col0, #T_00f36_row19_col3 {\n",
       "  background-color: #00481d;\n",
       "  color: #f1f1f1;\n",
       "  text-decoration: underline;\n",
       "}\n",
       "#T_00f36_row19_col1 {\n",
       "  background-color: #005020;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row19_col2 {\n",
       "  background-color: #00441b;\n",
       "  color: #f1f1f1;\n",
       "  text-decoration: underline;\n",
       "}\n",
       "#T_00f36_row19_col4 {\n",
       "  background-color: #005221;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_00f36_row19_col5 {\n",
       "  background-color: #005e26;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_00f36\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_00f36_level0_col0\" class=\"col_heading level0 col0\" >\\multirowcell{Llama 3.1 8B\\\\base}</th>\n",
       "      <th id=\"T_00f36_level0_col1\" class=\"col_heading level0 col1\" >\\multirowcell{Llama 3.1 8B\\\\instruct}</th>\n",
       "      <th id=\"T_00f36_level0_col2\" class=\"col_heading level0 col2\" >\\multirowcell{Gemma 3 4B\\\\base}</th>\n",
       "      <th id=\"T_00f36_level0_col3\" class=\"col_heading level0 col3\" >\\multirowcell{Gemma 3 4B\\\\instruct}</th>\n",
       "      <th id=\"T_00f36_level0_col4\" class=\"col_heading level0 col4\" >\\multirowcell{Qwen 3 8B\\\\base}</th>\n",
       "      <th id=\"T_00f36_level0_col5\" class=\"col_heading level0 col5\" >\\multirowcell{Qwen 3 8B\\\\instruct}</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row0\" class=\"row_heading level0 row0\" >MSP</th>\n",
       "      <td id=\"T_00f36_row0_col0\" class=\"data row0 col0\" >0.410</td>\n",
       "      <td id=\"T_00f36_row0_col1\" class=\"data row0 col1\" >0.344</td>\n",
       "      <td id=\"T_00f36_row0_col2\" class=\"data row0 col2\" >0.471</td>\n",
       "      <td id=\"T_00f36_row0_col3\" class=\"data row0 col3\" >0.292</td>\n",
       "      <td id=\"T_00f36_row0_col4\" class=\"data row0 col4\" >0.376</td>\n",
       "      <td id=\"T_00f36_row0_col5\" class=\"data row0 col5\" >0.289</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row1\" class=\"row_heading level0 row1\" >MTE</th>\n",
       "      <td id=\"T_00f36_row1_col0\" class=\"data row1 col0\" >0.422</td>\n",
       "      <td id=\"T_00f36_row1_col1\" class=\"data row1 col1\" >0.364</td>\n",
       "      <td id=\"T_00f36_row1_col2\" class=\"data row1 col2\" >0.476</td>\n",
       "      <td id=\"T_00f36_row1_col3\" class=\"data row1 col3\" >0.317</td>\n",
       "      <td id=\"T_00f36_row1_col4\" class=\"data row1 col4\" >0.407</td>\n",
       "      <td id=\"T_00f36_row1_col5\" class=\"data row1 col5\" >0.297</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row2\" class=\"row_heading level0 row2\" >Perplexity</th>\n",
       "      <td id=\"T_00f36_row2_col0\" class=\"data row2 col0\" >0.452</td>\n",
       "      <td id=\"T_00f36_row2_col1\" class=\"data row2 col1\" >0.323</td>\n",
       "      <td id=\"T_00f36_row2_col2\" class=\"data row2 col2\" >0.525</td>\n",
       "      <td id=\"T_00f36_row2_col3\" class=\"data row2 col3\" >0.288</td>\n",
       "      <td id=\"T_00f36_row2_col4\" class=\"data row2 col4\" >0.372</td>\n",
       "      <td id=\"T_00f36_row2_col5\" class=\"data row2 col5\" >0.276</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row3\" class=\"row_heading level0 row3\" >CCP</th>\n",
       "      <td id=\"T_00f36_row3_col0\" class=\"data row3 col0\" >0.401</td>\n",
       "      <td id=\"T_00f36_row3_col1\" class=\"data row3 col1\" >0.364</td>\n",
       "      <td id=\"T_00f36_row3_col2\" class=\"data row3 col2\" >0.492</td>\n",
       "      <td id=\"T_00f36_row3_col3\" class=\"data row3 col3\" >0.331</td>\n",
       "      <td id=\"T_00f36_row3_col4\" class=\"data row3 col4\" >0.355</td>\n",
       "      <td id=\"T_00f36_row3_col5\" class=\"data row3 col5\" >0.291</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row4\" class=\"row_heading level0 row4\" >SAR</th>\n",
       "      <td id=\"T_00f36_row4_col0\" class=\"data row4 col0\" >0.352</td>\n",
       "      <td id=\"T_00f36_row4_col1\" class=\"data row4 col1\" >0.385</td>\n",
       "      <td id=\"T_00f36_row4_col2\" class=\"data row4 col2\" >0.386</td>\n",
       "      <td id=\"T_00f36_row4_col3\" class=\"data row4 col3\" >0.239</td>\n",
       "      <td id=\"T_00f36_row4_col4\" class=\"data row4 col4\" >0.363</td>\n",
       "      <td id=\"T_00f36_row4_col5\" class=\"data row4 col5\" >0.292</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row5\" class=\"row_heading level0 row5\" >P(True)</th>\n",
       "      <td id=\"T_00f36_row5_col0\" class=\"data row5 col0\" >0.015</td>\n",
       "      <td id=\"T_00f36_row5_col1\" class=\"data row5 col1\" >0.072</td>\n",
       "      <td id=\"T_00f36_row5_col2\" class=\"data row5 col2\" >0.093</td>\n",
       "      <td id=\"T_00f36_row5_col3\" class=\"data row5 col3\" >-0.096</td>\n",
       "      <td id=\"T_00f36_row5_col4\" class=\"data row5 col4\" >0.110</td>\n",
       "      <td id=\"T_00f36_row5_col5\" class=\"data row5 col5\" >-0.114</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row6\" class=\"row_heading level0 row6\" >SemanticEntropy</th>\n",
       "      <td id=\"T_00f36_row6_col0\" class=\"data row6 col0\" >0.414</td>\n",
       "      <td id=\"T_00f36_row6_col1\" class=\"data row6 col1\" >0.376</td>\n",
       "      <td id=\"T_00f36_row6_col2\" class=\"data row6 col2\" >0.401</td>\n",
       "      <td id=\"T_00f36_row6_col3\" class=\"data row6 col3\" >0.293</td>\n",
       "      <td id=\"T_00f36_row6_col4\" class=\"data row6 col4\" >0.319</td>\n",
       "      <td id=\"T_00f36_row6_col5\" class=\"data row6 col5\" >0.299</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row7\" class=\"row_heading level0 row7\" >Lexical Similarity</th>\n",
       "      <td id=\"T_00f36_row7_col0\" class=\"data row7 col0\" >0.411</td>\n",
       "      <td id=\"T_00f36_row7_col1\" class=\"data row7 col1\" >0.366</td>\n",
       "      <td id=\"T_00f36_row7_col2\" class=\"data row7 col2\" >0.426</td>\n",
       "      <td id=\"T_00f36_row7_col3\" class=\"data row7 col3\" >0.247</td>\n",
       "      <td id=\"T_00f36_row7_col4\" class=\"data row7 col4\" >0.425</td>\n",
       "      <td id=\"T_00f36_row7_col5\" class=\"data row7 col5\" >0.237</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row8\" class=\"row_heading level0 row8\" >EigValLaplacian</th>\n",
       "      <td id=\"T_00f36_row8_col0\" class=\"data row8 col0\" >0.426</td>\n",
       "      <td id=\"T_00f36_row8_col1\" class=\"data row8 col1\" >0.371</td>\n",
       "      <td id=\"T_00f36_row8_col2\" class=\"data row8 col2\" >0.437</td>\n",
       "      <td id=\"T_00f36_row8_col3\" class=\"data row8 col3\" >0.233</td>\n",
       "      <td id=\"T_00f36_row8_col4\" class=\"data row8 col4\" >0.406</td>\n",
       "      <td id=\"T_00f36_row8_col5\" class=\"data row8 col5\" >0.265</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row9\" class=\"row_heading level0 row9\" >NumSemSets</th>\n",
       "      <td id=\"T_00f36_row9_col0\" class=\"data row9 col0\" >0.396</td>\n",
       "      <td id=\"T_00f36_row9_col1\" class=\"data row9 col1\" >0.319</td>\n",
       "      <td id=\"T_00f36_row9_col2\" class=\"data row9 col2\" >0.418</td>\n",
       "      <td id=\"T_00f36_row9_col3\" class=\"data row9 col3\" >0.238</td>\n",
       "      <td id=\"T_00f36_row9_col4\" class=\"data row9 col4\" >0.365</td>\n",
       "      <td id=\"T_00f36_row9_col5\" class=\"data row9 col5\" >0.253</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row10\" class=\"row_heading level0 row10\" >Dissimilarity</th>\n",
       "      <td id=\"T_00f36_row10_col0\" class=\"data row10 col0\" >0.505</td>\n",
       "      <td id=\"T_00f36_row10_col1\" class=\"data row10 col1\" >0.379</td>\n",
       "      <td id=\"T_00f36_row10_col2\" class=\"data row10 col2\" >0.630</td>\n",
       "      <td id=\"T_00f36_row10_col3\" class=\"data row10 col3\" >0.206</td>\n",
       "      <td id=\"T_00f36_row10_col4\" class=\"data row10 col4\" >0.477</td>\n",
       "      <td id=\"T_00f36_row10_col5\" class=\"data row10 col5\" >0.327</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row11\" class=\"row_heading level0 row11\" >Dissimilarity + beamsearch</th>\n",
       "      <td id=\"T_00f36_row11_col0\" class=\"data row11 col0\" >0.543</td>\n",
       "      <td id=\"T_00f36_row11_col1\" class=\"data row11 col1\" >0.417</td>\n",
       "      <td id=\"T_00f36_row11_col2\" class=\"data row11 col2\" >0.650</td>\n",
       "      <td id=\"T_00f36_row11_col3\" class=\"data row11 col3\" >0.252</td>\n",
       "      <td id=\"T_00f36_row11_col4\" class=\"data row11 col4\" >0.478</td>\n",
       "      <td id=\"T_00f36_row11_col5\" class=\"data row11 col5\" >0.355</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row12\" class=\"row_heading level0 row12\" >Eccentricity</th>\n",
       "      <td id=\"T_00f36_row12_col0\" class=\"data row12 col0\" >0.453</td>\n",
       "      <td id=\"T_00f36_row12_col1\" class=\"data row12 col1\" >0.368</td>\n",
       "      <td id=\"T_00f36_row12_col2\" class=\"data row12 col2\" >0.563</td>\n",
       "      <td id=\"T_00f36_row12_col3\" class=\"data row12 col3\" >0.231</td>\n",
       "      <td id=\"T_00f36_row12_col4\" class=\"data row12 col4\" >0.396</td>\n",
       "      <td id=\"T_00f36_row12_col5\" class=\"data row12 col5\" >0.251</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row13\" class=\"row_heading level0 row13\" >Eccentricity + beamsearch</th>\n",
       "      <td id=\"T_00f36_row13_col0\" class=\"data row13 col0\" >0.505</td>\n",
       "      <td id=\"T_00f36_row13_col1\" class=\"data row13 col1\" >0.397</td>\n",
       "      <td id=\"T_00f36_row13_col2\" class=\"data row13 col2\" >0.603</td>\n",
       "      <td id=\"T_00f36_row13_col3\" class=\"data row13 col3\" >0.285</td>\n",
       "      <td id=\"T_00f36_row13_col4\" class=\"data row13 col4\" >0.410</td>\n",
       "      <td id=\"T_00f36_row13_col5\" class=\"data row13 col5\" >0.345</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row14\" class=\"row_heading level0 row14\" >EigVecDissimilarity</th>\n",
       "      <td id=\"T_00f36_row14_col0\" class=\"data row14 col0\" >0.463</td>\n",
       "      <td id=\"T_00f36_row14_col1\" class=\"data row14 col1\" >0.370</td>\n",
       "      <td id=\"T_00f36_row14_col2\" class=\"data row14 col2\" >0.561</td>\n",
       "      <td id=\"T_00f36_row14_col3\" class=\"data row14 col3\" >0.236</td>\n",
       "      <td id=\"T_00f36_row14_col4\" class=\"data row14 col4\" >0.425</td>\n",
       "      <td id=\"T_00f36_row14_col5\" class=\"data row14 col5\" >0.256</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row15\" class=\"row_heading level0 row15\" >EigVecDissimilarity + beamsearch</th>\n",
       "      <td id=\"T_00f36_row15_col0\" class=\"data row15 col0\" >0.510</td>\n",
       "      <td id=\"T_00f36_row15_col1\" class=\"data row15 col1\" >0.414</td>\n",
       "      <td id=\"T_00f36_row15_col2\" class=\"data row15 col2\" >0.598</td>\n",
       "      <td id=\"T_00f36_row15_col3\" class=\"data row15 col3\" >0.301</td>\n",
       "      <td id=\"T_00f36_row15_col4\" class=\"data row15 col4\" >0.450</td>\n",
       "      <td id=\"T_00f36_row15_col5\" class=\"data row15 col5\" >0.376</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row16\" class=\"row_heading level0 row16\" >CocoaMSP</th>\n",
       "      <td id=\"T_00f36_row16_col0\" class=\"data row16 col0\" >0.505</td>\n",
       "      <td id=\"T_00f36_row16_col1\" class=\"data row16 col1\" >0.404</td>\n",
       "      <td id=\"T_00f36_row16_col2\" class=\"data row16 col2\" >0.587</td>\n",
       "      <td id=\"T_00f36_row16_col3\" class=\"data row16 col3\" >0.314</td>\n",
       "      <td id=\"T_00f36_row16_col4\" class=\"data row16 col4\" >0.461</td>\n",
       "      <td id=\"T_00f36_row16_col5\" class=\"data row16 col5\" >0.334</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row17\" class=\"row_heading level0 row17\" >CocoaMSP + beamsearch</th>\n",
       "      <td id=\"T_00f36_row17_col0\" class=\"data row17 col0\" >0.521</td>\n",
       "      <td id=\"T_00f36_row17_col1\" class=\"data row17 col1\" >0.426</td>\n",
       "      <td id=\"T_00f36_row17_col2\" class=\"data row17 col2\" >0.615</td>\n",
       "      <td id=\"T_00f36_row17_col3\" class=\"data row17 col3\" >0.345</td>\n",
       "      <td id=\"T_00f36_row17_col4\" class=\"data row17 col4\" >0.473</td>\n",
       "      <td id=\"T_00f36_row17_col5\" class=\"data row17 col5\" >0.347</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row18\" class=\"row_heading level0 row18\" >CocoaPPL</th>\n",
       "      <td id=\"T_00f36_row18_col0\" class=\"data row18 col0\" >0.523</td>\n",
       "      <td id=\"T_00f36_row18_col1\" class=\"data row18 col1\" >0.397</td>\n",
       "      <td id=\"T_00f36_row18_col2\" class=\"data row18 col2\" >0.628</td>\n",
       "      <td id=\"T_00f36_row18_col3\" class=\"data row18 col3\" >0.312</td>\n",
       "      <td id=\"T_00f36_row18_col4\" class=\"data row18 col4\" >0.461</td>\n",
       "      <td id=\"T_00f36_row18_col5\" class=\"data row18 col5\" >0.327</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_00f36_level0_row19\" class=\"row_heading level0 row19\" >CocoaPPL + beamsearch</th>\n",
       "      <td id=\"T_00f36_row19_col0\" class=\"data row19 col0\" >0.536</td>\n",
       "      <td id=\"T_00f36_row19_col1\" class=\"data row19 col1\" >0.412</td>\n",
       "      <td id=\"T_00f36_row19_col2\" class=\"data row19 col2\" >0.649</td>\n",
       "      <td id=\"T_00f36_row19_col3\" class=\"data row19 col3\" >0.339</td>\n",
       "      <td id=\"T_00f36_row19_col4\" class=\"data row19 col4\" >0.461</td>\n",
       "      <td id=\"T_00f36_row19_col5\" class=\"data row19 col5\" >0.337</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x1492c4bfa550>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df_means_prr = build_prr_models_means(\n",
    "    mans_by_dataset,\n",
    "    dataset_order=datasets,\n",
    "    model_titles=[\n",
    "        r'\\multirowcell{Llama 3.1 8B\\\\base}',\n",
    "        r'\\multirowcell{Llama 3.1 8B\\\\instruct}',\n",
    "        r'\\multirowcell{Gemma 3 4B\\\\base}',\n",
    "        r'\\multirowcell{Gemma 3 4B\\\\instruct}',\n",
    "        r'\\multirowcell{Qwen 3 8B\\\\base}',\n",
    "        r'\\multirowcell{Qwen 3 8B\\\\instruct}',\n",
    "    ],\n",
    "    uq_methods=uq_methods,\n",
    "    quality_metric='AlignScore',   # or 'AlignScore>0.5'\n",
    "    filters_by_dataset={\n",
    "        'webq': lambda s: len(s['greedy_tokens']) <= 5,\n",
    "        'arcchallenge': lambda s: len(s['greedy_tokens']) <= 5,\n",
    "    },\n",
    "    prr_threshold=0.5,\n",
    ")\n",
    "\n",
    "show_model_means(\n",
    "    df_means_prr, uq_methods,\n",
    "    title='Mean PRR across datasets (columns = models)',\n",
    "    precision=3,\n",
    "    method_rename=uq_rename_fn,\n",
    "    add_beam_suffix=True,\n",
    "    lower_is_better=False,   # PRR is higher-better\n",
    ")\n",
    "\n",
    "_ = print_latex_model_means(\n",
    "    df_means_prr, uq_methods,\n",
    "    precision=3,\n",
    "    method_rename=uq_rename_fn,\n",
    "    add_beam_suffix=True,\n",
    "    lower_is_better=False,\n",
    "    save_path='latex_tables/qa_mean_prr.tex',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2cf6a362-ad19-4ca1-98af-d49b2118b2cc",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
