{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6498cb42-df65-4dad-8c62-224835f86aa7",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/envs/uqlm_test2/lib/python3.12/site-packages/google/cloud/aiplatform/models.py:52: FutureWarning: Support for google-cloud-storage < 3.0.0 will be removed in a future version of google-cloud-aiplatform. Please upgrade to google-cloud-storage >= 3.0.0.\n",
      "  from google.cloud.aiplatform.utils import gcs_utils\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "from utils import (\n",
    "    load_llms,\n",
    "    create_progress_bar,\n",
    "    generate_responses_and_samples,\n",
    "    decompose_claims,\n",
    "    decompose_sentences,\n",
    "    grade_claims,\n",
    "    evaluate_objectivity,\n",
    "    merge_claims,\n",
    "    score_unit_response,\n",
    "    score_matched_unit,\n",
    "    score_unit_qa,\n",
    "    score_graph_uq,\n",
    "    compute_metrics,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "52992b7c-d9be-4bc8-adc3-f8dceceac73e",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Load data and set up LLMs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a80e76d0-aeae-4bee-bba2-293857049295",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Number of leading rows kept from each dataset (small, for illustrative use)\n",
    "SAMPLE_SIZE = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "fd38dc7f-4953-45de-9508-d208238d5258",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>entity</th>\n",
       "      <th>question</th>\n",
       "      <th>answer</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Suthida</td>\n",
       "      <td>Tell me a bio of Suthida within 100 words.\\n</td>\n",
       "      <td>Suthida Bajrasudhabimalalakshana (Thai: สมเด็จ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Miguel Ángel Félix Gallardo</td>\n",
       "      <td>Tell me a bio of Miguel Ángel Félix Gallardo w...</td>\n",
       "      <td>Miguel Ángel Félix Gallardo (born January 8, 1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Iggy Azalea</td>\n",
       "      <td>Tell me a bio of Iggy Azalea within 100 words.\\n</td>\n",
       "      <td>Amethyst Amelia Kelly (born 7 June 1990), know...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Fernando da Costa Novaes</td>\n",
       "      <td>Tell me a bio of Fernando da Costa Novaes with...</td>\n",
       "      <td>Fernando da Costa Novaes (April 6, 1927 – Marc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Jan Zamoyski</td>\n",
       "      <td>Tell me a bio of Jan Zamoyski within 100 words.\\n</td>\n",
       "      <td>Jan Sariusz Zamoyski (Latin: Ioannes Zamoyski ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        entity  \\\n",
       "0                      Suthida   \n",
       "1  Miguel Ángel Félix Gallardo   \n",
       "2                  Iggy Azalea   \n",
       "3     Fernando da Costa Novaes   \n",
       "4                 Jan Zamoyski   \n",
       "\n",
       "                                            question  \\\n",
       "0       Tell me a bio of Suthida within 100 words.\\n   \n",
       "1  Tell me a bio of Miguel Ángel Félix Gallardo w...   \n",
       "2   Tell me a bio of Iggy Azalea within 100 words.\\n   \n",
       "3  Tell me a bio of Fernando da Costa Novaes with...   \n",
       "4  Tell me a bio of Jan Zamoyski within 100 words.\\n   \n",
       "\n",
       "                                              answer  \n",
       "0  Suthida Bajrasudhabimalalakshana (Thai: สมเด็จ...  \n",
       "1  Miguel Ángel Félix Gallardo (born January 8, 1...  \n",
       "2  Amethyst Amelia Kelly (born 7 June 1990), know...  \n",
       "3  Fernando da Costa Novaes (April 6, 1927 – Marc...  \n",
       "4  Jan Sariusz Zamoyski (Latin: Ioannes Zamoyski ...  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "BIO_PATH = \"./bio\"\n",
    "\n",
    "# Keep the entity, prompt, and Wikipedia-text columns, expose the latter two as\n",
    "# \"question\" / \"answer\", then truncate to the first SAMPLE_SIZE rows.\n",
    "factscore_bio = (\n",
    "    pd.read_parquet(f\"{BIO_PATH}/factscore_bio.parquet\")[[\"entity\", \"hundredw_prompt\", \"wikipedia_text\"]]\n",
    "    .rename(columns={\"hundredw_prompt\": \"question\", \"wikipedia_text\": \"answer\"})\n",
    "    .head(SAMPLE_SIZE)\n",
    ")\n",
    "factscore_bio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "39859bee-bffd-4ba9-a175-2a0e4f82cb50",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>entity</th>\n",
       "      <th>answer</th>\n",
       "      <th>length</th>\n",
       "      <th>question</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>nerves</td>\n",
       "      <td>Nasopalatine nerve</td>\n",
       "      <td>The nasopalatine nerve (also Scarpa's nerve or...</td>\n",
       "      <td>2239</td>\n",
       "      <td>Provide me with a paragraph detailing some fac...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>nerves</td>\n",
       "      <td>Deep branch of the radial nerve</td>\n",
       "      <td>The radial nerve divides into a superficial (s...</td>\n",
       "      <td>2260</td>\n",
       "      <td>Provide me with a paragraph detailing some fac...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>nerves</td>\n",
       "      <td>Lesser occipital nerve</td>\n",
       "      <td>The lesser occipital nerve (or small occipital...</td>\n",
       "      <td>2270</td>\n",
       "      <td>Provide me with a paragraph detailing some fac...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>nerves</td>\n",
       "      <td>Lateral cutaneous nerve of forearm</td>\n",
       "      <td>The lateral  cutaneous nerve of forearm (or la...</td>\n",
       "      <td>2278</td>\n",
       "      <td>Provide me with a paragraph detailing some fac...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>nerves</td>\n",
       "      <td>Obturator nerve</td>\n",
       "      <td>The obturator nerve in human anatomy arises fr...</td>\n",
       "      <td>2278</td>\n",
       "      <td>Provide me with a paragraph detailing some fac...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  category                              entity  \\\n",
       "0   nerves                  Nasopalatine nerve   \n",
       "1   nerves     Deep branch of the radial nerve   \n",
       "2   nerves              Lesser occipital nerve   \n",
       "3   nerves  Lateral cutaneous nerve of forearm   \n",
       "4   nerves                     Obturator nerve   \n",
       "\n",
       "                                              answer  length  \\\n",
       "0  The nasopalatine nerve (also Scarpa's nerve or...    2239   \n",
       "1  The radial nerve divides into a superficial (s...    2260   \n",
       "2  The lesser occipital nerve (or small occipital...    2270   \n",
       "3  The lateral  cutaneous nerve of forearm (or la...    2278   \n",
       "4  The obturator nerve in human anatomy arises fr...    2278   \n",
       "\n",
       "                                            question  \n",
       "0  Provide me with a paragraph detailing some fac...  \n",
       "1  Provide me with a paragraph detailing some fac...  \n",
       "2  Provide me with a paragraph detailing some fac...  \n",
       "3  Provide me with a paragraph detailing some fac...  \n",
       "4  Provide me with a paragraph detailing some fac...  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "STEM_GEO_PATH = \"./stem_geo\"\n",
    "\n",
    "# Rename the prompt and Wikipedia-text columns to \"question\" / \"answer\" and keep the\n",
    "# first SAMPLE_SIZE rows; every other column is left as loaded.\n",
    "factscore_stem_geo = (\n",
    "    pd.read_parquet(f\"{STEM_GEO_PATH}/factscore_stem_geo.parquet\")\n",
    "    .rename(columns={\"prompts\": \"question\", \"wikipedia_text\": \"answer\"})\n",
    "    .head(SAMPLE_SIZE)\n",
    ")\n",
    "factscore_stem_geo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "61931641-9ca8-457e-acad-9c76e2aafc6a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Map each dataset's directory path to its truncated DataFrame\n",
    "datasets = {BIO_PATH: factscore_bio, STEM_GEO_PATH: factscore_stem_geo}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8ffbe882-fdce-488f-927f-9afb9674b5ee",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "gpt4o, gpt4o_mini, gemini_pro, gemini_flash = load_llms()\n",
    "\n",
    "# Models to run through the pipeline, keyed by name; uncomment an entry to include it.\n",
    "llm_dict = {\n",
    "    \"gemini_flash\": gemini_flash,\n",
    "    # \"gemini_pro\": gemini_pro,\n",
    "    # \"gpt4o_mini\": gpt4o_mini,\n",
    "    # \"gpt4o\": gpt4o,\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "df309293-ccf7-48c2-acc9-00562b369763",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Generate LLM responses, sampled responses, and score with full-response black-box UQ (short-form)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7cac86b2-5726-467c-9537-a6c8e5a9017e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5d654a6c6de94c8c8a635df7af5a22bc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Output()"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
      ],
      "text/plain": []
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
      ],
      "text/plain": []
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5791af25fce9426ea0b965eb2ed2fa19",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Output()"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
      ],
      "text/plain": []
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
      ],
      "text/plain": []
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Run response generation for each dataset, passing its directory as write_path\n",
    "for path, dataset in datasets.items():\n",
    "    await generate_responses_and_samples(\n",
    "        dataset=dataset,\n",
    "        llm_dict=llm_dict,\n",
    "        write_path=path,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "693e2d47-a40a-4efb-9ca7-9fd64dd1bd75",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Decompose responses and sampled responses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "656996df-c72b-4ae5-a4e5-6129f7e9804f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Only the dataset paths are needed here: neither helper takes the DataFrame\n",
    "for path in datasets:\n",
    "    await decompose_claims(llm_dict=llm_dict, path=path)\n",
    "    decompose_sentences(llm_dict=llm_dict, path=path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4fca66e8-f4c3-4bcc-8d6d-c60157187782",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Grade claims and sentences, and evaluate claim objectivity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "5692335f-fc88-4e6e-8b82-f96a580bdf77",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "for path, dataset in datasets.items():\n",
    "    # Grade the decomposed units at both granularities\n",
    "    for granularity in (\"claim\", \"sentence\"):\n",
    "        await grade_claims(\n",
    "            llm_dict=llm_dict,\n",
    "            dataset=dataset,\n",
    "            granularity=granularity,\n",
    "            path=path,\n",
    "        )\n",
    "    # Objectivity is evaluated at claim granularity only\n",
    "    await evaluate_objectivity(llm_dict=llm_dict, granularity=\"claim\", path=path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9550c2b8-8e51-4eb3-b876-f607f83af26d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>responses</th>\n",
       "      <th>sampled_responses</th>\n",
       "      <th>prompts</th>\n",
       "      <th>semantic_negentropy</th>\n",
       "      <th>noncontradiction</th>\n",
       "      <th>exact_match</th>\n",
       "      <th>cosine_sim</th>\n",
       "      <th>bert_score</th>\n",
       "      <th>claims</th>\n",
       "      <th>sampled_claims</th>\n",
       "      <th>sentences</th>\n",
       "      <th>sampled_sentences</th>\n",
       "      <th>claim_grades</th>\n",
       "      <th>sentence_grades</th>\n",
       "      <th>claim_objectivity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Suthida Bajrasudhabimalalakshana is the curren...</td>\n",
       "      <td>[Queen Suthida Bajrasudhabimalalakshana, born ...</td>\n",
       "      <td>Tell me a bio of Suthida within 100 words.\\n</td>\n",
       "      <td>0.613154</td>\n",
       "      <td>0.989313</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.969457</td>\n",
       "      <td>0.935116</td>\n",
       "      <td>[Suthida Bajrasudhabimalalakshana is the curre...</td>\n",
       "      <td>[[Queen Suthida Bajrasudhabimalalakshana was b...</td>\n",
       "      <td>[Suthida Bajrasudhabimalalakshana is the curre...</td>\n",
       "      <td>[[Queen Suthida Bajrasudhabimalalakshana, born...</td>\n",
       "      <td>[True, True, False, True, True, True, False, F...</td>\n",
       "      <td>[True, True, True, True, False]</td>\n",
       "      <td>[True, True, True, True, True, True, True, Tru...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Miguel Ángel Félix Gallardo, \"El Padrino,\" was...</td>\n",
       "      <td>[Miguel Ángel Félix Gallardo, known as \"El Pad...</td>\n",
       "      <td>Tell me a bio of Miguel Ángel Félix Gallardo w...</td>\n",
       "      <td>0.586186</td>\n",
       "      <td>0.997167</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.974601</td>\n",
       "      <td>0.927233</td>\n",
       "      <td>[Miguel Ángel Félix Gallardo was \"El Padrino\"....</td>\n",
       "      <td>[[Miguel Ángel Félix Gallardo is known as \"El ...</td>\n",
       "      <td>[Miguel Ángel Félix Gallardo, \"El Padrino,\" wa...</td>\n",
       "      <td>[[Miguel Ángel Félix Gallardo, known as \"El Pa...</td>\n",
       "      <td>[True, True, True, True, True, True, True, Tru...</td>\n",
       "      <td>[True, True, True, False]</td>\n",
       "      <td>[True, False, True, True, True, True, False, T...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Iggy Azalea, born Amethyst Amelia Kelly, is an...</td>\n",
       "      <td>[Amethyst Amelia Kelly, known as Iggy Azalea, ...</td>\n",
       "      <td>Tell me a bio of Iggy Azalea within 100 words.\\n</td>\n",
       "      <td>0.387907</td>\n",
       "      <td>0.998009</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.961517</td>\n",
       "      <td>0.936799</td>\n",
       "      <td>[Iggy Azalea was born Amethyst Amelia Kelly., ...</td>\n",
       "      <td>[[Amethyst Amelia Kelly is known as Iggy Azale...</td>\n",
       "      <td>[Iggy Azalea, born Amethyst Amelia Kelly, is a...</td>\n",
       "      <td>[[Amethyst Amelia Kelly, known as Iggy Azalea,...</td>\n",
       "      <td>[True, True, True, False, True, True, True, Tr...</td>\n",
       "      <td>[False, True, True, False, False, False]</td>\n",
       "      <td>[True, True, True, True, True, True, True, Tru...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Fernando da Costa Novaes (1942-2004) was a dis...</td>\n",
       "      <td>[Fernando da Costa Novaes is a distinguished B...</td>\n",
       "      <td>Tell me a bio of Fernando da Costa Novaes with...</td>\n",
       "      <td>0.484262</td>\n",
       "      <td>0.406326</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.951422</td>\n",
       "      <td>0.921762</td>\n",
       "      <td>[Fernando da Costa Novaes was born in 1942., F...</td>\n",
       "      <td>[[Fernando da Costa Novaes is a historian., Fe...</td>\n",
       "      <td>[Fernando da Costa Novaes (1942-2004) was a di...</td>\n",
       "      <td>[[Fernando da Costa Novaes is a distinguished ...</td>\n",
       "      <td>[False, True, False, False, False, False, Fals...</td>\n",
       "      <td>[False, False, False, False]</td>\n",
       "      <td>[True, True, True, False, True, False, False, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Jan Zamoyski (1542–1605) was a preeminent Poli...</td>\n",
       "      <td>[Jan Zamoyski (1542–1605) was a towering figur...</td>\n",
       "      <td>Tell me a bio of Jan Zamoyski within 100 words.\\n</td>\n",
       "      <td>0.710935</td>\n",
       "      <td>0.998347</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.965984</td>\n",
       "      <td>0.931907</td>\n",
       "      <td>[Jan Zamoyski was born in 1542., Jan Zamoyski ...</td>\n",
       "      <td>[[Jan Zamoyski was born in 1542., Jan Zamoyski...</td>\n",
       "      <td>[Jan Zamoyski (1542–1605) was a preeminent Pol...</td>\n",
       "      <td>[[Jan Zamoyski (1542–1605) was a towering figu...</td>\n",
       "      <td>[True, True, True, True, True, True, True, Tru...</td>\n",
       "      <td>[True, True, True, True, False]</td>\n",
       "      <td>[True, True, True, True, False, True, True, Tr...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           responses  \\\n",
       "0  Suthida Bajrasudhabimalalakshana is the curren...   \n",
       "1  Miguel Ángel Félix Gallardo, \"El Padrino,\" was...   \n",
       "2  Iggy Azalea, born Amethyst Amelia Kelly, is an...   \n",
       "3  Fernando da Costa Novaes (1942-2004) was a dis...   \n",
       "4  Jan Zamoyski (1542–1605) was a preeminent Poli...   \n",
       "\n",
       "                                   sampled_responses  \\\n",
       "0  [Queen Suthida Bajrasudhabimalalakshana, born ...   \n",
       "1  [Miguel Ángel Félix Gallardo, known as \"El Pad...   \n",
       "2  [Amethyst Amelia Kelly, known as Iggy Azalea, ...   \n",
       "3  [Fernando da Costa Novaes is a distinguished B...   \n",
       "4  [Jan Zamoyski (1542–1605) was a towering figur...   \n",
       "\n",
       "                                             prompts  semantic_negentropy  \\\n",
       "0       Tell me a bio of Suthida within 100 words.\\n             0.613154   \n",
       "1  Tell me a bio of Miguel Ángel Félix Gallardo w...             0.586186   \n",
       "2   Tell me a bio of Iggy Azalea within 100 words.\\n             0.387907   \n",
       "3  Tell me a bio of Fernando da Costa Novaes with...             0.484262   \n",
       "4  Tell me a bio of Jan Zamoyski within 100 words.\\n             0.710935   \n",
       "\n",
       "   noncontradiction  exact_match  cosine_sim  bert_score  \\\n",
       "0          0.989313          0.0    0.969457    0.935116   \n",
       "1          0.997167          0.0    0.974601    0.927233   \n",
       "2          0.998009          0.0    0.961517    0.936799   \n",
       "3          0.406326          0.0    0.951422    0.921762   \n",
       "4          0.998347          0.0    0.965984    0.931907   \n",
       "\n",
       "                                              claims  \\\n",
       "0  [Suthida Bajrasudhabimalalakshana is the curre...   \n",
       "1  [Miguel Ángel Félix Gallardo was \"El Padrino\"....   \n",
       "2  [Iggy Azalea was born Amethyst Amelia Kelly., ...   \n",
       "3  [Fernando da Costa Novaes was born in 1942., F...   \n",
       "4  [Jan Zamoyski was born in 1542., Jan Zamoyski ...   \n",
       "\n",
       "                                      sampled_claims  \\\n",
       "0  [[Queen Suthida Bajrasudhabimalalakshana was b...   \n",
       "1  [[Miguel Ángel Félix Gallardo is known as \"El ...   \n",
       "2  [[Amethyst Amelia Kelly is known as Iggy Azale...   \n",
       "3  [[Fernando da Costa Novaes is a historian., Fe...   \n",
       "4  [[Jan Zamoyski was born in 1542., Jan Zamoyski...   \n",
       "\n",
       "                                           sentences  \\\n",
       "0  [Suthida Bajrasudhabimalalakshana is the curre...   \n",
       "1  [Miguel Ángel Félix Gallardo, \"El Padrino,\" wa...   \n",
       "2  [Iggy Azalea, born Amethyst Amelia Kelly, is a...   \n",
       "3  [Fernando da Costa Novaes (1942-2004) was a di...   \n",
       "4  [Jan Zamoyski (1542–1605) was a preeminent Pol...   \n",
       "\n",
       "                                   sampled_sentences  \\\n",
       "0  [[Queen Suthida Bajrasudhabimalalakshana, born...   \n",
       "1  [[Miguel Ángel Félix Gallardo, known as \"El Pa...   \n",
       "2  [[Amethyst Amelia Kelly, known as Iggy Azalea,...   \n",
       "3  [[Fernando da Costa Novaes is a distinguished ...   \n",
       "4  [[Jan Zamoyski (1542–1605) was a towering figu...   \n",
       "\n",
       "                                        claim_grades  \\\n",
       "0  [True, True, False, True, True, True, False, F...   \n",
       "1  [True, True, True, True, True, True, True, Tru...   \n",
       "2  [True, True, True, False, True, True, True, Tr...   \n",
       "3  [False, True, False, False, False, False, Fals...   \n",
       "4  [True, True, True, True, True, True, True, Tru...   \n",
       "\n",
       "                            sentence_grades  \\\n",
       "0           [True, True, True, True, False]   \n",
       "1                 [True, True, True, False]   \n",
       "2  [False, True, True, False, False, False]   \n",
       "3              [False, False, False, False]   \n",
       "4           [True, True, True, True, False]   \n",
       "\n",
       "                                   claim_objectivity  \n",
       "0  [True, True, True, True, True, True, True, Tru...  \n",
       "1  [True, False, True, True, True, True, False, T...  \n",
       "2  [True, True, True, True, True, True, True, Tru...  \n",
       "3  [True, True, True, False, True, False, False, ...  \n",
       "4  [True, True, True, True, False, True, True, Tr...  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview data: prompts, responses, decomposed responses, unit grades, and full-response black-box UQ scores\n",
    "# Shown for the bio dataset and the first LLM configured in llm_dict, so the cell keeps\n",
    "# working when a different model is enabled above.\n",
    "llm_name = next(iter(llm_dict))\n",
    "with open(f\"{BIO_PATH}/responses_{llm_name}.json\") as json_file:\n",
    "    bio_responses = json.load(json_file)\n",
    "pd.DataFrame(bio_responses).head(SAMPLE_SIZE)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0cbaa2df-9f11-4150-9934-327bfe96046d",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Compute unit-response agreement scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c22c2731-9458-4f0a-8e81-ee8a2d548bab",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Only the dataset paths are needed here: the scorer takes no DataFrame argument\n",
    "for path in datasets:\n",
    "    for granularity in [\"claim\", \"sentence\"]:\n",
    "        await score_unit_response(llm_dict=llm_dict, granularity=granularity, path=path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b4c13dee-b12e-4855-a109-1c64587dd7c0",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Compute matched-unit agreement scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b0c285f4-029d-4023-9541-bde539884d89",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Matched-unit scoring is run at sentence granularity only; the DataFrame is not needed\n",
    "for path in datasets:\n",
    "    await score_matched_unit(llm_dict=llm_dict, granularity=\"sentence\", path=path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "173c7bde-616b-4a5c-a24e-9b7199e6192e",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Compute Unit-QA scores (claims and sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "907d2220-2855-4827-ab8a-0b614c431174",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "03ebeca0c893468c9297e516065845e0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Output()"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
      ],
      "text/plain": []
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b7f6447d1a6a41969c5e0d8e7c114b73",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Output()"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
      ],
      "text/plain": []
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7b40f82fe0974ebca99e3fabf6b52ad9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Output()"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
      ],
      "text/plain": []
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "536a2144b39f426c8e2747fa33949574",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Output()"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
      ],
      "text/plain": []
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Run score_unit_qa at both granularities for every dataset\n",
    "for path, dataset in datasets.items():\n",
    "    for granularity in (\"claim\", \"sentence\"):\n",
    "        await score_unit_qa(\n",
    "            llm_dict=llm_dict,\n",
    "            dataset=dataset,\n",
    "            granularity=granularity,\n",
    "            path=path,\n",
    "        )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "82cd2c33-6200-414d-9b1c-a05eac4be617",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Compute Graph UQ Scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "48b51c94-8164-4fdc-a6d2-bbc2e42e5688",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only the dataset paths are needed here: merge_claims takes no DataFrame argument\n",
    "for path in datasets:\n",
    "    await merge_claims(llm_dict=llm_dict, path=path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "98ae958a-dab4-4321-a87c-21f9242d37ad",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Run score_graph_uq for every dataset\n",
    "for path, dataset in datasets.items():\n",
    "    await score_graph_uq(\n",
    "        llm_dict=llm_dict,\n",
    "        dataset=dataset,\n",
    "        path=path,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab257b71-91f3-4bdf-a9e7-c69bf61e41e6",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Compute Evaluation Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "7db6ed84-5190-4137-9d4a-03593d6e184f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "for path, dataset in datasets.items():\n",
    "    for granularity in [\"claim\", \"sentence\"]:\n",
    "        # Objectivity was only evaluated for claims, so the objective-only filter is enabled for claims alone\n",
    "        objective_only = granularity == \"claim\"\n",
    "        compute_metrics(llm_dict, dataset=dataset, granularity=granularity, path=path, objective_only=objective_only)"
   ]
  }
 ],
 "metadata": {
  "environment": {
   "kernel": "uqlm2",
   "name": "workbench-notebooks.m126",
   "type": "gcloud",
   "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/workbench-notebooks:m126"
  },
  "kernelspec": {
   "display_name": "uqlm2",
   "language": "python",
   "name": "uqlm2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
