{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "%cd ..\n",
    "!uv pip install -e ."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "from src.text_poa_graph import TextPOAGraph\n",
    "from src.new_text_alignment import TextSeqGraphAlignment\n",
    "from src.text_poa_graph_utils import path_sim_cosine, path_sim_llm\n",
    "from src.factscore_eval.factscorer import FactScorer\n",
    "from src.generation_methods import decode_consensus\n",
    "from src.global_edit_utils import clean_up_text\n",
    "from src.utils import detect_abstain\n",
    "\n",
    "import json\n",
    "import pickle\n",
    "import numpy as np\n",
    "import os\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: model / task being analysed and the directory of scored result files.\n",
    "model_name = \"qwen72b\"\n",
    "task = \"biographies\"\n",
    "short_task_code = \"bio\"\n",
    "results_dir = f\"results/final_baseline/factscore_halogen/Qwen72b/{short_task_code}\"\n",
    "baseline_methods = [\"mbr\", \"abs_nllm_cons\", \"shrt_res\", \"t0\"]\n",
    "\n",
    "# `data` maps an item's position in a results list to a dict holding its prompt\n",
    "# plus one \"<method>_<metric>\" entry per method and metric loaded so far.\n",
    "# NOTE(review): items are joined across files purely by position; this assumes\n",
    "# every results file lists the prompts in the same order -- TODO confirm.\n",
    "data = {}\n",
    "for method in baseline_methods:\n",
    "    with open(f'{results_dir}/res_{method}_halogen_{task}_gen-{model_name}_scored.json', 'r') as f:\n",
    "        results = json.load(f)\n",
    "\n",
    "    # Wrap the list itself (not the enumerate object) so tqdm knows the total\n",
    "    # and can draw a real progress bar instead of a bare counter.\n",
    "    for i, item in enumerate(tqdm(results, desc=method)):\n",
    "        # Each item is read as a single-key dict: {prompt: result}.\n",
    "        prompt = list(item.keys())[0]\n",
    "        res = item[prompt]\n",
    "        entry = data.setdefault(i, {\"prompt\": prompt})\n",
    "\n",
    "        if res[\"response\"] == \"Abstain\":\n",
    "            # Abstentions are recorded with fixed values instead of the stored scores.\n",
    "            hallucination_score, response_ratio, utility_score = 0.0, 0.0, 1.0\n",
    "        else:\n",
    "            scores = res[\"scores\"]\n",
    "            hallucination_score = scores[\"hallucination_score\"]\n",
    "            response_ratio = scores[\"response_ratio\"]\n",
    "            utility_score = scores[\"utility_score\"]\n",
    "\n",
    "        # A fully answered response with no hallucination is always given utility 1.0.\n",
    "        if response_ratio == 1.0 and hallucination_score == 0.0:\n",
    "            utility_score = 1.0\n",
    "\n",
    "        entry[f\"{method}_hallucination_score\"] = hallucination_score\n",
    "        entry[f\"{method}_response_ratio\"] = response_ratio\n",
    "        entry[f\"{method}_utility_score\"] = utility_score\n",
    "\n",
    "        if task == \"biographies\":\n",
    "            # Fact counts are only read when the task is biographies.\n",
    "            supported = res[\"num_supported_facts\"]\n",
    "            unsupported = res[\"num_unsupported_facts\"]\n",
    "            entry[f\"{method}_num_supported_facts\"] = supported\n",
    "            entry[f\"{method}_num_unsupported_facts\"] = unsupported\n",
    "            entry[f\"{method}_total_num_of_facts\"] = supported + unsupported"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of consecutive entries in the results list that belong to the same prompt.\n",
    "NUM_SAMPLES = 5\n",
    "\n",
    "with open(f'{results_dir}/res_5_samples_halogen_{task}_gen-{model_name}_scored.json', 'r') as f:\n",
    "    results = json.load(f)\n",
    "\n",
    "# Fail before touching `data`: a trailing partial group would otherwise raise an\n",
    "# IndexError half-way through, after earlier entries had already been updated.\n",
    "if len(results) % NUM_SAMPLES != 0:\n",
    "    raise ValueError(\n",
    "        f\"Expected a multiple of {NUM_SAMPLES} sampled results, got {len(results)}\"\n",
    "    )\n",
    "\n",
    "# Each prompt owns NUM_SAMPLES consecutive results; every metric is averaged over\n",
    "# that group and stored under the \"5_samples_\" prefix.\n",
    "for start in range(0, len(results), NUM_SAMPLES):\n",
    "    group = results[start:start + NUM_SAMPLES]\n",
    "    hallucination_scores = []\n",
    "    response_ratios = []\n",
    "    utility_scores = []\n",
    "    num_supported_facts = []\n",
    "    num_unsupported_facts = []\n",
    "    total_num_of_facts = []\n",
    "\n",
    "    for sample in group:\n",
    "        # Each sample is read as a single-key dict: {prompt: result}.\n",
    "        prompt = list(sample.keys())[0]\n",
    "        res = sample[prompt]\n",
    "\n",
    "        if res.get(\"response\") == \"Abstain\":\n",
    "            # Same fixed values the other result loaders in this notebook use for\n",
    "            # abstentions, so all methods are scored by one rule.\n",
    "            hallucination_score, response_ratio, utility_score = 0.0, 0.0, 1.0\n",
    "        else:\n",
    "            scores = res[\"scores\"]\n",
    "            hallucination_score = scores[\"hallucination_score\"]\n",
    "            response_ratio = scores[\"response_ratio\"]\n",
    "            utility_score = scores[\"utility_score\"]\n",
    "\n",
    "        # A fully answered response with no hallucination is always given utility 1.0.\n",
    "        if response_ratio == 1.0 and hallucination_score == 0.0:\n",
    "            utility_score = 1.0\n",
    "\n",
    "        hallucination_scores.append(hallucination_score)\n",
    "        response_ratios.append(response_ratio)\n",
    "        utility_scores.append(utility_score)\n",
    "\n",
    "        if task == \"biographies\":\n",
    "            supported = res[\"num_supported_facts\"]\n",
    "            unsupported = res[\"num_unsupported_facts\"]\n",
    "            num_supported_facts.append(supported)\n",
    "            num_unsupported_facts.append(unsupported)\n",
    "            total_num_of_facts.append(supported + unsupported)\n",
    "\n",
    "    # Create the entry if no earlier cell did, mirroring the other loaders.\n",
    "    first_prompt = list(group[0].keys())[0]\n",
    "    entry = data.setdefault(start // NUM_SAMPLES, {\"prompt\": first_prompt})\n",
    "    entry[\"5_samples_hallucination_score\"] = np.mean(hallucination_scores)\n",
    "    entry[\"5_samples_response_ratio\"] = np.mean(response_ratios)\n",
    "    entry[\"5_samples_utility_score\"] = np.mean(utility_scores)\n",
    "    if task == \"biographies\":\n",
    "        entry[\"5_samples_num_supported_facts\"] = np.mean(num_supported_facts)\n",
    "        entry[\"5_samples_num_unsupported_facts\"] = np.mean(num_unsupported_facts)\n",
    "        entry[\"5_samples_total_num_of_facts\"] = np.mean(total_num_of_facts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the scored consensus texts for each threshold and store their metrics in\n",
    "# `data` under the \"consensus_<threshold>_\" prefix.\n",
    "for threshold in [0.3, 0.5]:\n",
    "    prefix = f\"consensus_{threshold}\"\n",
    "    with open(f'{results_dir}/{short_task_code}_{model_name}_consensus_texts_{threshold}_scored.json', 'r') as f:\n",
    "        results = json.load(f)\n",
    "\n",
    "    # Wrap the list itself (not the enumerate object) so tqdm knows the total\n",
    "    # and can draw a real progress bar instead of a bare counter.\n",
    "    for i, item in enumerate(tqdm(results, desc=prefix)):\n",
    "        # Each item is read as a single-key dict: {prompt: result}.\n",
    "        prompt = list(item.keys())[0]\n",
    "        res = item[prompt]\n",
    "        entry = data.setdefault(i, {\"prompt\": prompt})\n",
    "\n",
    "        if res[\"response\"] == \"Abstain\":\n",
    "            # Abstentions are recorded with fixed values instead of the stored scores.\n",
    "            hallucination_score, response_ratio, utility_score = 0.0, 0.0, 1.0\n",
    "        else:\n",
    "            scores = res[\"scores\"]\n",
    "            hallucination_score = scores[\"hallucination_score\"]\n",
    "            response_ratio = scores[\"response_ratio\"]\n",
    "            utility_score = scores[\"utility_score\"]\n",
    "\n",
    "        # A fully answered response with no hallucination is always given utility 1.0.\n",
    "        if response_ratio == 1.0 and hallucination_score == 0.0:\n",
    "            utility_score = 1.0\n",
    "\n",
    "        entry[f\"{prefix}_hallucination_score\"] = hallucination_score\n",
    "        entry[f\"{prefix}_response_ratio\"] = response_ratio\n",
    "        entry[f\"{prefix}_utility_score\"] = utility_score\n",
    "\n",
    "        if task == \"biographies\":\n",
    "            # Fact counts are only read when the task is biographies.\n",
    "            supported = res[\"num_supported_facts\"]\n",
    "            unsupported = res[\"num_unsupported_facts\"]\n",
    "            entry[f\"{prefix}_num_supported_facts\"] = supported\n",
    "            entry[f\"{prefix}_num_unsupported_facts\"] = unsupported\n",
    "            entry[f\"{prefix}_total_num_of_facts\"] = supported + unsupported"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Average every per-item metric stored in `data`, producing one mean per method.\n",
    "mean_utility_scores = {}\n",
    "mean_hallucination_scores = {}\n",
    "mean_response_ratios = {}\n",
    "if task == \"biographies\":\n",
    "    mean_num_supported_facts = {}\n",
    "    mean_num_unsupported_facts = {}\n",
    "    mean_total_num_of_facts = {}\n",
    "\n",
    "baseline_methods = [\"mbr\", \"abs_nllm_cons\", \"shrt_res\", \"t0\", \"5_samples\"]\n",
    "consensus_methods = [f\"consensus_{threshold}\" for threshold in [0.3, 0.5]]\n",
    "\n",
    "# Baselines and consensus variants share the \"<method>_<metric>\" key layout,\n",
    "# so a single loop covers both groups (baselines first, then consensus).\n",
    "for method in baseline_methods + consensus_methods:\n",
    "    mean_utility_scores[method] = np.mean([data[i][f\"{method}_utility_score\"] for i in data])\n",
    "    mean_hallucination_scores[method] = np.mean([data[i][f\"{method}_hallucination_score\"] for i in data])\n",
    "    mean_response_ratios[method] = np.mean([data[i][f\"{method}_response_ratio\"] for i in data])\n",
    "\n",
    "    if task == \"biographies\":\n",
    "        mean_num_supported_facts[method] = np.mean([data[i][f\"{method}_num_supported_facts\"] for i in data])\n",
    "        mean_num_unsupported_facts[method] = np.mean([data[i][f\"{method}_num_unsupported_facts\"] for i in data])\n",
    "        mean_total_num_of_facts[method] = np.mean([data[i][f\"{method}_total_num_of_facts\"] for i in data])\n",
    "\n",
    "print(\"Mean utility scores:\")\n",
    "print(json.dumps(mean_utility_scores, indent=4))\n",
    "print(\"Mean hallucination scores:\")\n",
    "print(json.dumps(mean_hallucination_scores, indent=4))\n",
    "print(\"Mean response ratios:\")\n",
    "print(json.dumps(mean_response_ratios, indent=4))\n",
    "\n",
    "# The fact-count means were computed above but never shown; report them too.\n",
    "if task == \"biographies\":\n",
    "    print(\"Mean number of supported facts:\")\n",
    "    print(json.dumps(mean_num_supported_facts, indent=4))\n",
    "    print(\"Mean number of unsupported facts:\")\n",
    "    print(json.dumps(mean_num_unsupported_facts, indent=4))\n",
    "    print(\"Mean total number of facts:\")\n",
    "    print(json.dumps(mean_total_num_of_facts, indent=4))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
