{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Define Chain(s)\n",
    "\n",
    "from niagara import Chain, Model, ModelIntrinsicLogProb, NullTransformation, LogisticRegressionCalibrator\n",
    "from niagara import OpenAIClient, FireworksClient\n",
    "\n",
    "import os\n",
    "\n",
    "os.environ[\"FIREWORKS_API_KEY\"] = \"leave-this-line-but-there-is-no-need-to-add-an-API-key\"\n",
    "\n",
    "llama_chain = Chain(\n",
    "    models = [\n",
    "        Model(\n",
    "            model_name=name, \n",
    "            thresholds={\"reject\": -10000, \"accept\": 0.0},\n",
    "            conf_signal=ModelIntrinsicLogProb(),\n",
    "            conf_signal_transform=NullTransformation(),\n",
    "            conf_signal_calibrator=LogisticRegressionCalibrator()\n",
    "        )\n",
    "        for name in [\"llama3.2-1b\", \"llama3.2-3b\", \"llama3.1-8b\", \"llama3.1-70b\", \"llama3.1-405b\"]\n",
    "    ]\n",
    ")\n",
    "\n",
    "qwen_oai_chain = Chain(\n",
    "    models = [\n",
    "        Model(\n",
    "            model_name=name, \n",
    "            thresholds={\"reject\": -10000, \"accept\": 0.0},\n",
    "            conf_signal=ModelIntrinsicLogProb(),\n",
    "            conf_signal_transform=NullTransformation(),\n",
    "            conf_signal_calibrator=LogisticRegressionCalibrator(),\n",
    "            client=client\n",
    "        )\n",
    "        for name, client in [(\"gpt-4o-mini\", None), (\"qwen2.5-32b-coder-instruct\", None), (\"qwen2.5-72b-instruct\", None), (\"gpt-4o\", None)]\n",
    "    ]\n",
    ")\n",
    "\n",
    "### Select chain, benchmark, transformation, and grab data\n",
    "\n",
    "import numpy as np\n",
    "import pickle\n",
    "from niagara import OneSidedAsymptoticLog, TwoSidedAsymptoticLog\n",
    "from niagara.utils import compute_ece\n",
    "\n",
    "PRETTY_NAMES = {\n",
    "    \"xsum\": \"XSum\",\n",
    "    \"mmlu\": \"MMLU\",\n",
    "    \"medmcqa\": \"MedMCQA\",\n",
    "    \"triviaqa\": \"TriviaQA\",\n",
    "    \"truthfulqa\": \"TruthfulQA\",\n",
    "    \"gsm8k\": \"GSM8K\"\n",
    "}\n",
    "\n",
    "records = []\n",
    "\n",
    "for NAME, TRANSFORM in zip([\"xsum\", \"mmlu\", \"medmcqa\", \"triviaqa\", \"truthfulqa\", \"gsm8k\"], [\n",
    "    TwoSidedAsymptoticLog(), \n",
    "    OneSidedAsymptoticLog(), \n",
    "    OneSidedAsymptoticLog(), \n",
    "    TwoSidedAsymptoticLog(), \n",
    "    TwoSidedAsymptoticLog(), \n",
    "    TwoSidedAsymptoticLog()\n",
    "]):\n",
    "    for CHAIN_NAME, CHAIN in zip([\"qwen_oai_chain\", \"llama_chain\"], [qwen_oai_chain, llama_chain]):\n",
    "        # Update the transformation for the chain\n",
    "        for model in CHAIN.models:\n",
    "            model.conf_signal_transform = TRANSFORM\n",
    "\n",
    "        with open(f'../benchmarks/data/{NAME}/chain_results/{NAME}_full_{CHAIN_NAME}_results_train.pkl', 'rb') as f:\n",
    "            results_train = pickle.load(f)\n",
    "        with open(f'../benchmarks/data/{NAME}/chain_results/{NAME}_full_{CHAIN_NAME}_results_test.pkl', 'rb') as f:\n",
    "            results_test = pickle.load(f)\n",
    "\n",
    "        ### Compute calibrated confidence values\n",
    "\n",
    "        process_scores = lambda scores: sum(scores.values()) >= 20\n",
    "\n",
    "        if NAME==\"xsum\":\n",
    "            raw_corr_train = { k: [process_scores(x) for x in v] for k,v in results_train['model_correctness'].items() }\n",
    "        else:\n",
    "            raw_corr_train= results_train['model_correctness']\n",
    "\n",
    "        raw_conf_train = results_train['raw_confidences']\n",
    "\n",
    "        corr_train = [\n",
    "            raw_corr_train[model_name] for model_name in CHAIN.model_names\n",
    "        ]\n",
    "\n",
    "        transformed_conf_train = [ \n",
    "            list(TRANSFORM.transform_confidence_signal(raw_conf_train[model_name]))\n",
    "                for model_name in CHAIN.model_names\n",
    "        ]\n",
    "\n",
    "        calibration_data = [\n",
    "            {\"correctness\": corr, \"transformed_confidence\": conf} \n",
    "                for (corr, conf, model_name) \n",
    "                    in zip(corr_train, transformed_conf_train, CHAIN.model_names)\n",
    "        ]\n",
    "\n",
    "        CHAIN.calibrate(calibration_data)\n",
    "\n",
    "        calibrated_conf_train = [\n",
    "            list(\n",
    "                CHAIN.models[model_idx].conf_signal_calibrator.calibrate_confidence_signal(\n",
    "                    transformed_conf_train[model_idx]\n",
    "                )\n",
    "            )\n",
    "            for model_idx in range(len(CHAIN.model_names))\n",
    "        ]\n",
    "\n",
    "        ### Compute test data\n",
    "\n",
    "        if NAME==\"xsum\":\n",
    "            raw_corr_test = { k: [process_scores(x) for x in v] for k,v in results_test['model_correctness'].items() }\n",
    "        else:\n",
    "            raw_corr_test= results_test['model_correctness']\n",
    "\n",
    "        raw_conf_test = results_test['raw_confidences']\n",
    "\n",
    "        corr_test = [\n",
    "            raw_corr_test[model_name] for model_name in CHAIN.model_names\n",
    "        ]\n",
    "\n",
    "        transformed_conf_test = [ \n",
    "            list(TRANSFORM.transform_confidence_signal(raw_conf_test[model_name]))\n",
    "                for model_name in CHAIN.model_names\n",
    "        ]\n",
    "\n",
    "        calibrated_conf_test = [\n",
    "            list(\n",
    "                CHAIN.models[model_idx].conf_signal_calibrator.calibrate_confidence_signal(\n",
    "                    transformed_conf_test[model_idx]\n",
    "                )\n",
    "            )\n",
    "            for model_idx in range(len(CHAIN.model_names))\n",
    "        ]\n",
    "\n",
    "        for model_idx, model_name in enumerate(CHAIN.model_names):\n",
    "            records.append(\n",
    "                {\n",
    "                    \"model_name\": model_name,\n",
    "                    \"model_idx\": model_idx,\n",
    "                    \"chain\": CHAIN_NAME,\n",
    "                    \"benchmark\": NAME,\n",
    "                    \"test_acc\": np.mean(corr_test[model_idx]),\n",
    "                    \"test_ece\": compute_ece(calibrated_conf_test[model_idx], corr_test[model_idx], n_bins=10)['ece'],\n",
    "                    \"test_frac_certain\": np.mean(np.isinf(raw_conf_test[model_name]) | (np.array(raw_conf_test[model_name]) == 0.0)),\n",
    "                    \"test_frac_neginf\": np.mean(np.isinf(raw_conf_test[model_name])),\n",
    "                    \"test_frac_zero\": np.mean(np.array(raw_conf_test[model_name]) == 0.0),\n",
    "                }\n",
    "            )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
