{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Define Chain(s)\n",
    "\n",
    "from niagara import Chain, Model, ModelIntrinsicLogProb, NullTransformation, LogisticRegressionCalibrator\n",
    "import os\n",
    "\n",
    "# Placeholder credential: the variable has to exist, but (as the placeholder text says)\n",
    "# no real key is needed. setdefault keeps a key that is already exported in the\n",
    "# environment instead of overwriting it with the placeholder.\n",
    "os.environ.setdefault(\"FIREWORKS_API_KEY\", \"leave-this-line-but-there-is-no-need-to-add-an-API-key\")\n",
    "\n",
    "# One Model per Llama size, listed in the order they appear in the chain.\n",
    "# Every model gets the same thresholds, confidence signal, (initially null)\n",
    "# transformation and its own LogisticRegressionCalibrator instance.\n",
    "llama_chain = Chain(\n",
    "    models = [\n",
    "        Model(\n",
    "            model_name=name,\n",
    "            thresholds={\"reject\": -10000, \"accept\": 0.0},\n",
    "            conf_signal=ModelIntrinsicLogProb(),\n",
    "            conf_signal_transform=NullTransformation(),\n",
    "            conf_signal_calibrator=LogisticRegressionCalibrator()\n",
    "        )\n",
    "        for name in [\"llama3.2-1b\", \"llama3.2-3b\", \"llama3.1-8b\", \"llama3.1-70b\", \"llama3.1-405b\"]\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Select chain, benchmark, transformation, and grab data\n",
    "\n",
    "import pickle\n",
    "from niagara import OneSidedAsymptoticLog, TwoSidedAsymptoticLog\n",
    "\n",
    "# Benchmark, chain and confidence transformation used by the cells below.\n",
    "NAME = \"xsum\"\n",
    "CHAIN_NAME = \"llama_chain\"\n",
    "CHAIN = llama_chain\n",
    "TRANSFORM = TwoSidedAsymptoticLog()\n",
    "\n",
    "# Point every model of the selected chain at the chosen transformation.\n",
    "for model in CHAIN.models:\n",
    "    model.conf_signal_transform = TRANSFORM\n",
    "\n",
    "# Precomputed chain results for the train and test splits.\n",
    "# NOTE: pickle.load can execute arbitrary code; only load result files you trust.\n",
    "train_path = f'../benchmarks/data/{NAME}/chain_results/{NAME}_full_{CHAIN_NAME}_results_train.pkl'\n",
    "test_path = f'../benchmarks/data/{NAME}/chain_results/{NAME}_full_{CHAIN_NAME}_results_test.pkl'\n",
    "\n",
    "with open(train_path, 'rb') as f:\n",
    "    results_train = pickle.load(f)\n",
    "with open(test_path, 'rb') as f:\n",
    "    results_test = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Compute calibrated confidence values\n",
    "\n",
    "def process_scores(scores, threshold=20):\n",
    "    \"\"\"Return True when the values of the mapping `scores` sum to at least `threshold`.\n",
    "\n",
    "    Also used by the test-data cell to binarize xsum correctness entries.\n",
    "    \"\"\"\n",
    "    return sum(scores.values()) >= threshold\n",
    "\n",
    "# For xsum, each stored correctness entry is a mapping of scores that is reduced\n",
    "# to a boolean; for any other benchmark the stored values are used unchanged.\n",
    "if NAME == \"xsum\":\n",
    "    raw_corr_train = {\n",
    "        model_name: [process_scores(entry) for entry in entries]\n",
    "        for model_name, entries in results_train['model_correctness'].items()\n",
    "    }\n",
    "else:\n",
    "    raw_corr_train = results_train['model_correctness']\n",
    "\n",
    "raw_conf_train = results_train['raw_confidences']\n",
    "\n",
    "# Re-key the per-model dicts into lists ordered like CHAIN.model_names.\n",
    "corr_train = [raw_corr_train[model_name] for model_name in CHAIN.model_names]\n",
    "\n",
    "transformed_conf_train = [\n",
    "    list(TRANSFORM.transform_confidence_signal(raw_conf_train[model_name]))\n",
    "    for model_name in CHAIN.model_names\n",
    "]\n",
    "\n",
    "# One calibration record per model, in chain order.\n",
    "calibration_data = [\n",
    "    {\"correctness\": corr, \"transformed_confidence\": conf}\n",
    "    for corr, conf in zip(corr_train, transformed_conf_train)\n",
    "]\n",
    "\n",
    "CHAIN.calibrate(calibration_data)\n",
    "\n",
    "# Apply each model's calibrator to its own transformed training confidences.\n",
    "calibrated_conf_train = [\n",
    "    list(\n",
    "        CHAIN.models[model_idx].conf_signal_calibrator.calibrate_confidence_signal(\n",
    "            transformed_conf_train[model_idx]\n",
    "        )\n",
    "    )\n",
    "    for model_idx in range(len(CHAIN.model_names))\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Compute test data\n",
    "\n",
    "# xsum correctness entries are reduced with process_scores (defined in the\n",
    "# calibration cell); other benchmarks' stored values are used unchanged.\n",
    "raw_corr_test = (\n",
    "    {k: [process_scores(x) for x in v] for k, v in results_test['model_correctness'].items()}\n",
    "    if NAME == \"xsum\"\n",
    "    else results_test['model_correctness']\n",
    ")\n",
    "raw_conf_test = results_test['raw_confidences']\n",
    "\n",
    "# Per-model lists, ordered like CHAIN.model_names.\n",
    "corr_test = [raw_corr_test[model_name] for model_name in CHAIN.model_names]\n",
    "\n",
    "transformed_conf_test = [\n",
    "    list(TRANSFORM.transform_confidence_signal(raw_conf_test[model_name]))\n",
    "    for model_name in CHAIN.model_names\n",
    "]\n",
    "\n",
    "# Each model's calibrator is applied to that model's transformed test confidences.\n",
    "calibrated_conf_test = [\n",
    "    list(\n",
    "        CHAIN.models[idx].conf_signal_calibrator.calibrate_confidence_signal(conf)\n",
    "    )\n",
    "    for idx, conf in enumerate(transformed_conf_test)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import statsmodels.api as sm\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Design matrix: one calibrated-confidence column per model (chain order), with an\n",
    "# intercept column added by sm.add_constant. The code below treats column 0 as the\n",
    "# intercept, so model i lives in column i+1.\n",
    "covariate_data = pd.DataFrame({ name: calibrated_conf_test[i] for i, name in enumerate(CHAIN.model_names)})\n",
    "covariate_data = sm.add_constant(covariate_data, has_constant='add')\n",
    "\n",
    "# data[target][other]      -> p-value of `other`'s confidence when predicting `target`'s correctness\n",
    "# coef_data[target][other] -> the matching logistic-regression coefficient\n",
    "data = {}\n",
    "coef_data = {}\n",
    "\n",
    "# Targets start at index 2 so that each one has a direct predecessor and at least\n",
    "# one earlier model; the upper bound follows the chain length instead of assuming 5 models.\n",
    "for model_idx in range(2, len(CHAIN.model_names)):\n",
    "    model_name = CHAIN.model_names[model_idx]\n",
    "    prior_model_idx = model_idx - 1\n",
    "    prior_model_name = CHAIN.model_names[prior_model_idx]\n",
    "\n",
    "    # Correctness labels of the target model as explicit 0/1 floats, built once per target.\n",
    "    target_corr = np.asarray(corr_test[model_idx], dtype=float)\n",
    "\n",
    "    pvals_of_predecessor = []\n",
    "    coefs_of_predecessor = []\n",
    "    curr_data = {}\n",
    "    curr_coef_data = {}\n",
    "\n",
    "    # Pairwise logistic regressions: the direct predecessor together with each\n",
    "    # earlier model (the \"contender\") in turn.\n",
    "    for j in range(prior_model_idx):\n",
    "        # Columns: intercept, contender j, direct predecessor.\n",
    "        covariate_data_pair = covariate_data.iloc[:, [0, j+1, prior_model_idx+1]]\n",
    "        model = sm.Logit(target_corr, covariate_data_pair)\n",
    "        result = model.fit()\n",
    "        pvals = result.pvalues\n",
    "\n",
    "        # Position 1 is the contender, the last position is the predecessor.\n",
    "        contender_name = CHAIN.model_names[j]\n",
    "        curr_data[contender_name] = pvals.iloc[1]\n",
    "        curr_coef_data[contender_name] = result.params.iloc[1]\n",
    "\n",
    "        pvals_of_predecessor.append(pvals.iloc[-1])\n",
    "        coefs_of_predecessor.append(result.params.iloc[-1])\n",
    "\n",
    "    # The predecessor takes part in every pairwise fit, so its p-value and\n",
    "    # coefficient are reported as means over those fits.\n",
    "    curr_data[prior_model_name] = np.mean(pvals_of_predecessor)\n",
    "    curr_coef_data[prior_model_name] = np.mean(coefs_of_predecessor)\n",
    "\n",
    "    data[model_name] = curr_data\n",
    "    coef_data[model_name] = curr_coef_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
