{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Define Chain(s)\n",
    "\n",
    "from niagara import Chain, Model, ModelIntrinsicLogProb, NullTransformation, LogisticRegressionCalibrator\n",
    "from niagara import OpenAIClient, FireworksClient\n",
    "\n",
    "import os\n",
    "# Set a dummy FIREWORKS_API_KEY (per the placeholder text, no real key is needed here).\n",
    "# Presumably niagara expects this variable to exist -- TODO confirm.\n",
    "# Note: this assignment overwrites any FIREWORKS_API_KEY already present in the environment.\n",
    "os.environ[\"FIREWORKS_API_KEY\"] = \"leave-this-line-but-there-is-no-need-to-add-an-API-key\"\n",
    "\n",
    "# Chain of five Llama models; every model is configured with the same confidence-signal setup.\n",
    "# NOTE(review): the meaning of the reject/accept thresholds is defined by niagara -- confirm there.\n",
    "llama_chain = Chain(\n",
    "    models=[\n",
    "        Model(\n",
    "            model_name=llama_name,\n",
    "            thresholds=dict(reject=-10000, accept=0.0),\n",
    "            conf_signal=ModelIntrinsicLogProb(),\n",
    "            conf_signal_transform=NullTransformation(),\n",
    "            conf_signal_calibrator=LogisticRegressionCalibrator(),\n",
    "        )\n",
    "        for llama_name in (\n",
    "            \"llama3.2-1b\",\n",
    "            \"llama3.2-3b\",\n",
    "            \"llama3.1-8b\",\n",
    "            \"llama3.1-70b\",\n",
    "            \"llama3.1-405b\",\n",
    "        )\n",
    "    ]\n",
    ")\n",
    "\n",
    "# Mixed GPT-4o / Qwen 2.5 chain. Each entry pairs a model name with the client\n",
    "# handed to Model(); every client listed here is None.\n",
    "qwen_oai_chain = Chain(\n",
    "    models=[\n",
    "        Model(\n",
    "            model_name=member_name,\n",
    "            thresholds=dict(reject=-10000, accept=0.0),\n",
    "            conf_signal=ModelIntrinsicLogProb(),\n",
    "            conf_signal_transform=NullTransformation(),\n",
    "            conf_signal_calibrator=LogisticRegressionCalibrator(),\n",
    "            client=member_client,\n",
    "        )\n",
    "        for member_name, member_client in (\n",
    "            (\"gpt-4o-mini\", None),\n",
    "            (\"qwen2.5-32b-coder-instruct\", None),\n",
    "            (\"qwen2.5-72b-instruct\", None),\n",
    "            (\"gpt-4o\", None),\n",
    "        )\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Select chain, benchmark, transformation, and grab data\n",
    "\n",
    "import pickle\n",
    "from niagara import OneSidedAsymptoticLog, TwoSidedAsymptoticLog\n",
    "\n",
    "# Display names of the supported benchmarks, keyed by their short identifiers\n",
    "PRETTY_NAMES = dict(\n",
    "    xsum=\"XSum\",\n",
    "    mmlu=\"MMLU\",\n",
    "    medmcqa=\"MedMCQA\",\n",
    "    triviaqa=\"TriviaQA\",\n",
    "    truthfulqa=\"TruthfulQA\",\n",
    "    gsm8k=\"GSM8K\",\n",
    ")\n",
    "\n",
    "# Experiment selection: benchmark, chain (object plus the name used in the file paths)\n",
    "# and the confidence transformation\n",
    "NAME = \"xsum\"\n",
    "CHAIN_NAME = \"llama_chain\"\n",
    "CHAIN = llama_chain\n",
    "TRANSFORM = TwoSidedAsymptoticLog()\n",
    "\n",
    "# Point every model of the selected chain at the selected transformation\n",
    "for model in CHAIN.models:\n",
    "    model.conf_signal_transform = TRANSFORM\n",
    "\n",
    "# Load the pickled train / test results for the selected benchmark and chain.\n",
    "# NOTE: pickle.load can execute arbitrary code; only load result files from a trusted source.\n",
    "with open(f'../benchmarks/data/{NAME}/chain_results/{NAME}_full_{CHAIN_NAME}_results_train.pkl', 'rb') as f:\n",
    "    results_train = pickle.load(f)\n",
    "with open(f'../benchmarks/data/{NAME}/chain_results/{NAME}_full_{CHAIN_NAME}_results_test.pkl', 'rb') as f:\n",
    "    results_test = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Compute calibrated confidence values\n",
    "\n",
    "def process_scores(scores):\n",
    "    \"\"\"Return True when the values in `scores` sum to at least 20.\"\"\"\n",
    "    return sum(scores.values()) >= 20\n",
    "\n",
    "\n",
    "# Per-model correctness labels; for XSum each example holds a dict of scores\n",
    "# that process_scores reduces to a boolean\n",
    "raw_corr_train = results_train['model_correctness']\n",
    "if NAME == \"xsum\":\n",
    "    raw_corr_train = {\n",
    "        model_key: [process_scores(entry) for entry in entries]\n",
    "        for model_key, entries in raw_corr_train.items()\n",
    "    }\n",
    "\n",
    "raw_conf_train = results_train['raw_confidences']\n",
    "\n",
    "# Put both correctness and transformed confidences in chain order\n",
    "corr_train = [raw_corr_train[model_name] for model_name in CHAIN.model_names]\n",
    "\n",
    "transformed_conf_train = [\n",
    "    list(TRANSFORM.transform_confidence_signal(raw_conf_train[model_name]))\n",
    "    for model_name in CHAIN.model_names\n",
    "]\n",
    "\n",
    "# One calibration record per model, in chain order\n",
    "calibration_data = [\n",
    "    {\"correctness\": corr, \"transformed_confidence\": conf}\n",
    "    for corr, conf in zip(corr_train, transformed_conf_train)\n",
    "]\n",
    "\n",
    "CHAIN.calibrate(calibration_data)\n",
    "\n",
    "# Apply each model's calibrator to its own transformed training confidences\n",
    "calibrated_conf_train = [\n",
    "    list(CHAIN.models[model_idx].conf_signal_calibrator.calibrate_confidence_signal(conf))\n",
    "    for model_idx, conf in enumerate(transformed_conf_train)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Compute test data\n",
    "\n",
    "# Per-model correctness labels on the test split; for XSum each example holds a\n",
    "# dict of scores that process_scores reduces to a boolean\n",
    "raw_corr_test = results_test['model_correctness']\n",
    "if NAME == \"xsum\":\n",
    "    raw_corr_test = {\n",
    "        model_key: [process_scores(entry) for entry in entries]\n",
    "        for model_key, entries in raw_corr_test.items()\n",
    "    }\n",
    "\n",
    "raw_conf_test = results_test['raw_confidences']\n",
    "\n",
    "# Put both correctness and transformed confidences in chain order\n",
    "corr_test = [raw_corr_test[model_name] for model_name in CHAIN.model_names]\n",
    "\n",
    "transformed_conf_test = [\n",
    "    list(TRANSFORM.transform_confidence_signal(raw_conf_test[model_name]))\n",
    "    for model_name in CHAIN.model_names\n",
    "]\n",
    "\n",
    "# Apply each model's calibrator to its own transformed test confidences\n",
    "calibrated_conf_test = [\n",
    "    list(CHAIN.models[model_idx].conf_signal_calibrator.calibrate_confidence_signal(conf))\n",
    "    for model_idx, conf in enumerate(transformed_conf_test)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "from statsmodels.distributions.copula.api import GumbelCopula, IndependenceCopula\n",
    "from statsmodels.distributions.empirical_distribution import ECDF\n",
    "from scipy.stats import kendalltau\n",
    "\n",
    "def compute_cramer_von_mises(C, n_grid=1000):\n",
    "    \"\"\"Cramer-von Mises distance between empirical and Gumbel-fitted Kendall distributions.\n",
    "\n",
    "    C is indexed as a 2-D array with one observation per row; columns 0 and 1 are\n",
    "    used for the Kendall's tau estimate. n_grid is the number of points of the\n",
    "    integration grid on [0, 1]. Returns the trapezoidal-rule value of the integral.\n",
    "    \"\"\"\n",
    "    n_obs = C.shape[0]\n",
    "\n",
    "    # Pseudo-observations: column-wise ranks (double argsort) scaled by 1/(n+1)\n",
    "    ranks = np.argsort(np.argsort(C, axis=0), axis=0) + 1\n",
    "    U = ranks / (n_obs + 1)\n",
    "\n",
    "    # Empirical copula at each pseudo-observation: the share of rows that are\n",
    "    # component-wise <= that row\n",
    "    V_empirical = np.empty(n_obs)\n",
    "    for row in range(n_obs):\n",
    "        V_empirical[row] = np.mean(np.all(U <= U[[row], :], axis=1))\n",
    "    K_empirical = ECDF(V_empirical)\n",
    "\n",
    "    # Gumbel parameter from Kendall's tau, theta = 1 / (1 - tau), followed by the\n",
    "    # Gumbel copula value at each pseudo-observation\n",
    "    tau = kendalltau(C[:, 0], C[:, 1]).statistic\n",
    "    theta = 1 / (1 - tau)\n",
    "    neg_log_u = -np.log(U)\n",
    "    V_fitted = np.exp(-(np.sum(neg_log_u ** theta, axis=1) ** (1 / theta)))\n",
    "    K_fitted = ECDF(V_fitted)\n",
    "\n",
    "    # Trapezoidal rule: average of the right- and left-endpoint sums of\n",
    "    # sqrt(n) * (K_empirical - K_fitted)^2 against the increments of K_fitted\n",
    "    grid = np.linspace(0, 1, n_grid)\n",
    "    squared_gap = np.array([\n",
    "        np.sqrt(n_obs) * (K_empirical(v) - K_fitted(v)) ** 2 for v in grid\n",
    "    ])\n",
    "    dK = np.diff(K_fitted(grid))\n",
    "    right_sum = np.sum(squared_gap[1:] * dK)\n",
    "    left_sum = np.sum(squared_gap[:-1] * dK)\n",
    "\n",
    "    return (right_sum + left_sum) / 2\n",
    "\n",
    "\n",
    "def run_parametric_bootstrap(C, B=1000, n_grid=1000):\n",
    "    \"\"\"Parametric bootstrap of the Cramer-von Mises statistic under a fitted Gumbel copula.\n",
    "\n",
    "    Args:\n",
    "        C: 2-D array with one observation per row; columns 0 and 1 are used to\n",
    "            estimate Kendall's tau for the null model.\n",
    "        B: number of bootstrap replicates.\n",
    "        n_grid: number of integration-grid points on [0, 1]; the default of 1000\n",
    "            equals the grid size that was previously hard-coded.\n",
    "\n",
    "    Returns:\n",
    "        (cvm_stats, theta_list): the B bootstrapped statistics and the Gumbel theta\n",
    "        re-estimated on each replicate.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if building or sampling the Gumbel copula fails although the\n",
    "            fitted theta is greater than 1.\n",
    "    \"\"\"\n",
    "    cvm_stats = []\n",
    "    theta_list = []\n",
    "    integration_grid = np.linspace(0, 1, n_grid)\n",
    "\n",
    "    # The null model depends only on C, so fit it once instead of once per replicate\n",
    "    n = C.shape[0]\n",
    "    null_kendall_tau = kendalltau(C[:,0], C[:,1]).statistic\n",
    "    null_theta = 1/(1-null_kendall_tau)\n",
    "\n",
    "    for b in tqdm(range(B)):\n",
    "        try:\n",
    "            null_copula = GumbelCopula(null_theta, k_dim=2)\n",
    "            null_samples = null_copula.rvs(nobs=n)\n",
    "        except ValueError:\n",
    "            # The only expected failure is theta <= 1; re-raise anything else\n",
    "            # explicitly (an assert would be stripped under python -O)\n",
    "            if not null_theta <= 1:\n",
    "                raise\n",
    "            # sample from independence copula instead\n",
    "            null_copula = IndependenceCopula(k_dim=2)\n",
    "            null_samples = null_copula.rvs(nobs=n)\n",
    "\n",
    "        # Compute K dist by fitting Gumbel copula\n",
    "        # NOTE(review): the fitted copula is evaluated on the raw samples here, whereas\n",
    "        # compute_cramer_von_mises evaluates it on rank-based pseudo-observations --\n",
    "        # confirm that this difference is intended.\n",
    "        theta_b = 1/(1-kendalltau(null_samples[:,0], null_samples[:,1]).statistic)\n",
    "        theta_list.append(theta_b)\n",
    "        V_i_fitted_b = np.exp(\n",
    "            -np.sum((-np.log(null_samples)) ** (theta_b), axis=1)**(1/theta_b)\n",
    "        )\n",
    "        K_fitted_b = ECDF(V_i_fitted_b)\n",
    "\n",
    "        # Compute K dist with empirical copula\n",
    "        U_ij_b = (np.argsort(np.argsort(null_samples, axis=0), axis=0) + 1)/(n+1)\n",
    "        V_i_empirical_b = np.array([ np.mean(np.all(U_ij_b <= U_ij_b[[i],:], axis=1)) for i in range(n) ])\n",
    "        K_empirical_b = ECDF(V_i_empirical_b)\n",
    "\n",
    "        # Compute Cramer-von Mises statistic (trapezoidal rule against increments of K_fitted_b)\n",
    "        integrand_b = [ np.sqrt(n) * (K_empirical_b(v) - K_fitted_b(v))**2 for v in integration_grid ]\n",
    "        dK_fitted_b = np.diff(K_fitted_b(integration_grid))\n",
    "        sum_1_b = np.sum(integrand_b[1:] * dK_fitted_b)\n",
    "        sum_2_b = np.sum(integrand_b[:-1] * dK_fitted_b)\n",
    "        integral_b = (sum_1_b + sum_2_b) / 2\n",
    "        cvm_stats.append(integral_b)\n",
    "\n",
    "    # Get results\n",
    "    return cvm_stats, theta_list\n",
    "\n",
    "\n",
    "def compute_pval(test_statistic, bootstrapped_values):\n",
    "    \"\"\" Compute p value for Cramer von Mises with parametrized bootstrapping. \"\"\"\n",
    "    pval = np.mean(np.array(bootstrapped_values) >= test_statistic)\n",
    "    return pval\n",
    "\n",
    "\n",
    "def compute_pval_e2e(C, B=1000, return_test_statistic=False):\n",
    "    \"\"\"Observed Cramer-von Mises statistic plus its parametric-bootstrap p-value, in one call.\n",
    "\n",
    "    Returns the p-value alone, or the pair (p-value, observed statistic) when\n",
    "    return_test_statistic is true. B is the number of bootstrap replicates.\n",
    "    \"\"\"\n",
    "    observed = compute_cramer_von_mises(C)\n",
    "    null_stats, _thetas = run_parametric_bootstrap(C, B)\n",
    "    pval = compute_pval(observed, null_stats)\n",
    "    return (pval, observed) if return_test_statistic else pval\n",
    "\n",
    "\n",
    "def prepare_calibrated_conf(calibrated_conf_data, j1, j2, n_test=150, seed=123):\n",
    "    \"\"\" Get data for two models and reduce to n_test points. \"\"\"\n",
    "    C = np.array(calibrated_conf_test).transpose()[:,[j1, j2]]\n",
    "    if n_test is not None:\n",
    "        np.random.seed(seed)\n",
    "        C = C[np.random.choice(C.shape[0], n_test, replace=False), :]\n",
    "    return C"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Plot the correlations between the calibrated confidences of pairs of LLMs\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "from statsmodels.distributions.empirical_distribution import ECDF\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "from matplotlib import rcParams\n",
    "\n",
    "for j1 in range(len(CHAIN.model_names)):\n",
    "    for j2 in range(j1+1, len(CHAIN.model_names)):\n",
    "        name_j1 = CHAIN.model_names[j1]\n",
    "        name_j2 = CHAIN.model_names[j2]\n",
    "\n",
    "        c1 = np.array(calibrated_conf_test[j1])\n",
    "        c2 = np.array(calibrated_conf_test[j2])\n",
    "\n",
    "        # Keep only the points lying strictly inside the observed range of both models\n",
    "        interior = (\n",
    "            (c2 > c2.min()) & (c2 < c2.max())\n",
    "            & (c1 > c1.min()) & (c1 < c1.max())\n",
    "        )\n",
    "        x = c2[interior]  # horizontal axis: model j2\n",
    "        y = c1[interior]  # vertical axis: model j1\n",
    "\n",
    "        rank_corr = kendalltau(x, y).statistic\n",
    "\n",
    "        # Map each margin through its own ECDF (copula coordinates)\n",
    "        x, y = ECDF(x)(x), ECDF(y)(y)\n",
    "\n",
    "        # LaTeX text rendering with a serif font for consistent typography\n",
    "        rcParams.update({\n",
    "            \"text.usetex\": True,\n",
    "            \"font.family\": \"serif\",\n",
    "            \"font.serif\": [\"Computer Modern Roman\"],\n",
    "            \"font.size\": 10,\n",
    "        })\n",
    "\n",
    "        sns.set_style(\"white\")\n",
    "        sns.set_context(\"paper\", font_scale=1.0)\n",
    "\n",
    "        fig, ax = plt.subplots(figsize=(3.5, 2.5))\n",
    "\n",
    "        # Scatter of the two models in copula coordinates\n",
    "        ax.scatter(x, y, s=10, alpha=0.7, color=\"#2E86C1\", edgecolor=\"none\", linewidth=0.5)\n",
    "\n",
    "        # Axis labels carry the model names in bold\n",
    "        ax.set_xlabel(r\"\\textbf{\" + name_j2 + r\"}\", fontsize=10)\n",
    "        ax.set_ylabel(r\"\\textbf{\" + name_j1 + r\"}\", fontsize=10, rotation=90, labelpad=12)\n",
    "\n",
    "        # Hide the top and right spines\n",
    "        for side in (\"top\", \"right\"):\n",
    "            ax.spines[side].set_visible(False)\n",
    "\n",
    "        # Same limits on both axes, slightly wider than [0, 1]\n",
    "        ax.set_xlim(-0.05, 1.05)\n",
    "        ax.set_ylim(-0.05, 1.05)\n",
    "\n",
    "        # Kendall's tau annotation in the lower-right corner (axes coordinates)\n",
    "        tau_label = r\"$\\tau=\" + f\"{rank_corr:.2f}\" + \"$\"\n",
    "        ax.text(\n",
    "            0.95, 0.05, tau_label,\n",
    "            transform=ax.transAxes,\n",
    "            ha='right', va='bottom',\n",
    "            fontsize=8,\n",
    "            bbox={'facecolor': 'white', 'alpha': 0.7, 'edgecolor': 'none', 'pad': 2},\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Compute copula p values and analyze them\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Number of rows requested per model pair: the full length of the first model's test confidences\n",
    "N_TEST = len(calibrated_conf_test[0])\n",
    "\n",
    "# One record per unordered model pair (j1 < j2)\n",
    "pvals = []\n",
    "for j1 in range(len(CHAIN.model_names)):\n",
    "    for j2 in range(j1 + 1, len(CHAIN.model_names)):\n",
    "        C_j1_j2 = prepare_calibrated_conf(calibrated_conf_test, j1, j2, n_test=N_TEST)\n",
    "        cvm_pval, cvm_value = compute_pval_e2e(C_j1_j2, return_test_statistic=True)\n",
    "        pvals.append(\n",
    "            dict(\n",
    "                model_pair=[j1, j2],\n",
    "                chain=CHAIN_NAME,\n",
    "                benchmark=NAME,\n",
    "                cvm=cvm_value,\n",
    "                cvm_pval=cvm_pval,\n",
    "            )\n",
    "        )\n",
    "\n",
    "\n",
    "def transform_data(df):\n",
    "    return pd.Series({\n",
    "        # 'sqrt_cvm': df['sqrt_cvm'].mean(),\n",
    "        'cvm': df['cvm'].mean(),\n",
    "        'n_reject': df['reject'].sum(),\n",
    "        'rejection_rate': df['reject'].mean(),\n",
    "        'geommean_pval': np.exp(np.mean(np.log(df['cvm_pval'].clip(lower=1e-4)))),\n",
    "        'avg_pval': np.mean(df['cvm_pval'])\n",
    "    })\n",
    "    \n",
    "df = pd.DataFrame(pvals)\n",
    "# Flag the pairs whose p-value falls below 0.05\n",
    "df['reject'] = df['cvm_pval'].lt(0.05)\n",
    "\n",
    "# One summary row per (chain, benchmark) group\n",
    "print(\n",
    "    df\n",
    "    .groupby(by=['chain', 'benchmark'])[['cvm', 'reject', 'cvm_pval']]\n",
    "    .apply(transform_data)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.stats import kendalltau\n",
    "import numpy as np\n",
    "\n",
    "### Compute pairwise Kendall's tau\n",
    "\n",
    "# Kendall's tau for every unordered pair of models (j1 < j2), in the same pair\n",
    "# order as a nested loop over model indices; calibrated_conf_test holds one\n",
    "# entry per model of the chain\n",
    "pairwise_kendalls_tau = [\n",
    "    kendalltau(conf_j1, conf_j2).statistic\n",
    "    for j1, conf_j1 in enumerate(calibrated_conf_test)\n",
    "    for conf_j2 in calibrated_conf_test[j1 + 1:]\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
