{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluation for GC dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import math\n",
    "from math import comb\n",
    "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n",
    "\n",
    "\n",
    "# Your excel file path\n",
    "excel_file_path = \"mapped_gc.xlsx\"\n",
    "\n",
    "# Read excel, skip the first row, and select only the desired columns\n",
    "df = pd.read_excel(excel_file_path)\n",
    "\n",
    "# Show the first few rows to verify\n",
    "print(df.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define model columns\n",
    "models = ['agent_CUI', 'phetag_CUI', 'BERT_CUI', 'meta_CUI', 'ctakes_CUI', 'GPT5_CUI']\n",
    "\n",
    "# Map to more readable model names\n",
    "model_name_map = {\n",
    "    'agent_CUI': 'GenOMA',\n",
    "    'phetag_CUI': 'PhenoTagger',\n",
    "    'BERT_CUI': 'PhenoBERT',\n",
    "    'meta_CUI': 'MetaMap',\n",
    "    'ctakes_CUI': 'cTAKES',\n",
    "    'GPT5_CUI': 'GPT-5'\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = {\n",
    "    'Model': [],\n",
    "    'With_Term_Correct': [],\n",
    "    'With_Term_Wrong': [],     # Predicted but mismatched\n",
    "    'With_Term_NoPred': [],    # No prediction made\n",
    "    'With_Term_Total': [],\n",
    "    'No_Term_Correct': [],\n",
    "    'No_Term_Incorrect': [],   # Should be empty but predicted something\n",
    "    'No_Term_Total': []\n",
    "}\n",
    "\n",
    "for model in models:\n",
    "    model_name = model_name_map.get(model, model)\n",
    "\n",
    "    with_term = df[df['true_CUI'].notna() & (df['true_CUI'] != '')]\n",
    "    no_term   = df[df['true_CUI'].isna() | (df['true_CUI'] == '')]\n",
    "\n",
    "    # Cases where the gold has a term\n",
    "    with_term_total   = len(with_term)\n",
    "    with_term_correct = (with_term[model] == with_term['true_CUI']).sum()\n",
    "    with_term_nopred  = (with_term[model].isna() | (with_term[model] == '')).sum()\n",
    "    with_term_wrong   = with_term_total - with_term_correct - with_term_nopred\n",
    "\n",
    "    # Cases where the gold has no term\n",
    "    no_term_total     = len(no_term)\n",
    "    no_term_correct   = (no_term[model].isna() | (no_term[model] == '')).sum()\n",
    "    no_term_incorrect = no_term_total - no_term_correct  # Should be empty but output produced\n",
    "\n",
    "    # Save results\n",
    "    results['Model'].append(model_name)\n",
    "    results['With_Term_Correct'].append(with_term_correct)\n",
    "    results['With_Term_Wrong'].append(with_term_wrong)\n",
    "    results['With_Term_NoPred'].append(with_term_nopred)\n",
    "    results['With_Term_Total'].append(with_term_total)\n",
    "    results['No_Term_Correct'].append(no_term_correct)\n",
    "    results['No_Term_Incorrect'].append(no_term_incorrect)\n",
    "    results['No_Term_Total'].append(no_term_total)\n",
    "\n",
    "summary_df = pd.DataFrame(results)\n",
    "print(summary_df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import warnings\n",
    "\n",
    "# ---- 1)  ---------------------------------------------------------\n",
    "summary_df['With_Term_Incorrect'] = summary_df['With_Term_Total'] - summary_df['With_Term_Correct']\n",
    "summary_df['No_Term_Incorrect']   = summary_df['No_Term_Total']   - summary_df['No_Term_Correct']\n",
    "\n",
    "if 'With_Term_Wrong' not in summary_df.columns or 'With_Term_NoPred' not in summary_df.columns:\n",
    "    if 'With_Term_Predicted' in summary_df.columns:\n",
    "        # When there is a \"Number of Predictions\" column: Wrong = Prediction - Correct; NoPred = Total - Prediction\n",
    "        summary_df['With_Term_Wrong']  = (summary_df['With_Term_Predicted'] - summary_df['With_Term_Correct']).clip(lower=0)\n",
    "        summary_df['With_Term_NoPred'] = (summary_df['With_Term_Total']     - summary_df['With_Term_Predicted']).clip(lower=0)\n",
    "    elif 'With_Term_NoPred' in summary_df.columns:\n",
    "        summary_df['With_Term_Wrong']  = (summary_df['With_Term_Incorrect'] - summary_df['With_Term_NoPred']).clip(lower=0)\n",
    "    elif 'With_Term_Wrong' in summary_df.columns:\n",
    "        summary_df['With_Term_NoPred'] = (summary_df['With_Term_Incorrect'] - summary_df['With_Term_Wrong']).clip(lower=0)\n",
    "    else:\n",
    "        warnings.warn(\n",
    "            \"无法从 summary_df 推导 With_Term_Wrong / With_Term_NoPred，\"\n",
    "            \"暂把所有错误视为 Wrong；建议提供 With_Term_Predicted 或 With_Term_NoPred。\"\n",
    "        )\n",
    "        summary_df['With_Term_Wrong']  = summary_df['With_Term_Incorrect'].copy()\n",
    "        summary_df['With_Term_NoPred'] = 0\n",
    "\n",
    "# ---- 2)  == With_Term_Total ----------------------------\n",
    "summary_df['With_Term_NoPred'] = summary_df['With_Term_NoPred'].clip(lower=0)\n",
    "summary_df['With_Term_Wrong']  = (\n",
    "    summary_df['With_Term_Total']\n",
    "    - summary_df['With_Term_Correct']\n",
    "    - summary_df['With_Term_NoPred']\n",
    ").clip(lower=0)\n",
    "\n",
    "# (Optional) Assertion Checks\n",
    "assert (\n",
    "    (summary_df['With_Term_Correct'] + summary_df['With_Term_Wrong'] + summary_df['With_Term_NoPred'])\n",
    "    == summary_df['With_Term_Total']\n",
    ").all(), \"Decomposition does not sum to With_Term_Total.\"\n",
    "\n",
    "# ---- 3) painting ----------------------------------------------------------------\n",
    "bar_width = 0.45\n",
    "x = np.arange(len(summary_df['Model']))\n",
    "gap = 0\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "\n",
    "# color\n",
    "color_with_correct   = '#66c2a5'  \n",
    "color_with_wrong     = '#fc8d62' \n",
    "color_with_nopred    = '#fef0d9'  \n",
    "# Unused right column colors are reserved\n",
    "color_no_correct     = '#ccece6'\n",
    "color_no_incorrect   = '#fddbc7'\n",
    "\n",
    "# Left column (gold has term）\n",
    "ax.bar(x - gap, summary_df['With_Term_Correct'],  bar_width,\n",
    "       label='Correct (with term)', color=color_with_correct)\n",
    "ax.bar(x - gap, summary_df['With_Term_Wrong'],    bar_width,\n",
    "       bottom=summary_df['With_Term_Correct'],\n",
    "       label='Type I error', color=color_with_wrong)\n",
    "ax.bar(x - gap, summary_df['With_Term_NoPred'],   bar_width,\n",
    "       bottom=summary_df['With_Term_Correct'] + summary_df['With_Term_Wrong'],\n",
    "       label='Type II error', color=color_with_nopred)\n",
    "\n",
    "\n",
    "# Axes and Titles\n",
    "ax.set_xlabel('Model', fontsize=12)\n",
    "ax.set_ylabel('Number of Samples', fontsize=12)\n",
    "ax.set_title('Outcome Decomposition of Model Predictions (GC)', fontsize=16)\n",
    "ax.set_xticks(x)\n",
    "ax.set_xticklabels(summary_df['Model'], fontsize=12)\n",
    "ax.tick_params(axis='y', labelsize=12)\n",
    "\n",
    "# y \n",
    "max_height_with = summary_df['With_Term_Total'].max()\n",
    "max_height_no   = summary_df['No_Term_Total'].max()\n",
    "max_height = max(max_height_with, max_height_no)\n",
    "ax.set_ylim(0, max_height * 1.12)\n",
    "\n",
    "# ---- 4) Integer percentage labels (maximum remainder method, ensuring totals 100%) --------------------------\n",
    "label_min_frac = 0.03 \n",
    "\n",
    "def int_percentages(parts, total):\n",
    "\n",
    "    if total <= 0:\n",
    "        return [0, 0, 0]\n",
    "    parts = np.array(parts, dtype=float)\n",
    "    raw = parts / float(total) * 100.0          \n",
    "    flo = np.floor(raw)                          \n",
    "    remain = int(100 - flo.sum())                \n",
    "    if remain > 0:\n",
    "        remainders = raw - flo\n",
    "        order = np.argsort(-remainders)          \n",
    "        flo[order[:remain]] += 1\n",
    "    return flo.astype(int).tolist()\n",
    "\n",
    "for i in range(len(summary_df)):\n",
    "    with_total   = float(summary_df['With_Term_Total'][i])\n",
    "    with_correct = float(summary_df['With_Term_Correct'][i])\n",
    "    with_wrong   = float(summary_df['With_Term_Wrong'][i])\n",
    "    with_nopred  = float(summary_df['With_Term_NoPred'][i])\n",
    "\n",
    "    if with_total <= 0:\n",
    "        continue\n",
    "\n",
    "    # Calculate integer percentages (guaranteed to be 100%)\n",
    "    int_pcts = int_percentages([with_correct, with_wrong, with_nopred], with_total)\n",
    "    int_labels = [f\"{p}%\" for p in int_pcts]\n",
    "\n",
    "    x_left = x[i] - gap\n",
    "    bottoms = [0.0, with_correct, with_correct + with_wrong]\n",
    "    heights = [with_correct, with_wrong, with_nopred]\n",
    "\n",
    "    for (b, h, lab) in zip(bottoms, heights, int_labels):\n",
    "        if h <= 0:\n",
    "            continue\n",
    "        frac = h / with_total\n",
    "        if frac >= label_min_frac:\n",
    "            y_pos, va, y_adj = b + h / 2.0, 'center', 0\n",
    "        else:\n",
    "            y_pos, va, y_adj = b + h, 'bottom', 1\n",
    "        ax.text(x_left, y_pos + y_adj, lab, ha='center', va=va, fontsize=12)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.subplots_adjust(bottom=0.25)\n",
    "plt.savefig(\"5-1.pdf\", format=\"pdf\", bbox_inches=\"tight\")\n",
    "plt.show()\n"
   ]
  },
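  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check for the largest-remainder rounding above, on made-up counts\n",
    "# (not from the data): 1/3 each floors to 33/33/33 (sum 99), and the spare\n",
    "# 1% goes to a part with the largest fractional remainder, so the output\n",
    "# always sums to exactly 100, e.g. [34, 33, 33].\n",
    "demo_pcts = int_percentages([1, 1, 1], 3)\n",
    "print(demo_pcts)\n",
    "assert sum(demo_pcts) == 100"
   ]
  },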
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. Define model-to-column mapping\n",
    "model_columns = {\n",
    "    'GenOMA': 'agent_CUI',\n",
    "    'PhenoTagger': 'phetag_CUI',\n",
    "    'PhenoBERT': 'BERT_CUI',\n",
    "    'MetaMap': 'meta_CUI',\n",
    "    'cTAKES': 'ctakes_CUI',\n",
    "    'GPT-5': 'GPT5_CUI',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib.colors import LinearSegmentedColormap\n",
    "import pandas as pd\n",
    "\n",
    "# Canonical null representation\n",
    "df['true_CUI'] = df['true_CUI'].fillna('None').astype(str).str.strip()\n",
    "\n",
    "results = {}\n",
    "\n",
    "def safe_div(n, d):\n",
    "    return n / d if d > 0 else 0.0\n",
    "\n",
    "for model_name, pred_col in model_columns.items():\n",
    "    if pred_col not in df.columns:\n",
    "        print(f\"Column '{pred_col}' not found, skipping {model_name}\")\n",
    "        continue\n",
    "\n",
    "    df[pred_col] = df[pred_col].fillna('None').astype(str).str.strip()\n",
    "    y_true = df['true_CUI']\n",
    "    y_pred = df[pred_col]\n",
    "\n",
    "    # TP: correct code when gold exists\n",
    "    tp = ((y_pred == y_true) & (y_true != 'None')).sum()\n",
    "\n",
    "    # FP: (1) wrong code when gold exists OR (2) any code when gold is empty\n",
    "    fp_wrong_when_gold = ((y_true != 'None') & (y_pred != 'None') & (y_pred != y_true)).sum()\n",
    "    fp_pred_when_empty = ((y_true == 'None') & (y_pred != 'None')).sum()\n",
    "    fp = fp_wrong_when_gold + fp_pred_when_empty\n",
    "\n",
    "    # FN: predicted nothing when gold exists\n",
    "    fn = ((y_pred == 'None') & (y_true != 'None')).sum()\n",
    "\n",
    "    # TNs are excluded from headline metrics by design\n",
    "    # tn = ((y_true == 'None') & (y_pred == 'None')).sum()  # not used\n",
    "\n",
    "    precision = safe_div(tp, tp + fp)\n",
    "    recall    = safe_div(tp, tp + fn)\n",
    "    f1        = safe_div(2 * precision * recall, (precision + recall))\n",
    "\n",
    "    # Mapping accuracy (positives-only): excludes TNs\n",
    "    mapping_accuracy = safe_div(tp, tp + fp + fn)\n",
    "\n",
    "    results[model_name] = {\n",
    "        'tp': int(tp),\n",
    "        'fp': int(fp),\n",
    "        'fn': int(fn),\n",
    "        'precision': round(precision, 4),\n",
    "        'recall': round(recall, 4),\n",
    "        'f1': round(f1, 4),\n",
    "        'mapping_accuracy': round(mapping_accuracy, 4)\n",
    "    }\n",
    "\n",
    "results_df = pd.DataFrame(results).T\n",
    "print(results_df)"
   ]
  },
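  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Worked micro-example of the TP/FP/FN rules above, on hypothetical CUIs\n",
    "# (illustration only; the real evaluation uses df and model_columns):\n",
    "toy_true = pd.Series(['C0000001', 'C0000001', 'None', 'C0000001'])\n",
    "toy_pred = pd.Series(['C0000001', 'C0000002', 'C0000003', 'None'])\n",
    "toy_tp = ((toy_pred == toy_true) & (toy_true != 'None')).sum()              # row 0\n",
    "toy_fp = (((toy_true != 'None') & (toy_pred != 'None') & (toy_pred != toy_true)).sum()\n",
    "          + ((toy_true == 'None') & (toy_pred != 'None')).sum())            # rows 1, 2\n",
    "toy_fn = ((toy_pred == 'None') & (toy_true != 'None')).sum()                # row 3\n",
    "print(toy_tp, toy_fp, toy_fn)  # expected: 1 2 1"
   ]
  },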
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def wilson_ci(k: int, n: int, z: float = 1.96):\n",
    "    \"\"\"\n",
    "    Wilson confidence interval (95% when z=1.96). Returns (p_hat, lo, hi).\n",
    "    If n==0, returns (nan, nan, nan).\n",
    "    \"\"\"\n",
    "    if n == 0:\n",
    "        return float('nan'), float('nan'), float('nan')\n",
    "    p_hat = k / n\n",
    "    z2 = z * z\n",
    "    denom = 1.0 + z2 / n\n",
    "    center = (p_hat + z2 / (2 * n)) / denom\n",
    "    half = (z / denom) * math.sqrt(p_hat * (1 - p_hat) / n + z2 / (4 * n * n))\n",
    "    lo = max(0.0, center - half)\n",
    "    hi = min(1.0, center + half)\n",
    "    return p_hat, lo, hi\n",
    "\n",
    "rows = []\n",
    "for model, r in results_df.iterrows():\n",
    "    tp = int(r['tp']); fp = int(r['fp']); fn = int(r['fn'])\n",
    "\n",
    "    # Denominators (aligned with your definitions)\n",
    "    acc_n  = tp + fp + fn             # accuracy denominator (positives-only)\n",
    "    prec_n = tp + fp                  # precision denominator (predicted positives)\n",
    "    rec_n  = tp + fn                  # recall denominator (actual positives)\n",
    "\n",
    "    # Wilson CI\n",
    "    acc, acc_lo, acc_hi   = wilson_ci(tp, acc_n)\n",
    "    prec, prec_lo, prec_hi = wilson_ci(tp, prec_n)\n",
    "    rec, rec_lo, rec_hi    = wilson_ci(tp, rec_n)\n",
    "\n",
    "    rows.append({\n",
    "        \"model\": model,\n",
    "        \"tp\": tp, \"fp\": fp, \"fn\": fn,\n",
    "        # Point estimates (recomputed to avoid accumulated rounding error)\n",
    "        \"accuracy\": acc, \"accuracy_CI_low\": acc_lo, \"accuracy_CI_high\": acc_hi, \"acc_n\": acc_n,\n",
    "        \"precision\": prec, \"precision_CI_low\": prec_lo, \"precision_CI_high\": prec_hi, \"prec_n\": prec_n,\n",
    "        \"recall\": rec, \"recall_CI_low\": rec_lo, \"recall_CI_high\": rec_hi, \"rec_n\": rec_n,\n",
    "    })\n",
    "\n",
    "metrics_with_ci = pd.DataFrame(rows).set_index(\"model\")\n",
    "\n",
    "# Pretty print (keep 3 decimals; change to .round(2) if preferred)\n",
    "cols_to_round = [\n",
    "    \"accuracy\",\"accuracy_CI_low\",\"accuracy_CI_high\",\n",
    "    \"precision\",\"precision_CI_low\",\"precision_CI_high\",\n",
    "    \"recall\",\"recall_CI_low\",\"recall_CI_high\"\n",
    "]\n",
    "metrics_with_ci[cols_to_round] = metrics_with_ci[cols_to_round].round(3)\n",
    "\n",
    "print(metrics_with_ci)\n",
    "\n",
    "# If you want to merge with original results_df (won't overwrite point estimates)\n",
    "# merged = results_df.join(metrics_with_ci, how=\"left\", rsuffix=\"_wilson\")\n",
    "# print(merged)\n"
   ]
  },
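  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick sanity check of wilson_ci on textbook numbers (illustrative, not\n",
    "# from the data): k=8, n=10 gives p_hat=0.8 with a 95% Wilson interval of\n",
    "# roughly (0.490, 0.943); note the interval is asymmetric and clipped to [0, 1].\n",
    "p_hat_demo, lo_demo, hi_demo = wilson_ci(8, 10)\n",
    "print(f\"p_hat={p_hat_demo:.3f}, 95% CI=({lo_demo:.3f}, {hi_demo:.3f})\")"
   ]
  },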
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib.ticker import PercentFormatter  # New addition\n",
    "\n",
    "# Assume results_df is already defined\n",
    "results_df = pd.DataFrame(results).T.reset_index().rename(columns={'index': 'Model'})\n",
    "\n",
    "# Custom colors: green shades from dark to light (soft)\n",
    "green_shades = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']\n",
    "\n",
    "# Manual X coordinate positioning\n",
    "x = np.arange(len(results_df))  \n",
    "bar_width = 0.6  \n",
    "\n",
    "fig, ax = plt.subplots(figsize=(6, 5))\n",
    "\n",
    "# Plot bar chart\n",
    "bars = ax.bar(x, results_df['mapping_accuracy'], width=bar_width, color=green_shades)\n",
    "\n",
    "# Add accuracy values (percentage)\n",
    "for i, bar in enumerate(bars):\n",
    "    height = bar.get_height()\n",
    "    ax.text(bar.get_x() + bar.get_width()/2, height + 0.02,\n",
    "            f'{height:.2%}', ha='center', va='bottom', fontsize=12)\n",
    "\n",
    "# Set title and axes\n",
    "ax.set_xlabel('Model', fontsize=10)\n",
    "ax.set_title('Model Accuracy Comparison (GC)', fontsize=16)\n",
    "ax.set_ylabel('Accuracy', fontsize=10)\n",
    "ax.set_xticks(x)\n",
    "ax.set_xticklabels(results_df['Model'], fontsize=9)\n",
    "\n",
    "# Y-axis percentage display\n",
    "ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))\n",
    "\n",
    "ax.set_ylim(0, 1.05)\n",
    "ax.tick_params(axis='y', labelsize=10)\n",
    "\n",
    "# Remove extra borders\n",
    "ax.spines['top'].set_visible(False)\n",
    "ax.spines['right'].set_visible(False)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(\"3-1.pdf\", format=\"pdf\", pad_inches=0.0, bbox_inches=\"tight\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Models and performance data\n",
    "models = ['GenOMA', 'PhenoTagger', 'PhenoBERT', 'MetaMap', 'cTAKES', 'GPT-5']\n",
    "precision = [0.9144, 0.8235, 0.8105, 0.6012, 0.3583, 0.5281]\n",
    "recall    = [1.0000, 0.8917, 0.7848, 0.8033, 1.0000, 0.9126]\n",
    "f1_score  = [0.9553, 0.8563, 0.7974, 0.6877, 0.5276, 0.6690]\n",
    "\n",
    "# Metric names\n",
    "metrics = ['Precision', 'Recall', 'F1 Score']\n",
    "metric_values = [precision, recall, f1_score]\n",
    "\n",
    "# Axis setup\n",
    "x = np.arange(len(metrics))  # [0, 1, 2]\n",
    "bar_width = 0.14\n",
    "\n",
    "# Professional palette: 6 green shades from dark to light\n",
    "colors = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']\n",
    "\n",
    "# Figure style\n",
    "fig, ax = plt.subplots(figsize=(10, 5.5))\n",
    "plt.rcParams.update({'font.size': 12})\n",
    "\n",
    "# Plot bars for each model\n",
    "for i, model in enumerate(models):\n",
    "    values = [metric[i] for metric in metric_values]\n",
    "    bars = ax.bar(x + i * bar_width, values, bar_width, label=model, color=colors[i])\n",
    "\n",
    "    # Labels above bars\n",
    "    for bar in bars:\n",
    "        height = bar.get_height()\n",
    "        ax.annotate(f'{height:.2f}',\n",
    "                    xy=(bar.get_x() + bar.get_width() / 2, height + 0.01),\n",
    "                    ha='center', va='bottom',\n",
    "                    fontsize=10, color='black')\n",
    "\n",
    "# Axes and title\n",
    "ax.set_ylabel('Score', fontsize=14)\n",
    "ax.set_title('Model Evaluation Metric Comparison (GC)', fontsize=16, pad=15)\n",
    "ax.set_xticks(x + 1.5 * bar_width)\n",
    "ax.set_xticklabels(metrics, fontsize=13)\n",
    "ax.set_ylim(0, 1.05)\n",
    "ax.yaxis.grid(True, linestyle='--', alpha=0.6)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(\"4-1.pdf\", format=\"pdf\", bbox_inches=\"tight\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "GOLD_COL = \"true_CUI\"       # Gold standard\n",
    "GEN_COL  = \"agent_CUI\"      # GenOMA\n",
    "BASE_COL = \"phetag_CUI\"     # PhenoTagger (strongest baseline)\n",
    "def _norm_code(x):\n",
    "    \"\"\"Trim whitespace, normalize case; treat empty/None/nan as empty string.\"\"\"\n",
    "    if pd.isna(x):\n",
    "        return \"\"\n",
    "    s = str(x).strip()\n",
    "    if s.lower() in {\"\", \"none\", \"nan\", \"null\"}:\n",
    "        return \"\"\n",
    "    return s.upper()\n",
    "\n",
    "def compute_mcnemar_counts(\n",
    "    df: pd.DataFrame,\n",
    "    gold_col=GOLD_COL,\n",
    "    gen_col=GEN_COL,\n",
    "    base_col=BASE_COL,\n",
    "    include_empty_gold: bool = False\n",
    "):\n",
    "    g  = df[gold_col].map(_norm_code)\n",
    "    g1 = df[gen_col].map(_norm_code)\n",
    "    g2 = df[base_col].map(_norm_code)\n",
    "\n",
    "    if include_empty_gold:\n",
    "        # Compare on all rows; prediction equals gold counts as correct (including both empty)\n",
    "        mask = pd.Series(True, index=df.index)\n",
    "        gen_ok  = (g1 == g)\n",
    "        base_ok = (g2 == g)\n",
    "    else:\n",
    "        # Compare only where true_CUI is non-empty\n",
    "        mask = (g != \"\")\n",
    "        gen_ok  = (g1 == g) & mask\n",
    "        base_ok = (g2 == g) & mask\n",
    "\n",
    "    a = int(((gen_ok)  & (base_ok) & mask).sum())   # Both correct\n",
    "    b = int(((gen_ok)  & (~base_ok) & mask).sum())  # GenOMA correct, baseline wrong\n",
    "    c = int(((~gen_ok) & (base_ok) & mask).sum())   # GenOMA wrong, baseline correct\n",
    "    d = int(((~gen_ok) & (~base_ok) & mask).sum())  # Both wrong\n",
    "    N = int(mask.sum())\n",
    "\n",
    "    return {\"a\": a, \"b\": b, \"c\": c, \"d\": d, \"N\": N}\n",
    "\n",
    "def mcnemar_exact_p(b: int, c: int) -> float:\n",
    "    \"\"\"McNemar exact binomial (two-sided) p-value; valid for any b+c.\"\"\"\n",
    "    n = b + c\n",
    "    if n == 0:\n",
    "        return 1.0\n",
    "    k = min(b, c)\n",
    "    tail = sum(comb(n, i) for i in range(0, k + 1)) / (2 ** n)\n",
    "    return min(1.0, 2 * tail)\n",
    "\n",
    "# ==== Computation ====\n",
    "counts = compute_mcnemar_counts(df, include_empty_gold=False)  # Set to True to include rows with empty true_CUI\n",
    "a, b, c, d, N = counts[\"a\"], counts[\"b\"], counts[\"c\"], counts[\"d\"], counts[\"N\"]\n",
    "p_exact = mcnemar_exact_p(b, c)\n",
    "\n",
    "print(f\"N={N}\")\n",
    "print(f\"a={a}, b={b}, c={c}, d={d}\")\n",
    "print(f\"McNemar exact binomial (two-sided) p = {p_exact:.4g}\")"
   ]
  },
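  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand-checked example for mcnemar_exact_p (illustrative counts, not from\n",
    "# the data): with b=1, c=6 there are n=7 discordant pairs; the one-sided\n",
    "# tail is (C(7,0) + C(7,1)) / 2**7 = 8/128 = 0.0625, so the two-sided\n",
    "# exact p-value is 2 * 0.0625 = 0.125.\n",
    "print(mcnemar_exact_p(1, 6))  # expected 0.125"
   ]
  },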
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1) Assemble the \"long table\" (containing 6 models; automatically discarding NaNs)\n",
    "df_long = pd.concat([\n",
    "    pd.DataFrame({'Model': 'GenOMA',     'LLM_Score': df['agent_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'PhenoTagger', 'LLM_Score': df['phetag_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'PhenoBERT',   'LLM_Score': df['BERT_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'MetaMap',     'LLM_Score': df['meta_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'cTAKES',      'LLM_Score': df['ctake_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'GPT-5',        'LLM_Score': df['GPT5_LLM_Score']}),\n",
    "], ignore_index=True).dropna(subset=['LLM_Score'])\n",
    "\n",
    "# 2) Calculate the mean and standard deviation of each model (can be sorted in descending order by mean)\n",
    "stats = (df_long\n",
    "         .groupby('Model', as_index=False)['LLM_Score']\n",
    "         .agg(mean='mean', std='std'))\n",
    "\n",
    "# Sort by average from high to low (delete this row if not needed)\n",
    "stats = stats.sort_values('mean', ascending=False)\n",
    "\n",
    "models = stats['Model'].tolist()\n",
    "means  = stats['mean'].to_numpy()\n",
    "stds   = stats['std'].fillna(0).to_numpy()   # When there is only one sample, std may be NaN, set to 0\n",
    "\n",
    "# 3) ploting\n",
    "fig, ax = plt.subplots(figsize=(9.5, 5.6))\n",
    "\n",
    "# Optional: Soft green 6-level gradient (dark→light)\n",
    "green_shades = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']\n",
    "\n",
    "bars = ax.bar(models, means, yerr=stds, capsize=6, linewidth=0, color=green_shades[:len(models)])\n",
    "\n",
    "# 4) Smart placement of value labels (on top of column + error bars, plus a little adaptive offset)\n",
    "ymin, ymax = ax.get_ylim()\n",
    "offset = 0.015 * (ymax - ymin) \n",
    "for rect, mean, std in zip(bars, means, stds):\n",
    "    top = rect.get_height() + std + offset\n",
    "    ax.text(rect.get_x() + rect.get_width()/2, top,\n",
    "            f'{mean:.3f}',\n",
    "            ha='center', va='bottom', fontsize=10)\n",
    "\n",
    "# 5) Axes and Styles\n",
    "ax.set_ylabel('Mean LLM Similarity Score', fontsize=12)\n",
    "ax.set_title('Mean LLM Score with Standard Deviation (DS-PALS)', fontsize=14, pad=10)\n",
    "ax.set_ylim(0, max(means + stds) * 1.15)     \n",
    "ax.yaxis.grid(True, linestyle='--', alpha=0.6)\n",
    "ax.set_axisbelow(True)                        \n",
    "ax.tick_params(axis='x', labelrotation=0)     \n",
    "\n",
    "# 6) Legend (can be used if required)\n",
    "# ax.legend(handles=bars, labels=models, title=\"Model\", frameon=False, fontsize=10, title_fontsize=11)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
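  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Equivalent long-table construction via pd.melt (a sketch under the same\n",
    "# assumption that these *_LLM_Score columns exist in df); it should yield\n",
    "# the same rows as the pd.concat version above.\n",
    "score_cols = {\n",
    "    'agent_LLM_Score': 'GenOMA',\n",
    "    'phetag_LLM_Score': 'PhenoTagger',\n",
    "    'BERT_LLM_Score': 'PhenoBERT',\n",
    "    'meta_LLM_Score': 'MetaMap',\n",
    "    'ctake_LLM_Score': 'cTAKES',\n",
    "    'GPT5_LLM_Score': 'GPT-5',\n",
    "}\n",
    "df_long_alt = (df[list(score_cols)]\n",
    "               .rename(columns=score_cols)\n",
    "               .melt(var_name='Model', value_name='LLM_Score')\n",
    "               .dropna(subset=['LLM_Score']))\n",
    "print(df_long_alt.groupby('Model')['LLM_Score'].mean())"
   ]
  },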
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluation for DS-PALS dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Your excel file path\n",
    "excel_file_path = \"1result.xlsx\"\n",
    "\n",
    "# Read excel, skip the first row, and select only the desired columns\n",
    "df = pd.read_excel(excel_file_path)\n",
    "\n",
    "# Show the first few rows to verify\n",
    "print(df.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Defining model columns\n",
    "models = ['agent_CUI', 'phetag_CUI', 'BERT_CUI', 'meta_CUI', 'ctake_CUI', 'GPT5_CUI']\n",
    "\n",
    "# Mapping to more friendly names\n",
    "model_name_map = {\n",
    "    'agent_CUI': 'GenOMA',\n",
    "    'phetag_CUI': 'PhenoTagger',\n",
    "    'BERT_CUI': 'PhenoBERT',\n",
    "    'meta_CUI': 'MetaMap',\n",
    "    'ctake_CUI': 'cTAKES',\n",
    "    'GPT5_CUI': 'GPT-5'\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = {\n",
    "    'Model': [],\n",
    "    'With_Term_Correct': [],\n",
    "    'With_Term_Wrong': [],     \n",
    "    'With_Term_NoPred': [],    \n",
    "    'With_Term_Total': [],\n",
    "    'No_Term_Correct': [],\n",
    "    'No_Term_Incorrect': [],   \n",
    "    'No_Term_Total': []\n",
    "}\n",
    "\n",
    "for model in models:\n",
    "    model_name = model_name_map.get(model, model)\n",
    "\n",
    "    with_term = df[df['true_CUI'].notna() & (df['true_CUI'] != '')]\n",
    "    no_term   = df[df['true_CUI'].isna() | (df['true_CUI'] == '')]\n",
    "\n",
    "    # Wiyh term section\n",
    "    with_term_total   = len(with_term)\n",
    "    with_term_correct = (with_term[model] == with_term['true_CUI']).sum()\n",
    "    with_term_nopred  = (with_term[model].isna() | (with_term[model] == '')).sum()\n",
    "    with_term_wrong   = with_term_total - with_term_correct - with_term_nopred\n",
    "\n",
    "    # No term section\n",
    "    no_term_total     = len(no_term)\n",
    "    no_term_correct   = (no_term[model].isna() | (no_term[model] == '')).sum()\n",
    "    no_term_incorrect = no_term_total - no_term_correct  \n",
    "\n",
    "    # save resluts\n",
    "    results['Model'].append(model_name)\n",
    "    results['With_Term_Correct'].append(with_term_correct)\n",
    "    results['With_Term_Wrong'].append(with_term_wrong)\n",
    "    results['With_Term_NoPred'].append(with_term_nopred)\n",
    "    results['With_Term_Total'].append(with_term_total)\n",
    "    results['No_Term_Correct'].append(no_term_correct)\n",
    "    results['No_Term_Incorrect'].append(no_term_incorrect)\n",
    "    results['No_Term_Total'].append(no_term_total)\n",
    "\n",
    "summary_df = pd.DataFrame(results)\n",
    "print(summary_df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import warnings\n",
    "\n",
    "# ---- 1) Compute error counts (original) -------------------------------------\n",
    "summary_df['With_Term_Incorrect'] = summary_df['With_Term_Total'] - summary_df['With_Term_Correct']\n",
    "summary_df['No_Term_Incorrect']   = summary_df['No_Term_Total']   - summary_df['No_Term_Correct']\n",
    "\n",
    "# ---- 2) Split With_Term_Incorrect -> Wrong / NoPred -------------------------\n",
    "if 'With_Term_Wrong' not in summary_df.columns or 'With_Term_NoPred' not in summary_df.columns:\n",
    "    if 'With_Term_Predicted' in summary_df.columns:\n",
    "        # Wrong = Predicted - Correct; NoPred = Total - Predicted\n",
    "        summary_df['With_Term_Wrong']  = (summary_df['With_Term_Predicted'] - summary_df['With_Term_Correct']).clip(lower=0)\n",
    "        summary_df['With_Term_NoPred'] = (summary_df['With_Term_Total']     - summary_df['With_Term_Predicted']).clip(lower=0)\n",
    "    elif 'With_Term_NoPred' in summary_df.columns:\n",
    "        summary_df['With_Term_Wrong']  = (summary_df['With_Term_Incorrect'] - summary_df['With_Term_NoPred']).clip(lower=0)\n",
    "    elif 'With_Term_Wrong' in summary_df.columns:\n",
    "        summary_df['With_Term_NoPred'] = (summary_df['With_Term_Incorrect'] - summary_df['With_Term_Wrong']).clip(lower=0)\n",
    "    else:\n",
    "        warnings.warn(\n",
    "            \"Unable to infer With_Term_Wrong / With_Term_NoPred; treating all errors as Wrong. \"\n",
    "            \"Consider providing With_Term_Predicted or With_Term_NoPred in the summary.\"\n",
    "        )\n",
    "        summary_df['With_Term_Wrong']  = summary_df['With_Term_Incorrect'].copy()\n",
    "        summary_df['With_Term_NoPred'] = 0\n",
    "\n",
    "# ---- 3) Backfill correction: ensure Correct+Wrong+NoPred == With_Term_Total --\n",
    "summary_df['With_Term_NoPred'] = summary_df['With_Term_NoPred'].clip(lower=0)\n",
    "summary_df['With_Term_Wrong']  = (\n",
    "    summary_df['With_Term_Total']\n",
    "    - summary_df['With_Term_Correct']\n",
    "    - summary_df['With_Term_NoPred']\n",
    ").clip(lower=0)\n",
    "\n",
    "# (Optional) Assertion to ensure decomposition holds\n",
    "assert (\n",
    "    (summary_df['With_Term_Correct'] + summary_df['With_Term_Wrong'] + summary_df['With_Term_NoPred'])\n",
    "    == summary_df['With_Term_Total']\n",
    ").all(), \"Decomposition does not sum to With_Term_Total.\"\n",
    "\n",
    "# Define right-bar consistency as well (derive incorrect from total and correct)\n",
    "summary_df['No_Term_Incorrect'] = (\n",
    "    summary_df['No_Term_Total'] - summary_df['No_Term_Correct']\n",
    ").clip(lower=0)\n",
    "\n",
    "# ---- 4) Plot ----------------------------------------------------------------\n",
    "bar_width = 0.35\n",
    "x = np.arange(len(summary_df['Model']))\n",
    "gap = bar_width * 0.6\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "\n",
    "# color\n",
    "color_with_correct   = '#66c2a5'  \n",
    "color_with_wrong     = '#fc8d62'  \n",
    "color_with_nopred    = '#fef0d9'  \n",
    "color_no_correct     = '#ccece6'  \n",
    "color_no_incorrect   = '#fddbc7'  \n",
    "\n",
    "ax.bar(x - gap, summary_df['With_Term_Correct'],  bar_width,\n",
    "       label='Correct (with term)', color=color_with_correct)\n",
    "ax.bar(x - gap, summary_df['With_Term_Wrong'],    bar_width,\n",
    "       bottom=summary_df['With_Term_Correct'],\n",
    "       label='Type IA error', color=color_with_wrong)\n",
    "ax.bar(x - gap, summary_df['With_Term_NoPred'],   bar_width,\n",
    "       bottom=summary_df['With_Term_Correct'] + summary_df['With_Term_Wrong'],\n",
    "       label='Type II error', color=color_with_nopred)\n",
    "\n",
    "ax.bar(x + gap, summary_df['No_Term_Correct'],    bar_width,\n",
    "       label='Correct (no term)', color=color_no_correct)\n",
    "ax.bar(x + gap, summary_df['No_Term_Incorrect'],  bar_width,\n",
    "       bottom=summary_df['No_Term_Correct'],\n",
    "       label='Type IB error', color=color_no_incorrect)\n",
    "\n",
    "ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1),\n",
    "          fancybox=False, shadow=False, ncol=1, fontsize=10)\n",
    "\n",
    "ax.set_xlabel('Model', fontsize=10)\n",
    "ax.set_ylabel('Number of Samples', fontsize=10)\n",
    "ax.set_title('Outcome Decomposition of Model Predictions (DS-PALS)', fontsize=14)\n",
    "ax.set_xticks(x)\n",
    "ax.set_xticklabels(summary_df['Model'], fontsize=10)\n",
    "ax.tick_params(axis='y', labelsize=10)\n",
    "\n",
    "max_height_with = summary_df['With_Term_Total'].max()\n",
    "max_height_no   = summary_df['No_Term_Total'].max()\n",
    "ax.set_ylim(0, max(max_height_with, max_height_no) * 1.12)\n",
    "\n",
    "label_min_frac = 0.03  \n",
    "\n",
    "def int_percentages(parts, total):\n",
    "\n",
    "    if total <= 0:\n",
    "        return [0 for _ in parts]\n",
    "    arr = np.array(parts, dtype=float)\n",
    "    raw = arr / float(total) * 100.0\n",
    "    flo = np.floor(raw)\n",
    "    remain = int(100 - flo.sum())\n",
    "    if remain > 0:\n",
    "        order = np.argsort(-(raw - flo))  \n",
    "        flo[order[:remain]] += 1\n",
    "    return flo.astype(int).tolist()\n",
    "\n",
    "for i in range(len(summary_df)):\n",
    "    \n",
    "    with_total   = float(summary_df['With_Term_Total'][i])\n",
    "    with_correct = float(summary_df['With_Term_Correct'][i])\n",
    "    with_wrong   = float(summary_df['With_Term_Wrong'][i])\n",
    "    with_nopred  = float(summary_df['With_Term_NoPred'][i])\n",
    "\n",
    "    if with_total > 0:\n",
    "        x_left = x[i] - gap\n",
    "        bottoms = [0.0, with_correct, with_correct + with_wrong]\n",
    "        heights = [with_correct, with_wrong, with_nopred]\n",
    "\n",
    "        int_pcts = int_percentages(heights, with_total)\n",
    "        int_labels = [f\"{p}%\" for p in int_pcts]\n",
    "\n",
    "        for b, h, lab in zip(bottoms, heights, int_labels):\n",
    "            if h <= 0:\n",
    "                continue\n",
    "            frac = h / with_total\n",
    "            if frac >= label_min_frac:\n",
    "                y_pos, va, y_adj = b + h / 2.0, 'center', 0\n",
    "            else:\n",
    "                y_pos, va, y_adj = b + h, 'bottom', 1\n",
    "            ax.text(x_left, y_pos + y_adj, lab, ha='center', va=va, fontsize=10)\n",
    "\n",
    "    no_total     = float(summary_df['No_Term_Total'][i])\n",
    "    no_correct   = float(summary_df['No_Term_Correct'][i])\n",
    "    no_incorrect = float(summary_df['No_Term_Incorrect'][i])\n",
    "\n",
    "    if no_total > 0:\n",
    "        x_right = x[i] + gap\n",
    "        bottoms = [0.0, no_correct]\n",
    "        heights = [no_correct, no_incorrect]\n",
    "\n",
    "        int_pcts = int_percentages(heights, no_total)\n",
    "        int_labels = [f\"{p}%\" for p in int_pcts]\n",
    "\n",
    "        for b, h, lab in zip(bottoms, heights, int_labels):\n",
    "            if h <= 0:\n",
    "                continue\n",
    "            frac = h / no_total\n",
    "            if frac >= label_min_frac:\n",
    "                y_pos, va, y_adj = b + h / 2.0, 'center', 0\n",
    "            else:\n",
    "                y_pos, va, y_adj = b + h, 'bottom', 1\n",
    "            ax.text(x_right, y_pos + y_adj, lab, ha='center', va=va, fontsize=10)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.subplots_adjust(bottom=0.25)\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define model-to-column mapping\n",
    "model_columns = {\n",
    "    'GenOMA': 'agent_CUI',\n",
    "    'PhenoTagger': 'phetag_CUI',\n",
    "    'PhenoBERT': 'BERT_CUI',\n",
    "    'MetaMap': 'meta_CUI',\n",
    "    'cTAKES': 'ctake_CUI',\n",
    "    'GPT-5': 'GPT5_CUI',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib.colors import LinearSegmentedColormap\n",
    "import pandas as pd\n",
    "\n",
    "# Normalize empty values\n",
    "df['true_CUI'] = df['true_CUI'].fillna('None').astype(str).str.strip()\n",
    "\n",
    "results = {}\n",
    "\n",
    "def safe_div(n, d):\n",
    "    return n / d if d > 0 else 0.0\n",
    "\n",
    "for model_name, pred_col in model_columns.items():\n",
    "    if pred_col not in df.columns:\n",
    "        print(f\"Column '{pred_col}' not found, skipping {model_name}\")\n",
    "        continue\n",
    "\n",
    "    df[pred_col] = df[pred_col].fillna('None').astype(str).str.strip()\n",
    "    y_true = df['true_CUI']\n",
    "    y_pred = df[pred_col]\n",
    "\n",
    "    # TP: correct code when gold exists\n",
    "    tp = ((y_pred == y_true) & (y_true != 'None')).sum()\n",
    "\n",
    "    # FP: (1) wrong code when gold exists OR (2) any code when gold is empty\n",
    "    fp_wrong_when_gold = ((y_true != 'None') & (y_pred != 'None') & (y_pred != y_true)).sum()\n",
    "    fp_pred_when_empty = ((y_true == 'None') & (y_pred != 'None')).sum()\n",
    "    fp = fp_wrong_when_gold + fp_pred_when_empty\n",
    "\n",
    "    # FN: predicted nothing when gold exists\n",
    "    fn = ((y_pred == 'None') & (y_true != 'None')).sum()\n",
    "\n",
    "    # TNs are excluded from headline metrics by design\n",
    "    # tn = ((y_true == 'None') & (y_pred == 'None')).sum()  # not used\n",
    "\n",
    "    precision = safe_div(tp, tp + fp)\n",
    "    recall    = safe_div(tp, tp + fn)\n",
    "    f1        = safe_div(2 * precision * recall, (precision + recall))\n",
    "\n",
    "    # Mapping accuracy (positives-only): excludes TNs\n",
    "    mapping_accuracy = safe_div(tp, tp + fp + fn)\n",
    "\n",
    "    results[model_name] = {\n",
    "        'tp': int(tp),\n",
    "        'fp': int(fp),\n",
    "        'fn': int(fn),\n",
    "        'precision': round(precision, 4),\n",
    "        'recall': round(recall, 4),\n",
    "        'f1': round(f1, 4),\n",
    "        'mapping_accuracy': round(mapping_accuracy, 4)\n",
    "    }\n",
    "\n",
    "results_df = pd.DataFrame(results).T\n",
    "print(results_df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "def wilson_ci(k: int, n: int, z: float = 1.96):\n",
    "    \"\"\"\n",
    "    Wilson confidence interval (95% when z=1.96). Returns (p_hat, lo, hi).\n",
    "    If n==0, returns (nan, nan, nan).\n",
    "    \"\"\"\n",
    "    if n == 0:\n",
    "        return float('nan'), float('nan'), float('nan')\n",
    "    p_hat = k / n\n",
    "    z2 = z * z\n",
    "    denom = 1.0 + z2 / n\n",
    "    center = (p_hat + z2 / (2 * n)) / denom\n",
    "    half = (z / denom) * math.sqrt(p_hat * (1 - p_hat) / n + z2 / (4 * n * n))\n",
    "    lo = max(0.0, center - half)\n",
    "    hi = min(1.0, center + half)\n",
    "    return p_hat, lo, hi\n",
    "\n",
    "rows = []\n",
    "for model, r in results_df.iterrows():\n",
    "    tp = int(r['tp']); fp = int(r['fp']); fn = int(r['fn'])\n",
    "\n",
    "    # Denominators (aligned with your definitions)\n",
    "    acc_n  = tp + fp + fn             # accuracy denominator (positives-only)\n",
    "    prec_n = tp + fp                  # precision denominator (predicted positives)\n",
    "    rec_n  = tp + fn                  # recall denominator (actual positives)\n",
    "\n",
    "    # Wilson CI\n",
    "    acc, acc_lo, acc_hi   = wilson_ci(tp, acc_n)\n",
    "    prec, prec_lo, prec_hi = wilson_ci(tp, prec_n)\n",
    "    rec, rec_lo, rec_hi    = wilson_ci(tp, rec_n)\n",
    "\n",
    "    rows.append({\n",
    "        \"model\": model,\n",
    "        \"tp\": tp, \"fp\": fp, \"fn\": fn,\n",
    "        # Point estimates (recomputed to avoid accumulated rounding error)\n",
    "        \"accuracy\": acc, \"accuracy_CI_low\": acc_lo, \"accuracy_CI_high\": acc_hi, \"acc_n\": acc_n,\n",
    "        \"precision\": prec, \"precision_CI_low\": prec_lo, \"precision_CI_high\": prec_hi, \"prec_n\": prec_n,\n",
    "        \"recall\": rec, \"recall_CI_low\": rec_lo, \"recall_CI_high\": rec_hi, \"rec_n\": rec_n,\n",
    "    })\n",
    "\n",
    "metrics_with_ci = pd.DataFrame(rows).set_index(\"model\")\n",
    "\n",
    "# Pretty print (keep 3 decimals; change to .round(2) if preferred)\n",
    "cols_to_round = [\n",
    "    \"accuracy\",\"accuracy_CI_low\",\"accuracy_CI_high\",\n",
    "    \"precision\",\"precision_CI_low\",\"precision_CI_high\",\n",
    "    \"recall\",\"recall_CI_low\",\"recall_CI_high\"\n",
    "]\n",
    "metrics_with_ci[cols_to_round] = metrics_with_ci[cols_to_round].round(3)\n",
    "\n",
    "print(metrics_with_ci)\n",
    "\n",
    "# If you want to merge with original results_df (won't overwrite point estimates)\n",
    "# merged = results_df.join(metrics_with_ci, how=\"left\", rsuffix=\"_wilson\")\n",
    "# print(merged)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assuming results_df has already been defined\n",
    "results_df = pd.DataFrame(results).T.reset_index().rename(columns={'index': 'Model'})\n",
    "\n",
    "# Custom color: Green from dark to light (soft)\n",
    "green_shades = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']\n",
    "\n",
    "# Manually set the X coordinate position\n",
    "x = np.arange(len(results_df))  \n",
    "bar_width = 0.6  \n",
    "\n",
    "fig, ax = plt.subplots(figsize=(6, 5))\n",
    "\n",
    "# Draw a bar chart\n",
    "bars = ax.bar(x, results_df['mapping_accuracy'], width=bar_width, color=green_shades)\n",
    "\n",
    "# Added accuracy value (percentage)\n",
    "for i, bar in enumerate(bars):\n",
    "    height = bar.get_height()\n",
    "    ax.text(bar.get_x() + bar.get_width()/2, height + 0.02,\n",
    "            f'{height:.2%}', ha='center', va='bottom', fontsize=12)\n",
    "\n",
    "# Set the title and axis\n",
    "ax.set_xlabel('Model', fontsize=10)\n",
    "ax.set_title('Model Accuracy Comparison (DS-PALS)', fontsize=16)\n",
    "ax.set_ylabel('Accuracy', fontsize=10)\n",
    "ax.set_xticks(x)\n",
    "ax.set_xticklabels(results_df['Model'], fontsize=9)\n",
    "\n",
    "# y \n",
    "ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))\n",
    "\n",
    "ax.set_ylim(0, 1.05)\n",
    "ax.tick_params(axis='y', labelsize=10)\n",
    "\n",
    "# Remove extra borders\n",
    "ax.spines['top'].set_visible(False)\n",
    "ax.spines['right'].set_visible(False)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Models and performance data\n",
    "models = ['GenOMA', 'PhenoTagger', 'PhenoBERT', 'MetaMap', 'cTAKES', 'GPT-5']\n",
    "precision = [0.9310, 0.8585, 0.8659, 0.3333, 0.6949, 0.5083]\n",
    "recall    = [0.9818, 0.8750, 0.6762, 0.8367, 0.8817, 0.8971]\n",
    "f1_score  = [0.9558, 0.8667, 0.7594, 0.4767, 0.7773, 0.6489]\n",
    "\n",
    "# Metric names\n",
    "metrics = ['Precision', 'Recall', 'F1 Score']\n",
    "metric_values = [precision, recall, f1_score]\n",
    "\n",
    "# Axis setup\n",
    "x = np.arange(len(metrics))  # [0, 1, 2]\n",
    "bar_width = 0.14\n",
    "\n",
    "# Professional palette: 6 green shades from dark to light\n",
    "colors = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']\n",
    "\n",
    "# Figure style\n",
    "fig, ax = plt.subplots(figsize=(10, 5.5))\n",
    "plt.rcParams.update({'font.size': 12})\n",
    "\n",
    "# Plot bars for each model\n",
    "for i, model in enumerate(models):\n",
    "    values = [metric[i] for metric in metric_values]\n",
    "    bars = ax.bar(x + i * bar_width, values, bar_width, label=model, color=colors[i])\n",
    "\n",
    "    # Labels above bars\n",
    "    for bar in bars:\n",
    "        height = bar.get_height()\n",
    "        ax.annotate(f'{height:.2f}',\n",
    "                    xy=(bar.get_x() + bar.get_width() / 2, height + 0.01),\n",
    "                    ha='center', va='bottom',\n",
    "                    fontsize=10, color='black')\n",
    "\n",
    "# Axes and title\n",
    "ax.set_ylabel('Score', fontsize=14)\n",
    "ax.set_title('Model Evaluation Metric Comparison (DS-PALS)', fontsize=16, pad=15)\n",
    "ax.set_xticks(x + 1.5 * bar_width)\n",
    "ax.set_xticklabels(metrics, fontsize=13)\n",
    "ax.set_ylim(0, 1.05)\n",
    "ax.yaxis.grid(True, linestyle='--', alpha=0.6)\n",
    "\n",
    "# Legend\n",
    "ax.legend(title=\"Model\", bbox_to_anchor=(1.02, 1), loc='upper left',\n",
    "          borderaxespad=0, fontsize=11, title_fontsize=12, frameon=False)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1) Assemble a long-form table (6 models; drop NaNs automatically)\n",
    "df_long = pd.concat([\n",
    "    pd.DataFrame({'Model': 'GenOMA',     'LLM_Score': df['agent_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'PhenoTagger', 'LLM_Score': df['phetag_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'PhenoBERT',   'LLM_Score': df['BERT_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'MetaMap',     'LLM_Score': df['meta_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'cTAKES',      'LLM_Score': df['ctake_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'GPT-5',        'LLM_Score': df['GPT5_LLM_Score']}),\n",
    "], ignore_index=True).dropna(subset=['LLM_Score'])\n",
    "\n",
    "# 2) Compute mean and std per model (optionally sort by mean desc)\n",
    "stats = (df_long\n",
    "         .groupby('Model', as_index=False)['LLM_Score']\n",
    "         .agg(mean='mean', std='std'))\n",
    "\n",
    "# Sort by mean descending (optional)\n",
    "stats = stats.sort_values('mean', ascending=False)\n",
    "\n",
    "models = stats['Model'].tolist()\n",
    "means  = stats['mean'].to_numpy()\n",
    "stds   = stats['std'].fillna(0).to_numpy()   \n",
    "\n",
    "# 3) painting\n",
    "fig, ax = plt.subplots(figsize=(9.5, 5.6))\n",
    "\n",
    "# Optional: Soft Green 6-step gradient (dark→light)\n",
    "green_shades = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']\n",
    "\n",
    "bars = ax.bar(models, means, yerr=stds, capsize=6, linewidth=0, color=green_shades[:len(models)])\n",
    "\n",
    "# 4)Smart placement of value labels (on top of column + error bars, plus a little adaptive offset)\n",
    "ymin, ymax = ax.get_ylim()\n",
    "offset = 0.015 * (ymax - ymin)  \n",
    "for rect, mean, std in zip(bars, means, stds):\n",
    "    top = rect.get_height() + std + offset\n",
    "    ax.text(rect.get_x() + rect.get_width()/2, top,\n",
    "            f'{mean:.3f}',\n",
    "            ha='center', va='bottom', fontsize=10)\n",
    "\n",
    "# 5) Axes and Styles\n",
    "ax.set_ylabel('Mean LLM Similarity Score', fontsize=12)\n",
    "ax.set_title('Mean LLM Score with Standard Deviation (DS-PALS)', fontsize=14, pad=10)\n",
    "ax.set_ylim(0, max(means + stds) * 1.15)     \n",
    "ax.yaxis.grid(True, linestyle='--', alpha=0.6)\n",
    "ax.set_axisbelow(True)                       \n",
    "ax.tick_params(axis='x', labelrotation=0)    \n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "GOLD_COL = \"true_CUI\"       # Gold standard\n",
    "GEN_COL  = \"agent_CUI\"      # GenOMA\n",
    "BASE_COL = \"phetag_CUI\"     # PhenoTagger (strongest baseline)\n",
    "\n",
    "def _norm_code(x):\n",
    "    \"\"\"Trim whitespace, normalize case; treat empty/None/nan as empty string.\"\"\"\n",
    "    if pd.isna(x):\n",
    "        return \"\"\n",
    "    s = str(x).strip()\n",
    "    if s.lower() in {\"\", \"none\", \"nan\", \"null\"}:\n",
    "        return \"\"\n",
    "    return s.upper()\n",
    "\n",
    "def compute_mcnemar_counts(\n",
    "    df: pd.DataFrame,\n",
    "    gold_col=GOLD_COL,\n",
    "    gen_col=GEN_COL,\n",
    "    base_col=BASE_COL,\n",
    "    include_empty_gold: bool = False\n",
    "):\n",
    "    g  = df[gold_col].map(_norm_code)\n",
    "    g1 = df[gen_col].map(_norm_code)\n",
    "    g2 = df[base_col].map(_norm_code)\n",
    "\n",
    "    if include_empty_gold:\n",
    "        # Compare on all rows; prediction equals gold counts as correct (including both empty)\n",
    "        mask = pd.Series(True, index=df.index)\n",
    "        gen_ok  = (g1 == g)\n",
    "        base_ok = (g2 == g)\n",
    "    else:\n",
    "        # Compare only where true_CUI is non-empty\n",
    "        mask = (g != \"\")\n",
    "        gen_ok  = (g1 == g) & mask\n",
    "        base_ok = (g2 == g) & mask\n",
    "\n",
    "    a = int(((gen_ok)  & (base_ok) & mask).sum())   # Both correct\n",
    "    b = int(((gen_ok)  & (~base_ok) & mask).sum())  # GenOMA correct, baseline wrong\n",
    "    c = int(((~gen_ok) & (base_ok) & mask).sum())   # GenOMA wrong, baseline correct\n",
    "    d = int(((~gen_ok) & (~base_ok) & mask).sum())  # Both wrong\n",
    "    N = int(mask.sum())\n",
    "\n",
    "    return {\"a\": a, \"b\": b, \"c\": c, \"d\": d, \"N\": N}\n",
    "\n",
    "def mcnemar_exact_p(b: int, c: int) -> float:\n",
    "    \"\"\"McNemar exact binomial (two-sided) p-value; valid for any b+c.\"\"\"\n",
    "    n = b + c\n",
    "    if n == 0:\n",
    "        return 1.0\n",
    "    k = min(b, c)\n",
    "    tail = sum(comb(n, i) for i in range(0, k + 1)) / (2 ** n)\n",
    "    return min(1.0, 2 * tail)\n",
    "\n",
    "# ==== Computation ====\n",
    "counts = compute_mcnemar_counts(df, include_empty_gold=False)  # Set to True to include rows with empty true_CUI\n",
    "a, b, c, d, N = counts[\"a\"], counts[\"b\"], counts[\"c\"], counts[\"d\"], counts[\"N\"]\n",
    "p_exact = mcnemar_exact_p(b, c)\n",
    "\n",
    "print(f\"N={N}\")\n",
    "print(f\"a={a}, b={b}, c={c}, d={d}\")\n",
    "print(f\"McNemar exact binomial (two-sided) p = {p_exact:.4g}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluation for XGS dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Your CSV file path\n",
    "csv_file_path = \"2result.xlsx\"\n",
    "\n",
    "# Read CSV, skip the first row, and select only the desired columns\n",
    "df = pd.read_excel(csv_file_path)\n",
    "\n",
    "# Show the first few rows to verify\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define model columns\n",
    "models = ['agent_CUI', 'phetag_CUI', 'BERT_CUI', 'meta_CUI', 'ctake_CUI', 'GPT5_CUI']\n",
    "\n",
    "# Map to more readable model names\n",
    "model_name_map = {\n",
    "    'agent_CUI': 'GenOMA',\n",
    "    'phetag_CUI': 'PhenoTagger',\n",
    "    'BERT_CUI': 'PhenoBERT',\n",
    "    'meta_CUI': 'MetaMap',\n",
    "    'ctake_CUI': 'cTAKES',\n",
    "    'GPT5_CUI': 'GPT-5'\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = {\n",
    "    'Model': [],\n",
    "    'With_Term_Correct': [],\n",
    "    'With_Term_Wrong': [],     # Predicted but mismatched\n",
    "    'With_Term_NoPred': [],    # No prediction made\n",
    "    'With_Term_Total': [],\n",
    "    'No_Term_Correct': [],\n",
    "    'No_Term_Incorrect': [],   # Should be empty but predicted something\n",
    "    'No_Term_Total': []\n",
    "}\n",
    "\n",
    "for model in models:\n",
    "    model_name = model_name_map.get(model, model)\n",
    "\n",
    "    with_term = df[df['true_CUI'].notna() & (df['true_CUI'] != '')]\n",
    "    no_term   = df[df['true_CUI'].isna() | (df['true_CUI'] == '')]\n",
    "\n",
    "    # Cases where the gold has a term\n",
    "    with_term_total   = len(with_term)\n",
    "    with_term_correct = (with_term[model] == with_term['true_CUI']).sum()\n",
    "    with_term_nopred  = (with_term[model].isna() | (with_term[model] == '')).sum()\n",
    "    with_term_wrong   = with_term_total - with_term_correct - with_term_nopred\n",
    "\n",
    "    # Cases where the gold has no term\n",
    "    no_term_total     = len(no_term)\n",
    "    no_term_correct   = (no_term[model].isna() | (no_term[model] == '')).sum()\n",
    "    no_term_incorrect = no_term_total - no_term_correct  # Should be empty but output produced\n",
    "\n",
    "    # Save results\n",
    "    results['Model'].append(model_name)\n",
    "    results['With_Term_Correct'].append(with_term_correct)\n",
    "    results['With_Term_Wrong'].append(with_term_wrong)\n",
    "    results['With_Term_NoPred'].append(with_term_nopred)\n",
    "    results['With_Term_Total'].append(with_term_total)\n",
    "    results['No_Term_Correct'].append(no_term_correct)\n",
    "    results['No_Term_Incorrect'].append(no_term_incorrect)\n",
    "    results['No_Term_Total'].append(no_term_total)\n",
    "\n",
    "summary_df = pd.DataFrame(results)\n",
    "print(summary_df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ---- 1) Derived columns -----------------------------------------------------\n",
    "summary_df['With_Term_Incorrect'] = summary_df['With_Term_Total'] - summary_df['With_Term_Correct']\n",
    "summary_df['No_Term_Incorrect']   = summary_df['No_Term_Total']   - summary_df['No_Term_Correct']\n",
    "\n",
    "if 'With_Term_Wrong' not in summary_df.columns or 'With_Term_NoPred' not in summary_df.columns:\n",
    "    if 'With_Term_Predicted' in summary_df.columns:\n",
    "        # When a \"predicted count\" column is present: Wrong = Predicted - Correct; NoPred = Total - Predicted\n",
    "        summary_df['With_Term_Wrong']  = (summary_df['With_Term_Predicted'] - summary_df['With_Term_Correct']).clip(lower=0)\n",
    "        summary_df['With_Term_NoPred'] = (summary_df['With_Term_Total']     - summary_df['With_Term_Predicted']).clip(lower=0)\n",
    "    elif 'With_Term_NoPred' in summary_df.columns:\n",
    "        summary_df['With_Term_Wrong']  = (summary_df['With_Term_Incorrect'] - summary_df['With_Term_NoPred']).clip(lower=0)\n",
    "    elif 'With_Term_Wrong' in summary_df.columns:\n",
    "        summary_df['With_Term_NoPred'] = (summary_df['With_Term_Incorrect'] - summary_df['With_Term_Wrong']).clip(lower=0)\n",
    "    else:\n",
    "        warnings.warn(\n",
    "            \"Unable to infer With_Term_Wrong / With_Term_NoPred from summary_df. \"\n",
    "            \"Temporarily treat all errors as Wrong; consider providing With_Term_Predicted or With_Term_NoPred.\"\n",
    "        )\n",
    "        summary_df['With_Term_Wrong']  = summary_df['With_Term_Incorrect'].copy()\n",
    "        summary_df['With_Term_NoPred'] = 0\n",
    "\n",
    "# ---- 2) Backfill correction: enforce Correct+Wrong+NoPred == With_Term_Total ----\n",
    "summary_df['With_Term_NoPred'] = summary_df['With_Term_NoPred'].clip(lower=0)\n",
    "summary_df['With_Term_Wrong']  = (\n",
    "    summary_df['With_Term_Total']\n",
    "    - summary_df['With_Term_Correct']\n",
    "    - summary_df['With_Term_NoPred']\n",
    ").clip(lower=0)\n",
    "\n",
    "# (Optional) Assertion check\n",
    "assert (\n",
    "    (summary_df['With_Term_Correct'] + summary_df['With_Term_Wrong'] + summary_df['With_Term_NoPred'])\n",
    "    == summary_df['With_Term_Total']\n",
    ").all(), \"Decomposition does not sum to With_Term_Total.\"\n",
    "\n",
    "# ---- 3) painting----------------------------------------------------------------\n",
    "bar_width = 0.45\n",
    "x = np.arange(len(summary_df['Model']))\n",
    "gap = 0\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "\n",
    "# color\n",
    "color_with_correct   = '#66c2a5'  \n",
    "color_with_wrong     = '#fc8d62'  \n",
    "color_with_nopred    = '#fef0d9'  \n",
    "# Unused right column colors are reserved\n",
    "color_no_correct     = '#ccece6'\n",
    "color_no_incorrect   = '#fddbc7'\n",
    "\n",
    "# Left column (gold has term)\n",
    "ax.bar(x - gap, summary_df['With_Term_Correct'],  bar_width,\n",
    "       label='Correct (with term)', color=color_with_correct)\n",
    "ax.bar(x - gap, summary_df['With_Term_Wrong'],    bar_width,\n",
    "       bottom=summary_df['With_Term_Correct'],\n",
    "       label='Type I error', color=color_with_wrong)\n",
    "ax.bar(x - gap, summary_df['With_Term_NoPred'],   bar_width,\n",
    "       bottom=summary_df['With_Term_Correct'] + summary_df['With_Term_Wrong'],\n",
    "       label='Type II error', color=color_with_nopred)\n",
    "\n",
    "# legend\n",
    "ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1),\n",
    "          fancybox=False, shadow=False, ncol=1, fontsize=12)\n",
    "\n",
    "# Axes and Titles\n",
    "ax.set_xlabel('Model', fontsize=12)\n",
    "ax.set_ylabel('Number of Samples', fontsize=12)\n",
    "ax.set_title('Outcome Decomposition of Model Predictions (XGS)', fontsize=16)\n",
    "ax.set_xticks(x)\n",
    "ax.set_xticklabels(summary_df['Model'], fontsize=12)\n",
    "ax.tick_params(axis='y', labelsize=12)\n",
    "\n",
    "# y \n",
    "max_height_with = summary_df['With_Term_Total'].max()\n",
    "max_height_no   = summary_df['No_Term_Total'].max()\n",
    "max_height = max(max_height_with, max_height_no)\n",
    "ax.set_ylim(0, max_height * 1.12)\n",
    "\n",
    "# ---- 4) Integer percentage labels (maximum remainder method, ensuring totals 100%) --------------------------\n",
    "label_min_frac = 0.03  \n",
    "\n",
    "def int_percentages(parts, total):\n",
    "    \"\"\"返回三个整数百分比，合计必为 100。\"\"\"\n",
    "    if total <= 0:\n",
    "        return [0, 0, 0]\n",
    "    parts = np.array(parts, dtype=float)\n",
    "    raw = parts / float(total) * 100.0          \n",
    "    flo = np.floor(raw)                         \n",
    "    remain = int(100 - flo.sum())               \n",
    "    if remain > 0:\n",
    "        remainders = raw - flo\n",
    "        order = np.argsort(-remainders)         \n",
    "        flo[order[:remain]] += 1\n",
    "    return flo.astype(int).tolist()\n",
    "\n",
    "for i in range(len(summary_df)):\n",
    "    with_total   = float(summary_df['With_Term_Total'][i])\n",
    "    with_correct = float(summary_df['With_Term_Correct'][i])\n",
    "    with_wrong   = float(summary_df['With_Term_Wrong'][i])\n",
    "    with_nopred  = float(summary_df['With_Term_NoPred'][i])\n",
    "\n",
    "    if with_total <= 0:\n",
    "        continue\n",
    "\n",
    "    # Calculate integer percentages (guaranteed to be 100%)\n",
    "    int_pcts = int_percentages([with_correct, with_wrong, with_nopred], with_total)\n",
    "    int_labels = [f\"{p}%\" for p in int_pcts]\n",
    "\n",
    "    x_left = x[i] - gap\n",
    "    bottoms = [0.0, with_correct, with_correct + with_wrong]\n",
    "    heights = [with_correct, with_wrong, with_nopred]\n",
    "\n",
    "    for (b, h, lab) in zip(bottoms, heights, int_labels):\n",
    "        if h <= 0:\n",
    "            continue\n",
    "        frac = h / with_total\n",
    "        if frac >= label_min_frac:\n",
    "            y_pos, va, y_adj = b + h / 2.0, 'center', 0\n",
    "        else:\n",
    "            y_pos, va, y_adj = b + h, 'bottom', 1\n",
    "        ax.text(x_left, y_pos + y_adj, lab, ha='center', va=va, fontsize=12)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.subplots_adjust(bottom=0.25)\n",
    "plt.show()\n"
   ]
  },
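  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Quick sanity check of `int_percentages` (largest remainder method) on made-up counts: naive flooring of three equal shares would give 99, while the largest-remainder step redistributes the leftover point so the labels always total 100."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative counts only (not dataset values): three equal parts.\n",
    "demo = int_percentages([1, 1, 1], 3)\n",
    "print(demo, 'sum =', sum(demo))  # one share gets the leftover point, e.g. [34, 33, 33]\n"
   ]
  },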
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define model-to-column mapping\n",
    "model_columns = {\n",
    "    'GenOMA': 'agent_CUI',\n",
    "    'PhenoTagger': 'phetag_CUI',\n",
    "    'PhenoBERT': 'BERT_CUI',\n",
    "    'MetaMap': 'meta_CUI',\n",
    "    'cTAKES': 'ctake_CUI',\n",
    "    'GPT-5': 'GPT5_CUI',\n",
    "}"
   ]
  },
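  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before computing metrics, it can help to confirm that every mapped prediction column actually exists in `df`; a misspelled column name would otherwise be skipped silently by the metric loop in the next cell. A minimal check:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report any mapped prediction columns missing from the dataframe.\n",
    "missing = {m: col for m, col in model_columns.items() if col not in df.columns}\n",
    "print('Missing prediction columns:', missing if missing else 'none')\n"
   ]
  },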
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "df['true_CUI'] = df['true_CUI'].fillna('None').astype(str).str.strip()\n",
    "\n",
    "results = {}\n",
    "\n",
    "def safe_div(n, d):\n",
    "    return n / d if d > 0 else 0.0\n",
    "\n",
    "for model_name, pred_col in model_columns.items():\n",
    "    if pred_col not in df.columns:\n",
    "        print(f\"Column '{pred_col}' not found, skipping {model_name}\")\n",
    "        continue\n",
    "\n",
    "    df[pred_col] = df[pred_col].fillna('None').astype(str).str.strip()\n",
    "    y_true = df['true_CUI']\n",
    "    y_pred = df[pred_col]\n",
    "\n",
    "    # TP: correct code when gold exists\n",
    "    tp = ((y_pred == y_true) & (y_true != 'None')).sum()\n",
    "\n",
    "    # FP: (1) wrong code when gold exists OR (2) any code when gold is empty\n",
    "    fp_wrong_when_gold = ((y_true != 'None') & (y_pred != 'None') & (y_pred != y_true)).sum()\n",
    "    fp_pred_when_empty = ((y_true == 'None') & (y_pred != 'None')).sum()\n",
    "    fp = fp_wrong_when_gold + fp_pred_when_empty\n",
    "\n",
    "    # FN: predicted nothing when gold exists\n",
    "    fn = ((y_pred == 'None') & (y_true != 'None')).sum()\n",
    "\n",
    "    # TNs are excluded from headline metrics by design\n",
    "    # tn = ((y_true == 'None') & (y_pred == 'None')).sum()  # not used\n",
    "\n",
    "    precision = safe_div(tp, tp + fp)\n",
    "    recall    = safe_div(tp, tp + fn)\n",
    "    f1        = safe_div(2 * precision * recall, (precision + recall))\n",
    "\n",
    "    # Mapping accuracy (positives-only): excludes TNs\n",
    "    mapping_accuracy = safe_div(tp, tp + fp + fn)\n",
    "\n",
    "    results[model_name] = {\n",
    "        'tp': int(tp),\n",
    "        'fp': int(fp),\n",
    "        'fn': int(fn),\n",
    "        'precision': round(precision, 4),\n",
    "        'recall': round(recall, 4),\n",
    "        'f1': round(f1, 4),\n",
    "        'mapping_accuracy': round(mapping_accuracy, 4)\n",
    "    }\n",
    "\n",
    "results_df = pd.DataFrame(results).T\n",
    "print(results_df)\n"
   ]
  },
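  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch, on four made-up rows, of how the positives-only scheme above assigns TP/FP/FN: one correct code, one wrong code, one missed code, and one spurious code (the `pred` column name exists only in this toy example)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy example (made-up values) illustrating the counting scheme above.\n",
    "toy = pd.DataFrame({\n",
    "    'true_CUI': ['C0001', 'C0002', 'C0003', 'None'],\n",
    "    'pred':     ['C0001', 'C9999', 'None',  'C0004'],\n",
    "})\n",
    "yt, yp = toy['true_CUI'], toy['pred']\n",
    "tp = ((yp == yt) & (yt != 'None')).sum()                # correct code -> 1\n",
    "fp = (((yt != 'None') & (yp != 'None') & (yp != yt))    # wrong code when gold exists\n",
    "      | ((yt == 'None') & (yp != 'None'))).sum()        # spurious code -> 2 total\n",
    "fn = ((yp == 'None') & (yt != 'None')).sum()            # missed code -> 1\n",
    "print(f'tp={tp}, fp={fp}, fn={fn}')  # expected: tp=1, fp=2, fn=1\n"
   ]
  },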
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def wilson_ci(k: int, n: int, z: float = 1.96):\n",
    "\n",
    "    if n == 0:\n",
    "        return float('nan'), float('nan'), float('nan')\n",
    "    p_hat = k / n\n",
    "    z2 = z * z\n",
    "    denom = 1.0 + z2 / n\n",
    "    center = (p_hat + z2 / (2 * n)) / denom\n",
    "    half = (z / denom) * math.sqrt(p_hat * (1 - p_hat) / n + z2 / (4 * n * n))\n",
    "    lo = max(0.0, center - half)\n",
    "    hi = min(1.0, center + half)\n",
    "    return p_hat, lo, hi\n",
    "\n",
    "rows = []\n",
    "for model, r in results_df.iterrows():\n",
    "    tp = int(r['tp']); fp = int(r['fp']); fn = int(r['fn'])\n",
    "\n",
    "    # Denominator (same as your definition)\n",
    "    acc_n  = tp + fp + fn             # accuracy\n",
    "    prec_n = tp + fp                  # precision \n",
    "    rec_n  = tp + fn                  # recall \n",
    "\n",
    "    # Wilson CI\n",
    "    acc, acc_lo, acc_hi   = wilson_ci(tp, acc_n)\n",
    "    prec, prec_lo, prec_hi = wilson_ci(tp, prec_n)\n",
    "    rec, rec_lo, rec_hi    = wilson_ci(tp, rec_n)\n",
    "\n",
    "    rows.append({\n",
    "        \"model\": model,\n",
    "        \"tp\": tp, \"fp\": fp, \"fn\": fn,\n",
    "        # Point estimates (can be aligned with your existing columns; recalculated here to avoid cumulative errors)\n",
    "        \"accuracy\": acc, \"accuracy_CI_low\": acc_lo, \"accuracy_CI_high\": acc_hi, \"acc_n\": acc_n,\n",
    "        \"precision\": prec, \"precision_CI_low\": prec_lo, \"precision_CI_high\": prec_hi, \"prec_n\": prec_n,\n",
    "        \"recall\": rec, \"recall_CI_low\": rec_lo, \"recall_CI_high\": rec_hi, \"rec_n\": rec_n,\n",
    "    })\n",
    "\n",
    "metrics_with_ci = pd.DataFrame(rows).set_index(\"model\")\n",
    "\n",
    "#Beautify the number (keep three decimal places; if you prefer two, change to .round(2))\n",
    "cols_to_round = [\n",
    "    \"accuracy\",\"accuracy_CI_low\",\"accuracy_CI_high\",\n",
    "    \"precision\",\"precision_CI_low\",\"precision_CI_high\",\n",
    "    \"recall\",\"recall_CI_low\",\"recall_CI_high\"\n",
    "]\n",
    "metrics_with_ci[cols_to_round] = metrics_with_ci[cols_to_round].round(3)\n",
    "\n",
    "print(metrics_with_ci)\n"
   ]
  },
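  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sanity checks of `wilson_ci` on round, made-up counts: 90/100 should give an interval of roughly (0.83, 0.94), and the bounds stay inside [0, 1] even at k = 0 or k = n."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative counts only (not dataset values).\n",
    "print(wilson_ci(90, 100))  # roughly (0.90, 0.83, 0.94)\n",
    "print(wilson_ci(0, 50))    # lower bound is exactly 0.0\n",
    "print(wilson_ci(50, 50))   # upper bound is exactly 1.0\n"
   ]
  },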
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assuming results_df is already defined\n",
    "results_df = pd.DataFrame(results).T.reset_index().rename(columns={'index': 'Model'})\n",
    "\n",
    "# Custom color: Green from dark to light (soft)\n",
    "green_shades = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']\n",
    "\n",
    "# Manually set the X coordinate position\n",
    "x = np.arange(len(results_df))  \n",
    "bar_width = 0.6  \n",
    "\n",
    "fig, ax = plt.subplots(figsize=(6, 5))\n",
    "\n",
    "# Draw a bar chart\n",
    "bars = ax.bar(x, results_df['mapping_accuracy'], width=bar_width, color=green_shades)\n",
    "\n",
    "# Added accuracy value (percentage)\n",
    "for i, bar in enumerate(bars):\n",
    "    height = bar.get_height()\n",
    "    ax.text(bar.get_x() + bar.get_width()/2, height + 0.02,\n",
    "            f'{height:.2%}', ha='center', va='bottom', fontsize=12)\n",
    "\n",
    "# Set the title and axis\n",
    "ax.set_xlabel('Model', fontsize=10)\n",
    "ax.set_title('Model Accuracy Comparison (XGS)', fontsize=16)\n",
    "ax.set_ylabel('Accuracy', fontsize=10)\n",
    "ax.set_xticks(x)\n",
    "ax.set_xticklabels(results_df['Model'], fontsize=9)\n",
    "\n",
    "#  y\n",
    "ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))\n",
    "\n",
    "ax.set_ylim(0, 1.05)\n",
    "ax.tick_params(axis='y', labelsize=10)\n",
    "\n",
    "# Remove extra borders\n",
    "ax.spines['top'].set_visible(False)\n",
    "ax.spines['right'].set_visible(False)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model and performance data (XGS dataset)\n",
    "models = ['GenOMA', 'PhenoTagger', 'PhenoBERT', 'MetaMap', 'cTAKES', 'GPT-5']\n",
    "precision = [0.9397, 0.8667, 0.8706, 0.5962, 0.7284, 0.6216]\n",
    "recall    = [0.9732, 0.7290, 0.6852, 0.8052, 0.6082, 0.8961]\n",
    "f1_score  = [0.9561, 0.7919, 0.7668, 0.6851, 0.6629, 0.7340]\n",
    "\n",
    "# Indicator name\n",
    "metrics = ['Precision', 'Recall', 'F1 Score']\n",
    "metric_values = [precision, recall, f1_score]\n",
    "\n",
    "# Coordinate settings\n",
    "x = np.arange(len(metrics))  # [0, 1, 2]\n",
    "bar_width = 0.14\n",
    "\n",
    "# Professional color palette: 4 yellow-green transition colors from dark to light\n",
    "colors = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']\n",
    "\n",
    "# Set the image style\n",
    "fig, ax = plt.subplots(figsize=(10, 5.5))\n",
    "plt.rcParams.update({'font.size': 12})\n",
    "\n",
    "# Draw the columns for each model\n",
    "for i, model in enumerate(models):\n",
    "    values = [metric[i] for metric in metric_values]\n",
    "    bars = ax.bar(x + i * bar_width, values, bar_width, label=model, color=colors[i])\n",
    "\n",
    "    # Add labels inside columns\n",
    "    for bar in bars:\n",
    "        height = bar.get_height()\n",
    "        ax.annotate(f'{height:.2f}',\n",
    "                    xy=(bar.get_x() + bar.get_width() / 2, height + 0.01),\n",
    "                    ha='center', va='bottom',\n",
    "                    fontsize=10, color='black')\n",
    "\n",
    "# Set up axes and titles\n",
    "ax.set_ylabel('Score', fontsize=14)\n",
    "ax.set_title('Model Evaluation Metric Comparison (XGS)', fontsize=16, pad=15)\n",
    "ax.set_xticks(x + 1.5 * bar_width)\n",
    "ax.set_xticklabels(metrics, fontsize=13)\n",
    "ax.set_ylim(0, 1.05)\n",
    "ax.yaxis.grid(True, linestyle='--', alpha=0.6)\n",
    "\n",
    "# Setting the Legend\n",
    "ax.legend(title=\"Model\", bbox_to_anchor=(1.02, 1), loc='upper left',\n",
    "          borderaxespad=0, fontsize=11, title_fontsize=12, frameon=False)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
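  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The lists above are hardcoded snapshots; as a sketch, the same chart inputs could instead be pulled from `results_df` (built earlier, with `Model`, `precision`, `recall`, and `f1` columns) so the figure cannot drift out of sync with the computed metrics."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: derive the plotted values from results_df instead of hardcoding them.\n",
    "models    = results_df['Model'].tolist()\n",
    "precision = results_df['precision'].astype(float).tolist()\n",
    "recall    = results_df['recall'].astype(float).tolist()\n",
    "f1_scores = results_df['f1'].astype(float).tolist()\n",
    "print(list(zip(models, precision, recall, f1_scores)))\n"
   ]
  },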
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1)Assemble the \"long table\" (containing 6 models; automatically discarding NaNs)\n",
    "df_long = pd.concat([\n",
    "    pd.DataFrame({'Model': 'GenOMA',     'LLM_Score': df['agent_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'PhenoTagger', 'LLM_Score': df['phetag_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'PhenoBERT',   'LLM_Score': df['BERT_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'MetaMap',     'LLM_Score': df['meta_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'cTAKES',      'LLM_Score': df['ctake_LLM_Score']}),\n",
    "    pd.DataFrame({'Model': 'GPT-5',        'LLM_Score': df['GPT5_LLM_Score']}),\n",
    "], ignore_index=True).dropna(subset=['LLM_Score'])\n",
    "\n",
    "# 2) Calculate the mean and standard deviation of each model (can be sorted in descending order by mean)\n",
    "stats = (df_long\n",
    "         .groupby('Model', as_index=False)['LLM_Score']\n",
    "         .agg(mean='mean', std='std'))\n",
    "\n",
    "# Sort by average from high to low (delete this row if not needed)\n",
    "stats = stats.sort_values('mean', ascending=False)\n",
    "\n",
    "models = stats['Model'].tolist()\n",
    "means  = stats['mean'].to_numpy()\n",
    "stds   = stats['std'].fillna(0).to_numpy()  \n",
    "\n",
    "# 3) painting\n",
    "fig, ax = plt.subplots(figsize=(9.5, 5.6))\n",
    "\n",
    "green_shades = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']\n",
    "\n",
    "bars = ax.bar(models, means, yerr=stds, capsize=6, linewidth=0, color=green_shades[:len(models)])\n",
    "\n",
    "# 4) Smart placement of value labels (on top of column + error bars, plus a little adaptive offset)\n",
    "ymin, ymax = ax.get_ylim()\n",
    "offset = 0.015 * (ymax - ymin)  \n",
    "for rect, mean, std in zip(bars, means, stds):\n",
    "    top = rect.get_height() + std + offset\n",
    "    ax.text(rect.get_x() + rect.get_width()/2, top,\n",
    "            f'{mean:.3f}',\n",
    "            ha='center', va='bottom', fontsize=10)\n",
    "\n",
    "# 5) Axes and Styles\n",
    "ax.set_ylabel('Mean LLM Similarity Score', fontsize=12)\n",
    "ax.set_title('Mean LLM Score with Standard Deviation (XGS)', fontsize=14, pad=10)\n",
    "ax.set_ylim(0, max(means + stds) * 1.15)    \n",
    "ax.yaxis.grid(True, linestyle='--', alpha=0.6)\n",
    "ax.set_axisbelow(True)                     \n",
    "ax.tick_params(axis='x', labelrotation=0)     \n",
    "\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "\n",
    "\n"
   ]
  },
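  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The error bars above show one standard deviation, which reflects the spread of scores rather than uncertainty in the mean. A minimal alternative sketch, assuming `df_long` from the cell above, computes a normal-approximation 95% confidence half-width for each mean instead:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: 95% CI half-widths for each model's mean LLM score\n",
    "# (normal approximation), as an alternative to +/- 1 SD error bars.\n",
    "ci = (df_long\n",
    "      .groupby('Model')['LLM_Score']\n",
    "      .agg(mean='mean', std='std', n='count'))\n",
    "ci['ci95_half'] = 1.96 * ci['std'] / np.sqrt(ci['n'])\n",
    "print(ci.round(3))\n"
   ]
  },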
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "GOLD_COL = \"true_CUI\"       \n",
    "GEN_COL  = \"agent_CUI\"    \n",
    "BASE_COL = \"phetag_CUI\"    \n",
    "def _norm_code(x):\n",
    "  \n",
    "    if pd.isna(x):\n",
    "        return \"\"\n",
    "    s = str(x).strip()\n",
    "    if s.lower() in {\"\", \"none\", \"nan\", \"null\"}:\n",
    "        return \"\"\n",
    "    return s.upper()\n",
    "\n",
    "def compute_mcnemar_counts(\n",
    "    df: pd.DataFrame,\n",
    "    gold_col=GOLD_COL,\n",
    "    gen_col=GEN_COL,\n",
    "    base_col=BASE_COL,\n",
    "    include_empty_gold: bool = False\n",
    "):\n",
    "    g  = df[gold_col].map(_norm_code)\n",
    "    g1 = df[gen_col].map(_norm_code)\n",
    "    g2 = df[base_col].map(_norm_code)\n",
    "\n",
    "    if include_empty_gold:\n",
    "       \n",
    "        mask = pd.Series(True, index=df.index)\n",
    "        gen_ok  = (g1 == g)\n",
    "        base_ok = (g2 == g)\n",
    "    else:\n",
    "       \n",
    "        mask = (g != \"\")\n",
    "        gen_ok  = (g1 == g) & mask\n",
    "        base_ok = (g2 == g) & mask\n",
    "\n",
    "    a = int(((gen_ok)  & (base_ok) & mask).sum())   \n",
    "    b = int(((gen_ok)  & (~base_ok) & mask).sum()) \n",
    "    c = int(((~gen_ok) & (base_ok) & mask).sum())   \n",
    "    d = int(((~gen_ok) & (~base_ok) & mask).sum())  \n",
    "    N = int(mask.sum())\n",
    "\n",
    "    return {\"a\": a, \"b\": b, \"c\": c, \"d\": d, \"N\": N}\n",
    "\n",
    "def mcnemar_exact_p(b: int, c: int) -> float:\n",
    "    \"\"\"McNemar p 。\"\"\"\n",
    "    n = b + c\n",
    "    if n == 0:\n",
    "        return 1.0\n",
    "    k = min(b, c)\n",
    "    tail = sum(comb(n, i) for i in range(0, k + 1)) / (2 ** n)\n",
    "    return min(1.0, 2 * tail)\n",
    "\n",
    "\n",
    "counts = compute_mcnemar_counts(df, include_empty_gold=False)  \n",
    "a, b, c, d, N = counts[\"a\"], counts[\"b\"], counts[\"c\"], counts[\"d\"], counts[\"N\"]\n",
    "p_exact = mcnemar_exact_p(b, c)\n",
    "\n",
    "print(f\"N={N}\")\n",
    "print(f\"a={a}, b={b}, c={c}, d={d}\")\n",
    "print(f\"McNemar p = {p_exact:.4g}\")"
   ]
  }
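  ,
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional cross-check: if `statsmodels` happens to be installed (it is not otherwise required by this notebook), its exact McNemar implementation should agree with the hand-rolled p-value above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional cross-check of the exact McNemar p-value (requires statsmodels).\n",
    "try:\n",
    "    from statsmodels.stats.contingency_tables import mcnemar\n",
    "    res = mcnemar([[a, b], [c, d]], exact=True)\n",
    "    print(f\"statsmodels exact p = {res.pvalue:.4g}\")\n",
    "except ImportError:\n",
    "    print('statsmodels not installed; skipping cross-check.')\n"
   ]
  }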
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
