{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "scoring function for complement, factualQA, and random choice"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns = [\n",
    "'perplexity', 'energy', 'normalized_entropy',\n",
    "       'lexical_similarity', 'eigenscore_original',  'eigenscore_output',\n",
    "   'eigenscore_variant'\n",
    "'semantic_entropy'\n",
    "\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# input: a csv file with columns \"label\" (the two values are \"original\" and \"expand\"), \"set_id\"\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "def compute_complement_accuracy(df, score_column=\"semantic_entropy\"):\n",
    "    reverse = score_column == \"lexical_similarity\"\n",
    "    \n",
    "    correct = 0\n",
    "    total = 0\n",
    "    for combo_id, group in df.groupby(\"set_id\"): # change set_id to combo_id\n",
    "        if len(group) != 2:\n",
    "            continue  # skip incomplete pairs\n",
    "        original = group[group[\"label\"] == \"original\"][score_column].values[0]\n",
    "        complement = group[group[\"label\"] == \"expand\"][score_column].values[0] # change expand back to complement\n",
    "        \n",
    "        if reverse:\n",
    "            if complement < original:\n",
    "                correct += 1\n",
    "        else:\n",
    "            if complement > original:\n",
    "                correct += 1\n",
    "        total += 1\n",
    "\n",
    "    if total == 0:\n",
    "        return 0.0, 0.0\n",
    "\n",
    "    acc = correct / total\n",
    "    se = np.sqrt(acc * (1 - acc) / total)\n",
    "    error_bar = 1.96 * se\n",
    "    return f\"{acc:.4f} ± {error_bar:.4f}\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "\n",
    "for column in columns:\n",
    "    value, error = compute_pairwise_accuracy(df, column)\n",
    "    print(column, value, error)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "scoring function for subset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "import pandas as pd\n",
    "import math\n",
    "\n",
    "def compute_pairwise_accuracy(df: pd.DataFrame, metric_name: str) -> tuple[float, float]:\n",
    "    \"\"\"\n",
    "    Computes ranking accuracy of a metric over 5 specificity levels for each object group,\n",
    "    and returns accuracy ± 95% CI (1.96 * standard error).\n",
    "    \n",
    "    Returns:\n",
    "        tuple: (accuracy, error_bar)\n",
    "    \"\"\"\n",
    "    total_comparisons = 0\n",
    "    total_correct = 0\n",
    "\n",
    "    for obj_id, group in df.groupby(\"combo_id\"):\n",
    "        group = group.sort_values(\"specificity_level\")\n",
    "        scores = list(group[metric_name])\n",
    "\n",
    "        if len(scores) != 5:\n",
    "            raise ValueError(f\"Object '{obj_id}' has {len(scores)} specificity levels (expected 5)\")\n",
    "\n",
    "        for i, j in itertools.combinations(range(5), 2):\n",
    "            total_comparisons += 1\n",
    "            if metric_name in {\"lexical_similarity\", \"word_count\"}:\n",
    "                if scores[i] < scores[j]:  # higher score for lower specificity expected\n",
    "                    total_correct += 1\n",
    "            else:\n",
    "                if scores[i] > scores[j]:  # higher score for lower specificity expected\n",
    "                    total_correct += 1\n",
    "\n",
    "    accuracy = total_correct / total_comparisons if total_comparisons > 0 else 0.0\n",
    "\n",
    "    # Standard error for a proportion\n",
    "    if total_comparisons > 0:\n",
    "        std_err = math.sqrt((accuracy * (1 - accuracy)) / total_comparisons)\n",
    "        error_bar = 1.96 * std_err\n",
    "    else:\n",
    "        error_bar = 0.0\n",
    "\n",
    "    return accuracy, error_bar\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "for column in columns:\n",
    "    value, error = compute_pairwise_accuracy(df, column)\n",
    "    print(column, value, error)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "scoring function for intersection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from itertools import combinations\n",
    "import numpy as np\n",
    "\n",
    "def compute_and_subset_accuracy(df, score_column=\"score\"):\n",
    "    \"\"\"\n",
    "    Evaluate AND logic: adding constraints → smaller generation space.\n",
    "    For most metrics: score(superset) < score(subset)\n",
    "    For reversed metrics like lexical_similarity: score(superset) > score(subset)\n",
    "\n",
    "    Returns: mean accuracy ± 1.96 * standard error\n",
    "    \"\"\"\n",
    "    reverse_metric = score_column == \"lexical_similarity\"\n",
    "    accuracies = []\n",
    "\n",
    "    for combo_id, group in df.groupby(\"set_id\"):\n",
    "        group = group.set_index(\"elements\")\n",
    "        elements = group.index.tolist()\n",
    "\n",
    "        correct = 0\n",
    "        total = 0\n",
    "\n",
    "        for small, large in combinations(elements, 2):\n",
    "            if set(large) > set(small):  # large is strict superset of small\n",
    "                score_small = group.loc[small, score_column]\n",
    "                score_large = group.loc[large, score_column]\n",
    "\n",
    "                if reverse_metric:\n",
    "                    if score_large > score_small:\n",
    "                        correct += 1\n",
    "                else:\n",
    "                    if score_large < score_small:\n",
    "                        correct += 1\n",
    "\n",
    "                total += 1\n",
    "\n",
    "        if total > 0:\n",
    "            accuracies.append(correct / total)\n",
    "\n",
    "    if accuracies:\n",
    "        mean = np.mean(accuracies)\n",
    "        se = np.std(accuracies, ddof=1) / np.sqrt(len(accuracies))\n",
    "        ci = 1.96 * se\n",
    "        return f\"{mean:.4f} ± {ci:.4f}\"\n",
    "    else:\n",
    "        return \"0.0000 ± 0.0000\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "for column in columns:\n",
    "    result = compute_and_subset_accuracy(df, score_column=column)\n",
    "    print(f\"{column}: {result}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "scoring function for union"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from itertools import combinations\n",
    "import numpy as np\n",
    "\n",
    "def compute_subset_accuracy(df, score_column=\"score\"):\n",
    "    \"\"\"\n",
    "    Evaluates how often superset > subset in score, under union logic.\n",
    "    For lexical_similarity, reversed logic: subset should have higher score (smaller space).\n",
    "    Returns: mean accuracy ± 1.96 * standard error (95% CI)\n",
    "    \"\"\"\n",
    "    reverse_metric = score_column == \"lexical_similarity\"\n",
    "    accuracies = []\n",
    "\n",
    "    for combo_id, group in df.groupby(\"combo_id\"):\n",
    "        group = group.set_index(\"elements\")\n",
    "        elements = group.index.tolist()\n",
    "\n",
    "        correct = 0\n",
    "        total = 0\n",
    "\n",
    "        for small, large in combinations(elements, 2):\n",
    "            if set(small) < set(large):  # strict subset\n",
    "                score_small = group.loc[small, score_column]\n",
    "                score_large = group.loc[large, score_column]\n",
    "\n",
    "                if reverse_metric:\n",
    "                    if score_small > score_large:\n",
    "                        correct += 1\n",
    "                else:\n",
    "                    if score_large > score_small:\n",
    "                        correct += 1\n",
    "\n",
    "                total += 1\n",
    "\n",
    "        if total > 0:\n",
    "            accuracies.append(correct / total)\n",
    "\n",
    "    if accuracies:\n",
    "        mean = np.mean(accuracies)\n",
    "        stderr = np.std(accuracies, ddof=1) / np.sqrt(len(accuracies))\n",
    "        ci = 1.96 * stderr\n",
    "        return f\"{mean:.4f} ± {ci:.4f}\"\n",
    "    else:\n",
    "        return \"0.0000 ± 0.0000\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for column in columns:\n",
    "    print(f\"{column}: {compute_subset_accuracy(df, score_column=column)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "t-test code (for RIFTS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from scipy.stats import ttest_ind\n",
    "\n",
    "# Collapse labels into two categories\n",
    "def map_label(lbl):\n",
    "    if lbl in [\"addressing\", \"ambiguous\"]:\n",
    "        return \"Group1\"  # collapsed category 1\n",
    "    else:  # \"advancing\" or \"none\"\n",
    "        return \"Group2\"\n",
    "\n",
    "llama[\"group\"] = llama[\"label\"].map(map_label)\n",
    "\n",
    "# list of columns to test\n",
    "columns = [\n",
    "'perplexity', 'energy', 'normalized_entropy',\n",
    "       'lexical_similarity', 'eigenscore_original',  'eigenscore_output',\n",
    "   'eigenscore_variant'\n",
    "'semantic_entropy'\n",
    "\n",
    "]\n",
    "\n",
    "results = []\n",
    "for col in columns:\n",
    "    g1 = llama.loc[llama[\"group\"] == \"Group1\", col].dropna()\n",
    "    g2 = llama.loc[llama[\"group\"] == \"Group2\", col].dropna()\n",
    "    # two-sample t-test (equal_var=False is Welch’s t-test, safer for unequal variances)\n",
    "    stat, pval = ttest_ind(g1, g2, equal_var=False)\n",
    "    \n",
    "    results.append({\n",
    "        \"column\": col,\n",
    "        \"group1_mean\": g1.mean(),\n",
    "        \"group2_mean\": g2.mean(),\n",
    "        \"t_stat\": stat,\n",
    "        \"p_value\": pval\n",
    "    })\n",
    "\n",
    "results_df = pd.DataFrame(results)\n",
    "print(results_df)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "bruh",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
