{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "9ef66df8",
   "metadata": {},
   "source": [
    "# Evaluation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ffa86ad6",
   "metadata": {},
   "source": [
    "### Answers "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "7ef05437",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import pandas as pd\n",
    "from utils import jaccard_similarity\n",
    "import datetime\n",
    "\n",
    "def get_answer_set(df, q_serie, task):\n",
    "    match = df[(df[\"Q_serie\"] == q_serie) & (df[\"task\"] == task)]\n",
    "    if not match.empty:\n",
    "        return set(match[\"Answer\"].values[0])\n",
    "    return set()\n",
    "\n",
    "def load_question(file_path: str) -> pd.DataFrame:\n",
    "    df = pd.read_csv(file_path, sep=\"\\t\", encoding=\"utf-8\")\n",
    "    # df[\"type\"] = df[\"type\"].apply(lambda x: str(x) if not pd.isna(x) else \"0\")\n",
    "    return df\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "ba5477c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_all_questions(root_dir, datasets, languages):\n",
    "    \"\"\"\n",
    "    Load and merge question files from multiple datasets and languages.\n",
    "\n",
    "    Args:\n",
    "        root_dir (str): Base directory containing the question files.\n",
    "        datasets (list): List of dataset names.\n",
    "        languages (list): List of language codes.\n",
    "        load_questions_fn (Callable): Function to load a TSV file into a DataFrame.\n",
    "\n",
    "    Returns:\n",
    "        pd.DataFrame: Merged DataFrame with original index stored as 'q_index',\n",
    "                      and columns 'dataset' and 'lang' added.\n",
    "    \"\"\"\n",
    "    all_dfs = []\n",
    "\n",
    "    for dataset in datasets:\n",
    "        for lang in languages:\n",
    "            question_path = os.path.join(root_dir, \"data\", \"Dataset\", lang, f\"{dataset}.tsv\")\n",
    "            if not os.path.exists(question_path):\n",
    "                print(f\"File not found: {question_path}\")\n",
    "                continue\n",
    "\n",
    "            df = load_question(question_path)\n",
    "            df = df.copy()\n",
    "            df[\"q_index\"] = df.index\n",
    "            df[\"dataset\"] = dataset\n",
    "            df[\"lang\"] = lang\n",
    "\n",
    "            all_dfs.append(df)\n",
    "\n",
    "    return pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "ae950616",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_answers(folder: str, datasets, llms, actions, tasks, languages, questions) -> pd.DataFrame:\n",
    "    df_answers = pd.DataFrame(columns=[\"Q_ID\", \"Q_serie\", \"action\", \"task\", \"dataset\", \"lang\",\"llm\"])\n",
    "\n",
    "    json_files = [\n",
    "        os.path.join(root, file)\n",
    "        for root, _, files in os.walk(folder)\n",
    "        for file in files if file.endswith(\".json\")\n",
    "    ]\n",
    "\n",
    "    print(f\"JSON files found: {len(json_files)}\")\n",
    "\n",
    "    for file in json_files:\n",
    "        if not file.split(\"/\")[-1].startswith(\"Q\"):\n",
    "            continue\n",
    "        elements = file.replace(\"_\", \"/\").replace(\".json\", \"\").split(\"/\")\n",
    "        question = next((q for q in questions if q in elements), None)\n",
    "        action = next((a for a in actions if a in elements), \"zero-shot\")\n",
    "        task = next((t for t in tasks if t in elements), None)\n",
    "        dataset = next((d for d in datasets if d in elements), None)\n",
    "        lang = next((l for l in languages if l in elements), None)\n",
    "        llm = next((l for l in llms if l in elements), None)\n",
    "\n",
    "        if all([question, action, task, dataset, llm]):\n",
    "            with open(file, 'r', encoding='utf-8') as f:\n",
    "                data = json.load(f)\n",
    "            df = pd.DataFrame([{\"Q_ID\": key, \"Answer\": value} for key, value in data.items()])\n",
    "            df[\"Q_serie\"] = question\n",
    "            df[\"action\"] = action\n",
    "            df[\"task\"] = task\n",
    "            df[\"dataset\"] = dataset\n",
    "            df[\"llm\"] = llm\n",
    "            df[\"lang\"] = lang\n",
    "            df_answers = pd.concat([df_answers, df], ignore_index=True)\n",
    "\n",
    "    return df_answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "e4caacd4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def enrich_answers(df_answers, df_questions):\n",
    "    df_answers[\"Question\"] = df_answers.apply(\n",
    "        lambda x: df_questions.loc[\n",
    "            (df_questions[\"q_index\"] == int(x[\"Q_ID\"])) &\n",
    "            (df_questions[\"dataset\"] == x[\"dataset\"])\n",
    "        ][x[\"Q_serie\"]].values[0]\n",
    "        if not df_questions.loc[\n",
    "            (df_questions[\"q_index\"] == int(x[\"Q_ID\"])) &\n",
    "            (df_questions[\"dataset\"] == x[\"dataset\"]) \n",
    "        ].empty else None,\n",
    "        axis=1\n",
    "    )\n",
    "\n",
    "    df_answers.drop_duplicates(\n",
    "        subset=[\"Q_ID\", \"Q_serie\", \"action\", \"task\", \"dataset\", \"llm\"],\n",
    "        inplace=True\n",
    "    )\n",
    "    df_answers[\"Answer\"] = df_answers[\"Answer\"].apply(lambda x: x if isinstance(x, list) else [])\n",
    "    df_answers.reset_index(drop=True, inplace=True)\n",
    "    return df_answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 228,
   "id": "86d11567",
   "metadata": {},
   "outputs": [],
   "source": [
    "def analysis(df):\n",
    "    rows = []\n",
    "    group_keys = [\"Q_ID\", \"action\", \"dataset\", \"llm\"]\n",
    "    grouped = df.groupby(group_keys)\n",
    "\n",
    "    for keys, group in grouped: \n",
    "        if set(group[\"Q_serie\"]) >= {\"Q1\", \"Q2\", \"Q3\", \"Q4\"}:\n",
    "            action = group[\"action\"].values[0]\n",
    "            if action in [\"zero-shot\", \"wikidata\"]:\n",
    "                A1 = get_answer_set(group, \"Q1\", \"equal\")\n",
    "                A2 = get_answer_set(group, \"Q2\", \"equal\")\n",
    "                A3 = get_answer_set(group, \"Q3\", \"sup-sub\")\n",
    "                A4 = get_answer_set(group, \"Q4\", \"minus\")\n",
    "\n",
    "                A1_prime = None\n",
    "                A1_double_prime = None\n",
    "\n",
    "                similarities = {\n",
    "                    \"J(A1-A2)\": round(jaccard_similarity(A1, A2), 4),\n",
    "                    \"J(A1-A34)\": round(jaccard_similarity(A1, A3.union(A4)), 4),\n",
    "                    \"J(A1-A1*)\": None,\n",
    "                    \"J(A1-A1**)\": None,\n",
    "                    \"J(A1*-A1**)\": None\n",
    "                    }\n",
    "                consistency = {\n",
    "                    \"?A1=A2\": int(A1 == A2),\n",
    "                    \"?A1=A3+A4\": int(A1 == A3.union(A4)),\n",
    "                    \"?A1>A3\": int(A3.issubset(A1)),\n",
    "                    \"?A1>A4\": int(A4.issubset(A1)),\n",
    "                    \"?A3∅A4\": int(A3.isdisjoint(A4)),\n",
    "                    \"?A1=A1*\": None,\n",
    "                    \"?A1=A1**\": None,\n",
    "                    \"?A1*=A1**\": None\n",
    "                    }\n",
    "            elif action in ['classification','fixing']:\n",
    "                # Usage\n",
    "                A1_equal = get_answer_set(group, \"Q1\", \"equal\")\n",
    "                A1_contain = get_answer_set(group, \"Q1\", \"sup-sub\")\n",
    "                A1_minus = get_answer_set(group, \"Q1\", \"minus\")\n",
    "                A2_equal = get_answer_set(group, \"Q2\", \"equal\")\n",
    "                A3_contain = get_answer_set(group, \"Q3\", \"sup-sub\")\n",
    "                A3_minus = get_answer_set(group, \"Q3\", \"minus\")\n",
    "                A4_minus = get_answer_set(group, \"Q4\", \"minus\")\n",
    "                similarities = {\n",
    "                    \"J(A1-A2)\": round(jaccard_similarity(A1_equal, A2_equal), 4),\n",
    "                    \"J(A1-A34)\": round(jaccard_similarity(A1_minus, A3_minus.union(A4_minus)), 4),\n",
    "                    \"J(A1-A1*)\": round(jaccard_similarity(A1_equal, A1_contain), 4),\n",
    "                    \"J(A1-A1**)\": round(jaccard_similarity(A1_equal, A1_minus), 4),\n",
    "                    \"J(A1*-A1**)\": round(jaccard_similarity(A1_contain, A1_minus), 4)\n",
    "                    }\n",
    "                consistency = {\n",
    "                    \"?A1=A2\": int(A1_equal == A2_equal),\n",
    "                    \"?A1=A3+A4\": int(A1_minus == A3_minus.union(A4_minus)),\n",
    "                    \"?A1>A3\": int(A3_contain.issubset(A1_contain)),\n",
    "                    \"?A1>A4\": int(A4_minus.issubset(A1_minus)),\n",
    "                    \"?A3∅A4\": int(A3_minus.isdisjoint(A4_minus)),\n",
    "                    \"?A1=A1*\": int(A1_equal == A1_contain),\n",
    "                    \"?A1=A1**\": int(A1_equal == A1_minus),\n",
    "                    \"?A1*=A1**\": int(A1_contain == A1_minus)\n",
    "                    }\n",
    "\n",
    "                A1 = A1_equal\n",
    "                A2 = A2_equal\n",
    "                A3 = A3_contain\n",
    "                A4 = A4_minus\n",
    "                A1_prime = list(A1_contain)\n",
    "                A1_double_prime = list(A1_minus)\n",
    "                \n",
    "            q_map = {\n",
    "                row[\"Q_serie\"]: row[\"Question\"]\n",
    "                for _, row in group.iterrows()\n",
    "                if row[\"Q_serie\"] in {\"Q1\", \"Q2\", \"Q3\", \"Q4\"}\n",
    "            }\n",
    "\n",
    "            row = {\n",
    "                \"Q_ID\": keys[0], \"action\": keys[1], \"dataset\": keys[2], \"llm\": keys[3],\n",
    "                **consistency, **similarities,\n",
    "                \"Q1\": q_map.get(\"Q1\", \"\"), \"Q2\": q_map.get(\"Q2\", \"\"),\n",
    "                \"Q3\": q_map.get(\"Q3\", \"\"), \"Q4\": q_map.get(\"Q4\", \"\"),\n",
    "                \"A1\": list(A1), \"A2\": list(A2), \"A3\": list(A3), \"A4\": list(A4),\n",
    "                \"A1*\": A1_prime, \"A1**\": A1_double_prime\n",
    "            }\n",
    "            rows.append(row)\n",
    "\n",
    "    return pd.DataFrame(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 229,
   "id": "901daf86",
   "metadata": {},
   "outputs": [],
   "source": [
    "def summary(df_analysis):\n",
    "    group_cols = [\"dataset\", \"action\", \"llm\"]\n",
    "    consistency_cols = [\"?A1=A2\", \"?A1=A3+A4\", \"?A1>A3\", \"?A1>A4\", \"?A3∅A4\", \"?A1=A1*\", \"?A1=A1**\",\"?A1*=A1**\"]\n",
    "    jaccard_cols = [\"J(A1-A2)\", \"J(A1-A34)\", \"J(A1-A1*)\", \"J(A1-A1**)\",\"J(A1*-A1**)\"]\n",
    "    pval_cols = [col for col in df_analysis.columns if col.startswith(\"p_value_\")]\n",
    "    metric_cols = consistency_cols + jaccard_cols + pval_cols\n",
    "\n",
    "    for a in [\"A1\", \"A2\", \"A3\", \"A4\"]:\n",
    "        df_analysis[f\"idk_{a}\"] = df_analysis[a].apply(lambda x: int(\n",
    "        (isinstance(x, list) and len(x) == 0)       # []\n",
    "        or (x == \"idk\")                             # \"idk\"\n",
    "        or (isinstance(x, list) and x == [\"idk\"])   # [\"idk\"]\n",
    "    ))\n",
    "\n",
    "    empty_cols = [f\"idk_{a}\" for a in [\"A1\", \"A2\", \"A3\", \"A4\"]]\n",
    "\n",
    "\n",
    "    df_summary = (\n",
    "        df_analysis\n",
    "        .groupby(group_cols)[metric_cols + empty_cols]\n",
    "        .mean()\n",
    "        .reset_index()\n",
    "        .round(4)\n",
    "    )\n",
    "    group_cols_overall = [\"action\", \"llm\"]\n",
    "    df_summary_extend = (\n",
    "        df_analysis\n",
    "        .groupby(group_cols_overall)[metric_cols + empty_cols]\n",
    "        .mean()\n",
    "        .reset_index()\n",
    "        .round(4)\n",
    "    )\n",
    "    df_summary_extend[\"dataset\"] = \"overall\"\n",
    "    \n",
    "    df_summary = pd.concat([df_summary, df_summary_extend], ignore_index=True)\n",
    "    df_summary[\"?A1=A1(ave)\"] = df_summary[[\"?A1=A1*\", \"?A1=A1**\",\"?A1*=A1**\"]].mean(axis=1).round(4)\n",
    "    df_summary[\"J_A1_ave\"] = df_summary[[\"J(A1-A1*)\", \"J(A1-A1**)\", \"J(A1*-A1**)\"]].mean(axis=1).round(4)\n",
    "    return df_summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "7807c2e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))\n",
    "datasets=[\"spinach\", \"qawiki\",'synthetic']\n",
    "# llms = ['gpt-4.1-2025-04-14', 'gpt-4.1-mini-2025-04-14', 'gpt-4.1-nano-2025-04-14', \n",
    "#         'gpt-4o','o3','gpt-5-nano',\"gpt-5-mini\",\"gpt-5\",\n",
    "#         \"gemini-2.0-flash\",\"gemini-2.5-flash\",\"gemini-2.5-pro\",\n",
    "#         \"grok-3-mini\",\"deepseek-chat\",\"deepseek-reasoner\",\"llama3.1:8b\",\"llama3.3:70b\"]\n",
    "llms = ['gpt-5']\n",
    "actions = [\"fixing\", \"classification\", \"wikidata\"]\n",
    "tasks = ['equal', 'sup-sub', \"minus\"]\n",
    "languages = ['en']\n",
    "\n",
    "df_questions = load_all_questions(root_dir, datasets, languages)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "0ef3a9cc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>usparql</th>\n",
       "      <th>Q1</th>\n",
       "      <th>Q2</th>\n",
       "      <th>Q3</th>\n",
       "      <th>Q4</th>\n",
       "      <th>type</th>\n",
       "      <th>q_index</th>\n",
       "      <th>dataset</th>\n",
       "      <th>lang</th>\n",
       "      <th>q1</th>\n",
       "      <th>q2</th>\n",
       "      <th>Type</th>\n",
       "      <th>Unnamed: 7</th>\n",
       "      <th>0: fully containment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>no_usparql_13</td>\n",
       "      <td>Which television shows were created by John Cl...</td>\n",
       "      <td>What television programs did John Cleese create?</td>\n",
       "      <td>Which television shows were created by John Cl...</td>\n",
       "      <td>Which television shows were created by John Cl...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>spinach</td>\n",
       "      <td>en</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>no_usparql_17</td>\n",
       "      <td>Give me all current communist countries.</td>\n",
       "      <td>What countries have a communist government?</td>\n",
       "      <td>Give me all communist countries in Asia.</td>\n",
       "      <td>Give me all communist countries outside of Asia.</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>spinach</td>\n",
       "      <td>en</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>no_usparql_26</td>\n",
       "      <td>Which politicians were married to a German per...</td>\n",
       "      <td>Which politicians had a spouse who was German?</td>\n",
       "      <td>Which politicians were married to a German woman?</td>\n",
       "      <td>Which politicians were married to a German man...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>spinach</td>\n",
       "      <td>en</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>no_usparql_29</td>\n",
       "      <td>Give me all soccer clubs in Spain that play in...</td>\n",
       "      <td>Can you list every soccer club located in Spai...</td>\n",
       "      <td>Please tell me all the Spanish football clubs ...</td>\n",
       "      <td>Please tell me all the Spanish football clubs ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>spinach</td>\n",
       "      <td>en</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>no_usparql_32</td>\n",
       "      <td>Which telecommunications organizations are loc...</td>\n",
       "      <td>Which telecommunications organizations are bas...</td>\n",
       "      <td>Which telecommunications organizations are loc...</td>\n",
       "      <td>Which telecommunications organizations are loc...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>spinach</td>\n",
       "      <td>en</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>445</th>\n",
       "      <td>NaN</td>\n",
       "      <td>In which universities do Nobel Prize Laureates...</td>\n",
       "      <td>Which universities have Nobel laureates on the...</td>\n",
       "      <td>Which universities have Nobel laureates in Eco...</td>\n",
       "      <td>Which universities have Nobel laureates on the...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>145</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>en</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>446</th>\n",
       "      <td>NaN</td>\n",
       "      <td>With which countries does India have ongoing b...</td>\n",
       "      <td>Which countries currently have territorial dis...</td>\n",
       "      <td>With which countries that are nuclear powers d...</td>\n",
       "      <td>With which countries that are not nuclear powe...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>146</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>en</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>447</th>\n",
       "      <td>NaN</td>\n",
       "      <td>What countries on the Arabian Peninsula are co...</td>\n",
       "      <td>Which countries are in the Middle East and on ...</td>\n",
       "      <td>What countries on the Arabian Peninsula are pa...</td>\n",
       "      <td>What countries on the Arabian Peninsula are pa...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>147</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>en</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>448</th>\n",
       "      <td>NaN</td>\n",
       "      <td>What rivers over 500 kilometers long flow into...</td>\n",
       "      <td>Which rivers over 500 kms in length feed into ...</td>\n",
       "      <td>What rivers over 500 kilometers long flow into...</td>\n",
       "      <td>What rivers over 500 kilometers long flow into...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>148</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>en</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>449</th>\n",
       "      <td>NaN</td>\n",
       "      <td>What political parties had at least one minist...</td>\n",
       "      <td>Which parties (excluding independents) have ha...</td>\n",
       "      <td>What political parties had at least one minist...</td>\n",
       "      <td>What political parties had at least one minist...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>149</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>en</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>450 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           usparql                                                 Q1  \\\n",
       "0    no_usparql_13  Which television shows were created by John Cl...   \n",
       "1    no_usparql_17           Give me all current communist countries.   \n",
       "2    no_usparql_26  Which politicians were married to a German per...   \n",
       "3    no_usparql_29  Give me all soccer clubs in Spain that play in...   \n",
       "4    no_usparql_32  Which telecommunications organizations are loc...   \n",
       "..             ...                                                ...   \n",
       "445            NaN  In which universities do Nobel Prize Laureates...   \n",
       "446            NaN  With which countries does India have ongoing b...   \n",
       "447            NaN  What countries on the Arabian Peninsula are co...   \n",
       "448            NaN  What rivers over 500 kilometers long flow into...   \n",
       "449            NaN  What political parties had at least one minist...   \n",
       "\n",
       "                                                    Q2  \\\n",
       "0     What television programs did John Cleese create?   \n",
       "1          What countries have a communist government?   \n",
       "2       Which politicians had a spouse who was German?   \n",
       "3    Can you list every soccer club located in Spai...   \n",
       "4    Which telecommunications organizations are bas...   \n",
       "..                                                 ...   \n",
       "445  Which universities have Nobel laureates on the...   \n",
       "446  Which countries currently have territorial dis...   \n",
       "447  Which countries are in the Middle East and on ...   \n",
       "448  Which rivers over 500 kms in length feed into ...   \n",
       "449  Which parties (excluding independents) have ha...   \n",
       "\n",
       "                                                    Q3  \\\n",
       "0    Which television shows were created by John Cl...   \n",
       "1             Give me all communist countries in Asia.   \n",
       "2    Which politicians were married to a German woman?   \n",
       "3    Please tell me all the Spanish football clubs ...   \n",
       "4    Which telecommunications organizations are loc...   \n",
       "..                                                 ...   \n",
       "445  Which universities have Nobel laureates in Eco...   \n",
       "446  With which countries that are nuclear powers d...   \n",
       "447  What countries on the Arabian Peninsula are pa...   \n",
       "448  What rivers over 500 kilometers long flow into...   \n",
       "449  What political parties had at least one minist...   \n",
       "\n",
       "                                                    Q4  type  q_index  \\\n",
       "0    Which television shows were created by John Cl...   NaN        0   \n",
       "1     Give me all communist countries outside of Asia.   NaN        1   \n",
       "2    Which politicians were married to a German man...   NaN        2   \n",
       "3    Please tell me all the Spanish football clubs ...   NaN        3   \n",
       "4    Which telecommunications organizations are loc...   NaN        4   \n",
       "..                                                 ...   ...      ...   \n",
       "445  Which universities have Nobel laureates on the...   NaN      145   \n",
       "446  With which countries that are not nuclear powe...   NaN      146   \n",
       "447  What countries on the Arabian Peninsula are pa...   NaN      147   \n",
       "448  What rivers over 500 kilometers long flow into...   NaN      148   \n",
       "449  What political parties had at least one minist...   NaN      149   \n",
       "\n",
       "       dataset lang   q1   q2  Type  Unnamed: 7 0: fully containment  \n",
       "0      spinach   en  NaN  NaN   NaN         NaN                  NaN  \n",
       "1      spinach   en  NaN  NaN   NaN         NaN                  NaN  \n",
       "2      spinach   en  NaN  NaN   NaN         NaN                  NaN  \n",
       "3      spinach   en  NaN  NaN   NaN         NaN                  NaN  \n",
       "4      spinach   en  NaN  NaN   NaN         NaN                  NaN  \n",
       "..         ...  ...  ...  ...   ...         ...                  ...  \n",
       "445  synthetic   en  NaN  NaN   NaN         NaN                  NaN  \n",
       "446  synthetic   en  NaN  NaN   NaN         NaN                  NaN  \n",
       "447  synthetic   en  NaN  NaN   NaN         NaN                  NaN  \n",
       "448  synthetic   en  NaN  NaN   NaN         NaN                  NaN  \n",
       "449  synthetic   en  NaN  NaN   NaN         NaN                  NaN  \n",
       "\n",
       "[450 rows x 14 columns]"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_questions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "7572273d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "JSON files found: 1185\n"
     ]
    }
   ],
   "source": [
    "df_answers = load_answers(\n",
    "    folder=root_dir + \"/data/answers/\",\n",
    "    datasets = datasets,\n",
    "    llms=llms,\n",
    "    actions=actions,\n",
    "    tasks=tasks,\n",
    "    languages=languages,\n",
    "    questions=[\"Q1\", \"Q2\", \"Q3\", \"Q4\"]\n",
    ")\n",
    "\n",
    "df_answers = enrich_answers(df_answers, df_questions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "bbd98889",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Q_ID</th>\n",
       "      <th>Q_serie</th>\n",
       "      <th>action</th>\n",
       "      <th>task</th>\n",
       "      <th>dataset</th>\n",
       "      <th>lang</th>\n",
       "      <th>llm</th>\n",
       "      <th>Answer</th>\n",
       "      <th>Question</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>Q1</td>\n",
       "      <td>classification</td>\n",
       "      <td>sup-sub</td>\n",
       "      <td>spinach</td>\n",
       "      <td>None</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>[At Last the 1948 Show, Monty Python's Flying ...</td>\n",
       "      <td>Which television shows were created by John Cl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>Q1</td>\n",
       "      <td>classification</td>\n",
       "      <td>sup-sub</td>\n",
       "      <td>spinach</td>\n",
       "      <td>None</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>[People's Republic of China, Republic of Cuba,...</td>\n",
       "      <td>Give me all current communist countries.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>Q1</td>\n",
       "      <td>classification</td>\n",
       "      <td>sup-sub</td>\n",
       "      <td>spinach</td>\n",
       "      <td>None</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>[idk]</td>\n",
       "      <td>Which politicians were married to a German per...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>Q1</td>\n",
       "      <td>classification</td>\n",
       "      <td>sup-sub</td>\n",
       "      <td>spinach</td>\n",
       "      <td>None</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>[Futbol Club Barcelona, Real Madrid Club de Fú...</td>\n",
       "      <td>Give me all soccer clubs in Spain that play in...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>Q1</td>\n",
       "      <td>classification</td>\n",
       "      <td>sup-sub</td>\n",
       "      <td>spinach</td>\n",
       "      <td>None</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>[idk]</td>\n",
       "      <td>Which telecommunications organizations are loc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9895</th>\n",
       "      <td>146</td>\n",
       "      <td>Q1</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>equal</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>None</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>[Álvaro Dias, Cabo Daciolo, Ciro Gomes, Felipe...</td>\n",
       "      <td>Who has run against Jair Bolsonaro for Preside...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9896</th>\n",
       "      <td>147</td>\n",
       "      <td>Q1</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>equal</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>None</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>[Lee Hae-chan, Han Myeong-sook, Han Duck-soo, ...</td>\n",
       "      <td>Who has served as Prime Minister of South Kore...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9897</th>\n",
       "      <td>148</td>\n",
       "      <td>Q1</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>equal</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>None</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>[Marie Curie, Linus Pauling]</td>\n",
       "      <td>Who has won more than one type of Nobel Prize?</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9898</th>\n",
       "      <td>149</td>\n",
       "      <td>Q1</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>equal</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>None</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>[idk]</td>\n",
       "      <td>Who were the candidates in the 1972 United Sta...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9899</th>\n",
       "      <td>142</td>\n",
       "      <td>Q1</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>equal</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>None</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>[Justine Musk, Talulah Riley]</td>\n",
       "      <td>Who are Elon Musk's ex-spouses?</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>9900 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Q_ID Q_serie          action     task  dataset  lang    llm  \\\n",
       "0       0      Q1  classification  sup-sub  spinach  None  gpt-5   \n",
       "1       1      Q1  classification  sup-sub  spinach  None  gpt-5   \n",
       "2       2      Q1  classification  sup-sub  spinach  None  gpt-5   \n",
       "3       3      Q1  classification  sup-sub  spinach  None  gpt-5   \n",
       "4       4      Q1  classification  sup-sub  spinach  None  gpt-5   \n",
       "...   ...     ...             ...      ...      ...   ...    ...   \n",
       "9895  146      Q1        wikidata    equal   qawiki  None  gpt-5   \n",
       "9896  147      Q1        wikidata    equal   qawiki  None  gpt-5   \n",
       "9897  148      Q1        wikidata    equal   qawiki  None  gpt-5   \n",
       "9898  149      Q1        wikidata    equal   qawiki  None  gpt-5   \n",
       "9899  142      Q1        wikidata    equal   qawiki  None  gpt-5   \n",
       "\n",
       "                                                 Answer  \\\n",
       "0     [At Last the 1948 Show, Monty Python's Flying ...   \n",
       "1     [People's Republic of China, Republic of Cuba,...   \n",
       "2                                                 [idk]   \n",
       "3     [Futbol Club Barcelona, Real Madrid Club de Fú...   \n",
       "4                                                 [idk]   \n",
       "...                                                 ...   \n",
       "9895  [Álvaro Dias, Cabo Daciolo, Ciro Gomes, Felipe...   \n",
       "9896  [Lee Hae-chan, Han Myeong-sook, Han Duck-soo, ...   \n",
       "9897                       [Marie Curie, Linus Pauling]   \n",
       "9898                                              [idk]   \n",
       "9899                      [Justine Musk, Talulah Riley]   \n",
       "\n",
       "                                               Question  \n",
       "0     Which television shows were created by John Cl...  \n",
       "1              Give me all current communist countries.  \n",
       "2     Which politicians were married to a German per...  \n",
       "3     Give me all soccer clubs in Spain that play in...  \n",
       "4     Which telecommunications organizations are loc...  \n",
       "...                                                 ...  \n",
       "9895  Who has run against Jair Bolsonaro for Preside...  \n",
       "9896  Who has served as Prime Minister of South Kore...  \n",
       "9897     Who has won more than one type of Nobel Prize?  \n",
       "9898  Who were the candidates in the 1972 United Sta...  \n",
       "9899                    Who are Elon Musk's ex-spouses?  \n",
       "\n",
       "[9900 rows x 9 columns]"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 230,
   "id": "bee985b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_analysis = analysis(df_answers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 231,
   "id": "1e725f8b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['Q_ID', 'action', 'dataset', 'llm', '?A1=A2', '?A1=A3+A4', '?A1>A3',\n",
      "       '?A1>A4', '?A3∅A4', '?A1=A1*', '?A1=A1**', '?A1*=A1**', 'J(A1-A2)',\n",
      "       'J(A1-A34)', 'J(A1-A1*)', 'J(A1-A1**)', 'J(A1*-A1**)', 'Q1', 'Q2', 'Q3',\n",
      "       'Q4', 'A1', 'A2', 'A3', 'A4', 'A1*', 'A1**'],\n",
      "      dtype='object')\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Q_ID</th>\n",
       "      <th>action</th>\n",
       "      <th>dataset</th>\n",
       "      <th>llm</th>\n",
       "      <th>?A1=A2</th>\n",
       "      <th>?A1=A3+A4</th>\n",
       "      <th>?A1&gt;A3</th>\n",
       "      <th>?A1&gt;A4</th>\n",
       "      <th>?A3∅A4</th>\n",
       "      <th>?A1=A1*</th>\n",
       "      <th>...</th>\n",
       "      <th>Q1</th>\n",
       "      <th>Q2</th>\n",
       "      <th>Q3</th>\n",
       "      <th>Q4</th>\n",
       "      <th>A1</th>\n",
       "      <th>A2</th>\n",
       "      <th>A3</th>\n",
       "      <th>A4</th>\n",
       "      <th>A1*</th>\n",
       "      <th>A1**</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>classification</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>In which countries are tepuis found?</td>\n",
       "      <td>In which countries are tepuis located?</td>\n",
       "      <td>In which Spanish-speaking countries are tepuis...</td>\n",
       "      <td>In which countries that are not Spanish-speaki...</td>\n",
       "      <td>[Venezuela, Guyana, Brazil]</td>\n",
       "      <td>[Venezuela, Guyana, Brazil]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[Guyana, Brazil]</td>\n",
       "      <td>[Venezuela, Guyana, Brazil]</td>\n",
       "      <td>[Venezuela, Guyana, Brazil]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>classification</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>Which television shows were created by John Cl...</td>\n",
       "      <td>What television programs did John Cleese create?</td>\n",
       "      <td>Which television shows were created by John Cl...</td>\n",
       "      <td>Which television shows were created by John Cl...</td>\n",
       "      <td>[At Last the 1948 Show, Fawlty Towers, Monty P...</td>\n",
       "      <td>[At Last the 1948 Show, Fawlty Towers, Monty P...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[At Last the 1948 Show, Fawlty Towers, Monty P...</td>\n",
       "      <td>[At Last the 1948 Show, Fawlty Towers, Monty P...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>classification</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>Name the films for which Quentin Tarantino was...</td>\n",
       "      <td>Which movies were directed by Quentin Tarantino?</td>\n",
       "      <td>Name the films directed by Quentin Tarantino s...</td>\n",
       "      <td>Name the films directed by Quentin Tarantino n...</td>\n",
       "      <td>[The Hateful Eight, Django Unchained, Kill Bil...</td>\n",
       "      <td>[The Hateful Eight, Django Unchained, Kill Bil...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[The Hateful Eight, Django Unchained, Kill Bil...</td>\n",
       "      <td>[The Hateful Eight, Django Unchained, Kill Bil...</td>\n",
       "      <td>[The Hateful Eight, Django Unchained, Kill Bil...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>fixing</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>In which countries are tepuis found?</td>\n",
       "      <td>In which countries are tepuis located?</td>\n",
       "      <td>In which Spanish-speaking countries are tepuis...</td>\n",
       "      <td>In which countries that are not Spanish-speaki...</td>\n",
       "      <td>[Suriname, Guyana, Brazil, Colombia, Venezuela]</td>\n",
       "      <td>[Suriname, Guyana, Brazil, Colombia, Venezuela]</td>\n",
       "      <td>[Colombia, Venezuela]</td>\n",
       "      <td>[Suriname, Guyana, Brazil]</td>\n",
       "      <td>[Colombia, Venezuela, Guyana, Brazil]</td>\n",
       "      <td>[Suriname, Guyana, Brazil, Colombia, Venezuela]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>fixing</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>Which television shows were created by John Cl...</td>\n",
       "      <td>What television programs did John Cleese create?</td>\n",
       "      <td>Which television shows were created by John Cl...</td>\n",
       "      <td>Which television shows were created by John Cl...</td>\n",
       "      <td>[Fawlty Towers, Monty Python's Flying Circus]</td>\n",
       "      <td>[At Last the 1948 Show, Fawlty Towers, Monty P...</td>\n",
       "      <td>[At Last the 1948 Show, Monty Python's Fliegen...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[At Last the 1948 Show, Monty Python's Fliegen...</td>\n",
       "      <td>[At Last the 1948 Show, Fawlty Towers, Monty P...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1795</th>\n",
       "      <td>99</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>Which states border Illinois?</td>\n",
       "      <td>What states share a border with Illinois?</td>\n",
       "      <td>Which states in the Great Lakes region border ...</td>\n",
       "      <td>Which states not in the Great Lakes region bor...</td>\n",
       "      <td>[Indiana, Missouri, Wisconsin, Iowa, Kentucky]</td>\n",
       "      <td>[Indiana, Missouri, Wisconsin, Iowa, Kentucky]</td>\n",
       "      <td>[Indiana, Wisconsin]</td>\n",
       "      <td>[Missouri, Iowa, Kentucky]</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1796</th>\n",
       "      <td>99</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>Which colors are on the flag of both Germany a...</td>\n",
       "      <td>What are the colors used for both the German a...</td>\n",
       "      <td>What primary colors are used for both the Germ...</td>\n",
       "      <td>What non-primary colors are used for both the ...</td>\n",
       "      <td>[black, red]</td>\n",
       "      <td>[black, red]</td>\n",
       "      <td>[yellow, red]</td>\n",
       "      <td>[black]</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1797</th>\n",
       "      <td>99</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>What languages are pro-drop?</td>\n",
       "      <td>What languages drop pronouns?</td>\n",
       "      <td>Which pro-drop languages use Latin script?</td>\n",
       "      <td>Which pro-drop languages do not use Latin script?</td>\n",
       "      <td>[idk]</td>\n",
       "      <td>[idk]</td>\n",
       "      <td>[idk]</td>\n",
       "      <td>[idk]</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1798</th>\n",
       "      <td>99</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>Which states border Illinois?</td>\n",
       "      <td>What states share a border with Illinois?</td>\n",
       "      <td>Which states in the Great Lakes region border ...</td>\n",
       "      <td>Which states not in the Great Lakes region bor...</td>\n",
       "      <td>[Indiana, Missouri, Wisconsin, Iowa, Kentucky]</td>\n",
       "      <td>[Indiana, Missouri, Wisconsin, Iowa, Kentucky]</td>\n",
       "      <td>[Indiana, Wisconsin]</td>\n",
       "      <td>[Missouri, Iowa, Kentucky]</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1799</th>\n",
       "      <td>99</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>Which colors are on the flag of both Germany a...</td>\n",
       "      <td>What are the colors used for both the German a...</td>\n",
       "      <td>What primary colors are used for both the Germ...</td>\n",
       "      <td>What non-primary colors are used for both the ...</td>\n",
       "      <td>[black, yellow, red]</td>\n",
       "      <td>[black, red]</td>\n",
       "      <td>[yellow, red]</td>\n",
       "      <td>[black]</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1800 rows × 27 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Q_ID          action    dataset    llm  ?A1=A2  ?A1=A3+A4  ?A1>A3  \\\n",
       "0       0  classification     qawiki  gpt-5       1          1       1   \n",
       "1       0  classification    spinach  gpt-5       1          1       1   \n",
       "2       0  classification  synthetic  gpt-5       1          1       1   \n",
       "3       0          fixing     qawiki  gpt-5       1          1       1   \n",
       "4       0          fixing    spinach  gpt-5       0          1       1   \n",
       "...   ...             ...        ...    ...     ...        ...     ...   \n",
       "1795   99        wikidata    spinach  gpt-5       1          1       1   \n",
       "1796   99        wikidata  synthetic  gpt-5       1          0       0   \n",
       "1797   99       zero-shot     qawiki  gpt-5       1          1       1   \n",
       "1798   99       zero-shot    spinach  gpt-5       1          1       1   \n",
       "1799   99       zero-shot  synthetic  gpt-5       0          1       1   \n",
       "\n",
       "      ?A1>A4  ?A3∅A4  ?A1=A1*  ...  \\\n",
       "0          1       1      1.0  ...   \n",
       "1          1       1      0.0  ...   \n",
       "2          1       1      1.0  ...   \n",
       "3          1       1      0.0  ...   \n",
       "4          1       1      0.0  ...   \n",
       "...      ...     ...      ...  ...   \n",
       "1795       1       1      NaN  ...   \n",
       "1796       1       1      NaN  ...   \n",
       "1797       1       0      NaN  ...   \n",
       "1798       1       1      NaN  ...   \n",
       "1799       1       1      NaN  ...   \n",
       "\n",
       "                                                     Q1  \\\n",
       "0                  In which countries are tepuis found?   \n",
       "1     Which television shows were created by John Cl...   \n",
       "2     Name the films for which Quentin Tarantino was...   \n",
       "3                  In which countries are tepuis found?   \n",
       "4     Which television shows were created by John Cl...   \n",
       "...                                                 ...   \n",
       "1795                      Which states border Illinois?   \n",
       "1796  Which colors are on the flag of both Germany a...   \n",
       "1797                       What languages are pro-drop?   \n",
       "1798                      Which states border Illinois?   \n",
       "1799  Which colors are on the flag of both Germany a...   \n",
       "\n",
       "                                                     Q2  \\\n",
       "0                In which countries are tepuis located?   \n",
       "1      What television programs did John Cleese create?   \n",
       "2      Which movies were directed by Quentin Tarantino?   \n",
       "3                In which countries are tepuis located?   \n",
       "4      What television programs did John Cleese create?   \n",
       "...                                                 ...   \n",
       "1795          What states share a border with Illinois?   \n",
       "1796  What are the colors used for both the German a...   \n",
       "1797                      What languages drop pronouns?   \n",
       "1798          What states share a border with Illinois?   \n",
       "1799  What are the colors used for both the German a...   \n",
       "\n",
       "                                                     Q3  \\\n",
       "0     In which Spanish-speaking countries are tepuis...   \n",
       "1     Which television shows were created by John Cl...   \n",
       "2     Name the films directed by Quentin Tarantino s...   \n",
       "3     In which Spanish-speaking countries are tepuis...   \n",
       "4     Which television shows were created by John Cl...   \n",
       "...                                                 ...   \n",
       "1795  Which states in the Great Lakes region border ...   \n",
       "1796  What primary colors are used for both the Germ...   \n",
       "1797         Which pro-drop languages use Latin script?   \n",
       "1798  Which states in the Great Lakes region border ...   \n",
       "1799  What primary colors are used for both the Germ...   \n",
       "\n",
       "                                                     Q4  \\\n",
       "0     In which countries that are not Spanish-speaki...   \n",
       "1     Which television shows were created by John Cl...   \n",
       "2     Name the films directed by Quentin Tarantino n...   \n",
       "3     In which countries that are not Spanish-speaki...   \n",
       "4     Which television shows were created by John Cl...   \n",
       "...                                                 ...   \n",
       "1795  Which states not in the Great Lakes region bor...   \n",
       "1796  What non-primary colors are used for both the ...   \n",
       "1797  Which pro-drop languages do not use Latin script?   \n",
       "1798  Which states not in the Great Lakes region bor...   \n",
       "1799  What non-primary colors are used for both the ...   \n",
       "\n",
       "                                                     A1  \\\n",
       "0                           [Venezuela, Guyana, Brazil]   \n",
       "1     [At Last the 1948 Show, Fawlty Towers, Monty P...   \n",
       "2     [The Hateful Eight, Django Unchained, Kill Bil...   \n",
       "3       [Suriname, Guyana, Brazil, Colombia, Venezuela]   \n",
       "4         [Fawlty Towers, Monty Python's Flying Circus]   \n",
       "...                                                 ...   \n",
       "1795     [Indiana, Missouri, Wisconsin, Iowa, Kentucky]   \n",
       "1796                                       [black, red]   \n",
       "1797                                              [idk]   \n",
       "1798     [Indiana, Missouri, Wisconsin, Iowa, Kentucky]   \n",
       "1799                               [black, yellow, red]   \n",
       "\n",
       "                                                     A2  \\\n",
       "0                           [Venezuela, Guyana, Brazil]   \n",
       "1     [At Last the 1948 Show, Fawlty Towers, Monty P...   \n",
       "2     [The Hateful Eight, Django Unchained, Kill Bil...   \n",
       "3       [Suriname, Guyana, Brazil, Colombia, Venezuela]   \n",
       "4     [At Last the 1948 Show, Fawlty Towers, Monty P...   \n",
       "...                                                 ...   \n",
       "1795     [Indiana, Missouri, Wisconsin, Iowa, Kentucky]   \n",
       "1796                                       [black, red]   \n",
       "1797                                              [idk]   \n",
       "1798     [Indiana, Missouri, Wisconsin, Iowa, Kentucky]   \n",
       "1799                                       [black, red]   \n",
       "\n",
       "                                                     A3  \\\n",
       "0                                                    []   \n",
       "1                                                    []   \n",
       "2                                                    []   \n",
       "3                                 [Colombia, Venezuela]   \n",
       "4     [At Last the 1948 Show, Monty Python's Fliegen...   \n",
       "...                                                 ...   \n",
       "1795                               [Indiana, Wisconsin]   \n",
       "1796                                      [yellow, red]   \n",
       "1797                                              [idk]   \n",
       "1798                               [Indiana, Wisconsin]   \n",
       "1799                                      [yellow, red]   \n",
       "\n",
       "                                                     A4  \\\n",
       "0                                      [Guyana, Brazil]   \n",
       "1                                                    []   \n",
       "2     [The Hateful Eight, Django Unchained, Kill Bil...   \n",
       "3                            [Suriname, Guyana, Brazil]   \n",
       "4                                                    []   \n",
       "...                                                 ...   \n",
       "1795                         [Missouri, Iowa, Kentucky]   \n",
       "1796                                            [black]   \n",
       "1797                                              [idk]   \n",
       "1798                         [Missouri, Iowa, Kentucky]   \n",
       "1799                                            [black]   \n",
       "\n",
       "                                                    A1*  \\\n",
       "0                           [Venezuela, Guyana, Brazil]   \n",
       "1     [At Last the 1948 Show, Fawlty Towers, Monty P...   \n",
       "2     [The Hateful Eight, Django Unchained, Kill Bil...   \n",
       "3                 [Colombia, Venezuela, Guyana, Brazil]   \n",
       "4     [At Last the 1948 Show, Monty Python's Fliegen...   \n",
       "...                                                 ...   \n",
       "1795                                               None   \n",
       "1796                                               None   \n",
       "1797                                               None   \n",
       "1798                                               None   \n",
       "1799                                               None   \n",
       "\n",
       "                                                   A1**  \n",
       "0                           [Venezuela, Guyana, Brazil]  \n",
       "1     [At Last the 1948 Show, Fawlty Towers, Monty P...  \n",
       "2     [The Hateful Eight, Django Unchained, Kill Bil...  \n",
       "3       [Suriname, Guyana, Brazil, Colombia, Venezuela]  \n",
       "4     [At Last the 1948 Show, Fawlty Towers, Monty P...  \n",
       "...                                                 ...  \n",
       "1795                                               None  \n",
       "1796                                               None  \n",
       "1797                                               None  \n",
       "1798                                               None  \n",
       "1799                                               None  \n",
       "\n",
       "[1800 rows x 27 columns]"
      ]
     },
     "execution_count": 231,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(df_analysis.columns)\n",
    "df_analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 232,
   "id": "8c5d9b09",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>action</th>\n",
       "      <th>llm</th>\n",
       "      <th>?A1=A2</th>\n",
       "      <th>?A1=A3+A4</th>\n",
       "      <th>?A1&gt;A3</th>\n",
       "      <th>?A1&gt;A4</th>\n",
       "      <th>?A3∅A4</th>\n",
       "      <th>?A1=A1*</th>\n",
       "      <th>?A1=A1**</th>\n",
       "      <th>...</th>\n",
       "      <th>J(A1-A34)</th>\n",
       "      <th>J(A1-A1*)</th>\n",
       "      <th>J(A1-A1**)</th>\n",
       "      <th>J(A1*-A1**)</th>\n",
       "      <th>idk_A1</th>\n",
       "      <th>idk_A2</th>\n",
       "      <th>idk_A3</th>\n",
       "      <th>idk_A4</th>\n",
       "      <th>?A1=A1(ave)</th>\n",
       "      <th>J_A1_ave</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9133</td>\n",
       "      <td>0.9267</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.5200</td>\n",
       "      <td>0.5267</td>\n",
       "      <td>0.6067</td>\n",
       "      <td>...</td>\n",
       "      <td>0.9644</td>\n",
       "      <td>0.7286</td>\n",
       "      <td>0.7324</td>\n",
       "      <td>0.7460</td>\n",
       "      <td>0.4133</td>\n",
       "      <td>0.4067</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.5333</td>\n",
       "      <td>0.5800</td>\n",
       "      <td>0.7357</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8933</td>\n",
       "      <td>0.8933</td>\n",
       "      <td>0.9533</td>\n",
       "      <td>0.9867</td>\n",
       "      <td>0.6200</td>\n",
       "      <td>0.7067</td>\n",
       "      <td>0.7067</td>\n",
       "      <td>...</td>\n",
       "      <td>0.9191</td>\n",
       "      <td>0.8220</td>\n",
       "      <td>0.8343</td>\n",
       "      <td>0.8067</td>\n",
       "      <td>0.5133</td>\n",
       "      <td>0.5267</td>\n",
       "      <td>0.5067</td>\n",
       "      <td>0.5800</td>\n",
       "      <td>0.7000</td>\n",
       "      <td>0.8210</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8200</td>\n",
       "      <td>0.6400</td>\n",
       "      <td>0.7867</td>\n",
       "      <td>0.7667</td>\n",
       "      <td>0.5000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.7825</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.6600</td>\n",
       "      <td>0.6467</td>\n",
       "      <td>0.5400</td>\n",
       "      <td>0.5933</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.6800</td>\n",
       "      <td>0.5667</td>\n",
       "      <td>0.7267</td>\n",
       "      <td>0.7400</td>\n",
       "      <td>0.5867</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.7421</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.5467</td>\n",
       "      <td>0.5267</td>\n",
       "      <td>0.4400</td>\n",
       "      <td>0.5267</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>spinach</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9333</td>\n",
       "      <td>0.8600</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.7000</td>\n",
       "      <td>0.6200</td>\n",
       "      <td>0.6067</td>\n",
       "      <td>...</td>\n",
       "      <td>0.9308</td>\n",
       "      <td>0.7761</td>\n",
       "      <td>0.7870</td>\n",
       "      <td>0.7876</td>\n",
       "      <td>0.3000</td>\n",
       "      <td>0.2933</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.3867</td>\n",
       "      <td>0.6133</td>\n",
       "      <td>0.7836</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>spinach</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9400</td>\n",
       "      <td>0.8400</td>\n",
       "      <td>0.9400</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.7800</td>\n",
       "      <td>0.7267</td>\n",
       "      <td>0.6200</td>\n",
       "      <td>...</td>\n",
       "      <td>0.8977</td>\n",
       "      <td>0.8527</td>\n",
       "      <td>0.7902</td>\n",
       "      <td>0.8160</td>\n",
       "      <td>0.3533</td>\n",
       "      <td>0.3533</td>\n",
       "      <td>0.3333</td>\n",
       "      <td>0.3867</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.8196</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>spinach</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.5867</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.6200</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.7381</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.4533</td>\n",
       "      <td>0.5000</td>\n",
       "      <td>0.3733</td>\n",
       "      <td>0.4600</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>spinach</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.6333</td>\n",
       "      <td>0.4600</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.6733</td>\n",
       "      <td>0.7133</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.6965</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.3800</td>\n",
       "      <td>0.3667</td>\n",
       "      <td>0.3000</td>\n",
       "      <td>0.3400</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8933</td>\n",
       "      <td>0.8867</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.8867</td>\n",
       "      <td>0.6000</td>\n",
       "      <td>0.5867</td>\n",
       "      <td>...</td>\n",
       "      <td>0.9285</td>\n",
       "      <td>0.8146</td>\n",
       "      <td>0.7942</td>\n",
       "      <td>0.8114</td>\n",
       "      <td>0.0933</td>\n",
       "      <td>0.0933</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.1667</td>\n",
       "      <td>0.6000</td>\n",
       "      <td>0.8067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8800</td>\n",
       "      <td>0.9000</td>\n",
       "      <td>0.9667</td>\n",
       "      <td>0.9933</td>\n",
       "      <td>0.9267</td>\n",
       "      <td>0.5667</td>\n",
       "      <td>0.5600</td>\n",
       "      <td>...</td>\n",
       "      <td>0.9482</td>\n",
       "      <td>0.7964</td>\n",
       "      <td>0.7859</td>\n",
       "      <td>0.7853</td>\n",
       "      <td>0.1267</td>\n",
       "      <td>0.1333</td>\n",
       "      <td>0.1200</td>\n",
       "      <td>0.1467</td>\n",
       "      <td>0.5578</td>\n",
       "      <td>0.7892</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.5400</td>\n",
       "      <td>0.7333</td>\n",
       "      <td>0.7800</td>\n",
       "      <td>0.8667</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.7549</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.2133</td>\n",
       "      <td>0.2333</td>\n",
       "      <td>0.1200</td>\n",
       "      <td>0.2067</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.5533</td>\n",
       "      <td>0.3933</td>\n",
       "      <td>0.6067</td>\n",
       "      <td>0.6467</td>\n",
       "      <td>0.8667</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.6867</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.1533</td>\n",
       "      <td>0.1267</td>\n",
       "      <td>0.0467</td>\n",
       "      <td>0.1333</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>overall</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9133</td>\n",
       "      <td>0.8911</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.7022</td>\n",
       "      <td>0.5822</td>\n",
       "      <td>0.6000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.9412</td>\n",
       "      <td>0.7731</td>\n",
       "      <td>0.7712</td>\n",
       "      <td>0.7817</td>\n",
       "      <td>0.2689</td>\n",
       "      <td>0.2644</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.3622</td>\n",
       "      <td>0.5978</td>\n",
       "      <td>0.7753</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>overall</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9044</td>\n",
       "      <td>0.8778</td>\n",
       "      <td>0.9533</td>\n",
       "      <td>0.9844</td>\n",
       "      <td>0.7756</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.6289</td>\n",
       "      <td>...</td>\n",
       "      <td>0.9217</td>\n",
       "      <td>0.8237</td>\n",
       "      <td>0.8035</td>\n",
       "      <td>0.8027</td>\n",
       "      <td>0.3311</td>\n",
       "      <td>0.3378</td>\n",
       "      <td>0.3200</td>\n",
       "      <td>0.3711</td>\n",
       "      <td>0.6415</td>\n",
       "      <td>0.8100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>overall</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.7467</td>\n",
       "      <td>0.5889</td>\n",
       "      <td>0.7578</td>\n",
       "      <td>0.7667</td>\n",
       "      <td>0.6622</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.7585</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.4422</td>\n",
       "      <td>0.4600</td>\n",
       "      <td>0.3444</td>\n",
       "      <td>0.4200</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>overall</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.6222</td>\n",
       "      <td>0.4733</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.6867</td>\n",
       "      <td>0.7222</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.7084</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.3600</td>\n",
       "      <td>0.3400</td>\n",
       "      <td>0.2622</td>\n",
       "      <td>0.3333</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>16 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      dataset          action    llm  ?A1=A2  ?A1=A3+A4  ?A1>A3  ?A1>A4  \\\n",
       "0      qawiki  classification  gpt-5  0.9133     0.9267  1.0000  0.9733   \n",
       "1      qawiki          fixing  gpt-5  0.8933     0.8933  0.9533  0.9867   \n",
       "2      qawiki        wikidata  gpt-5  0.8200     0.6400  0.7867  0.7667   \n",
       "3      qawiki       zero-shot  gpt-5  0.6800     0.5667  0.7267  0.7400   \n",
       "4     spinach  classification  gpt-5  0.9333     0.8600  1.0000  0.9733   \n",
       "5     spinach          fixing  gpt-5  0.9400     0.8400  0.9400  0.9733   \n",
       "6     spinach        wikidata  gpt-5  0.7533     0.5867  0.7533  0.7533   \n",
       "7     spinach       zero-shot  gpt-5  0.6333     0.4600  0.6667  0.6733   \n",
       "8   synthetic  classification  gpt-5  0.8933     0.8867  1.0000  0.9733   \n",
       "9   synthetic          fixing  gpt-5  0.8800     0.9000  0.9667  0.9933   \n",
       "10  synthetic        wikidata  gpt-5  0.6667     0.5400  0.7333  0.7800   \n",
       "11  synthetic       zero-shot  gpt-5  0.5533     0.3933  0.6067  0.6467   \n",
       "12    overall  classification  gpt-5  0.9133     0.8911  1.0000  0.9733   \n",
       "13    overall          fixing  gpt-5  0.9044     0.8778  0.9533  0.9844   \n",
       "14    overall        wikidata  gpt-5  0.7467     0.5889  0.7578  0.7667   \n",
       "15    overall       zero-shot  gpt-5  0.6222     0.4733  0.6667  0.6867   \n",
       "\n",
       "    ?A3∅A4  ?A1=A1*  ?A1=A1**  ...  J(A1-A34)  J(A1-A1*)  J(A1-A1**)  \\\n",
       "0   0.5200   0.5267    0.6067  ...     0.9644     0.7286      0.7324   \n",
       "1   0.6200   0.7067    0.7067  ...     0.9191     0.8220      0.8343   \n",
       "2   0.5000      NaN       NaN  ...     0.7825        NaN         NaN   \n",
       "3   0.5867      NaN       NaN  ...     0.7421        NaN         NaN   \n",
       "4   0.7000   0.6200    0.6067  ...     0.9308     0.7761      0.7870   \n",
       "5   0.7800   0.7267    0.6200  ...     0.8977     0.8527      0.7902   \n",
       "6   0.6200      NaN       NaN  ...     0.7381        NaN         NaN   \n",
       "7   0.7133      NaN       NaN  ...     0.6965        NaN         NaN   \n",
       "8   0.8867   0.6000    0.5867  ...     0.9285     0.8146      0.7942   \n",
       "9   0.9267   0.5667    0.5600  ...     0.9482     0.7964      0.7859   \n",
       "10  0.8667      NaN       NaN  ...     0.7549        NaN         NaN   \n",
       "11  0.8667      NaN       NaN  ...     0.6867        NaN         NaN   \n",
       "12  0.7022   0.5822    0.6000  ...     0.9412     0.7731      0.7712   \n",
       "13  0.7756   0.6667    0.6289  ...     0.9217     0.8237      0.8035   \n",
       "14  0.6622      NaN       NaN  ...     0.7585        NaN         NaN   \n",
       "15  0.7222      NaN       NaN  ...     0.7084        NaN         NaN   \n",
       "\n",
       "    J(A1*-A1**)  idk_A1  idk_A2  idk_A3  idk_A4  ?A1=A1(ave)  J_A1_ave  \n",
       "0        0.7460  0.4133  0.4067  1.0000  0.5333       0.5800    0.7357  \n",
       "1        0.8067  0.5133  0.5267  0.5067  0.5800       0.7000    0.8210  \n",
       "2           NaN  0.6600  0.6467  0.5400  0.5933          NaN       NaN  \n",
       "3           NaN  0.5467  0.5267  0.4400  0.5267          NaN       NaN  \n",
       "4        0.7876  0.3000  0.2933  1.0000  0.3867       0.6133    0.7836  \n",
       "5        0.8160  0.3533  0.3533  0.3333  0.3867       0.6667    0.8196  \n",
       "6           NaN  0.4533  0.5000  0.3733  0.4600          NaN       NaN  \n",
       "7           NaN  0.3800  0.3667  0.3000  0.3400          NaN       NaN  \n",
       "8        0.8114  0.0933  0.0933  1.0000  0.1667       0.6000    0.8067  \n",
       "9        0.7853  0.1267  0.1333  0.1200  0.1467       0.5578    0.7892  \n",
       "10          NaN  0.2133  0.2333  0.1200  0.2067          NaN       NaN  \n",
       "11          NaN  0.1533  0.1267  0.0467  0.1333          NaN       NaN  \n",
       "12       0.7817  0.2689  0.2644  1.0000  0.3622       0.5978    0.7753  \n",
       "13       0.8027  0.3311  0.3378  0.3200  0.3711       0.6415    0.8100  \n",
       "14          NaN  0.4422  0.4600  0.3444  0.4200          NaN       NaN  \n",
       "15          NaN  0.3600  0.3400  0.2622  0.3333          NaN       NaN  \n",
       "\n",
       "[16 rows x 22 columns]"
      ]
     },
     "execution_count": 232,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run summary() over df_analysis and show the resulting table as the cell output.\n",
    "df_summary = summary(df_analysis)\n",
    "df_summary"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "deaf530a",
   "metadata": {},
   "source": [
    "### Relation analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "id": "e931d8fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "# Relation Classification\n",
    "root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__name__))) + \"/data/answers/zero-shot/\"\n",
    "datasets=[\"spinach\", \"qawiki\",'synthetic']\n",
    "# llms = ['gpt-4.1-2025-04-14', 'gpt-4.1-mini-2025-04-14', 'gpt-4.1-nano-2025-04-14', \n",
    "#         'gpt-4o','o3','gpt-5-nano',\"gpt-5-mini\",\"gpt-5\",\n",
    "#         \"gemini-2.0-flash\",\"gemini-2.5-flash\",\"gemini-2.5-pro\",\n",
    "#         \"grok-3-mini\",\"deepseek-chat\",\"deepseek-reasoner\",\"llama3.1:8b\",\"llama3.3:70b\"]\n",
    "llms = ['gpt-5']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "id": "695849b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "def load_relations(root_dir, datasets, llms):\n",
    "    \"\"\"\n",
    "        DataFrame with columns: [\"Q_ID\", \"dataset\", \"llm\", \"R(1-2)\", \"R(1-3)\", \"R(1-4)\", \"R(3-4)\", \"R(1-34)\"]\n",
    "    \"\"\"\n",
    "    # find JSON files\n",
    "    json_files = [\n",
    "        os.path.join(root, file)\n",
    "        for root, _, files in os.walk(root_dir)\n",
    "        for file in files\n",
    "        if file.startswith(\"Relation\") and file.endswith(\".json\")\n",
    "    ]\n",
    "    print(f\"JSON files found: {len(json_files)}\")\n",
    "\n",
    "    # initialize dataframe\n",
    "    df_relation = pd.DataFrame(\n",
    "        columns=[\"Q_ID\", \"dataset\", \"llm\", \"R(1-2)\", \"R(1-3)\", \"R(1-4)\", \"R(3-4)\", \"R(1-34)\"]\n",
    "    )\n",
    "\n",
    "    for file in json_files:\n",
    "        elements = file.replace(\"_\", \"/\").replace(\".json\", \"\").split(\"/\")\n",
    "        dataset = next((d for d in datasets if d in elements), None)\n",
    "        llm = next((l for l in llms if l in elements), None)\n",
    "\n",
    "        if all([dataset, llm]):\n",
    "            with open(file, \"r\", encoding=\"utf-8\") as f:\n",
    "                data = json.load(f)\n",
    "\n",
    "            # transform dict into rows\n",
    "            rows = [\n",
    "                {\n",
    "                    \"dataset\": dataset,\n",
    "                    \"llm\": llm,\n",
    "                    \"Q_ID\": key,\n",
    "                    \"R(1-2)\": value[0],\n",
    "                    \"R(1-3)\": value[1],\n",
    "                    \"R(1-4)\": value[2],\n",
    "                    \"R(3-4)\": value[3],\n",
    "                    \"R(1-34)\": value[4],\n",
    "                }\n",
    "                for key, value in data.items()\n",
    "            ]\n",
    "            df_relation = pd.concat([df_relation, pd.DataFrame(rows)], ignore_index=True)\n",
    "\n",
    "    return df_relation\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "id": "37860f2e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "JSON files found: 37\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Q_ID</th>\n",
       "      <th>dataset</th>\n",
       "      <th>llm</th>\n",
       "      <th>R(1-2)</th>\n",
       "      <th>R(1-3)</th>\n",
       "      <th>R(1-4)</th>\n",
       "      <th>R(3-4)</th>\n",
       "      <th>R(1-34)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Overlap</td>\n",
       "      <td>ContainedBy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>445</th>\n",
       "      <td>145</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>446</th>\n",
       "      <td>146</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Overlap</td>\n",
       "      <td>ContainedBy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>447</th>\n",
       "      <td>147</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>448</th>\n",
       "      <td>148</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>449</th>\n",
       "      <td>149</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>450 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    Q_ID  dataset    llm       R(1-2)    R(1-3)    R(1-4)    R(3-4)  \\\n",
       "0      0  spinach  gpt-5  Equivalence  Contains  Contains  Disjoint   \n",
       "1      1  spinach  gpt-5  Equivalence  Contains  Contains  Disjoint   \n",
       "2      2  spinach  gpt-5  Equivalence  Contains  Contains   Overlap   \n",
       "3      3  spinach  gpt-5  Equivalence  Contains  Contains  Disjoint   \n",
       "4      4  spinach  gpt-5     Contains  Contains  Contains  Disjoint   \n",
       "..   ...      ...    ...          ...       ...       ...       ...   \n",
       "445  145   qawiki  gpt-5  Equivalence  Contains  Contains  Disjoint   \n",
       "446  146   qawiki  gpt-5  Equivalence  Contains  Contains   Overlap   \n",
       "447  147   qawiki  gpt-5  Equivalence  Contains  Contains  Disjoint   \n",
       "448  148   qawiki  gpt-5  Equivalence  Contains  Contains  Disjoint   \n",
       "449  149   qawiki  gpt-5  Equivalence  Contains  Contains  Disjoint   \n",
       "\n",
       "         R(1-34)  \n",
       "0    Equivalence  \n",
       "1    Equivalence  \n",
       "2    ContainedBy  \n",
       "3    Equivalence  \n",
       "4    Equivalence  \n",
       "..           ...  \n",
       "445  Equivalence  \n",
       "446  ContainedBy  \n",
       "447  Equivalence  \n",
       "448  Equivalence  \n",
       "449  Equivalence  \n",
       "\n",
       "[450 rows x 8 columns]"
      ]
     },
     "execution_count": 126,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the relation labels for the configured root_dir, datasets and llms, then display them.\n",
    "df_relation = load_relations(root_dir, datasets, llms)\n",
    "df_relation\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "id": "8f3bcb6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def relation_summary(df_relation):\n",
    "    df_relation_summery = pd.DataFrame(\n",
    "            columns=[\"dataset\", \"llm\", \"R(1-2)\", \"R(1-3)\", \"R(1-4)\", \"R(3-4)\", \"R(1-34)\"]\n",
    "        )\n",
    "\n",
    "    group_keys = [\"dataset\", \"llm\"]\n",
    "    grouped = df_relation.groupby(group_keys)\n",
    "\n",
    "    for keys, group in grouped: \n",
    "        row = {\n",
    "            \"dataset\": keys[0],\n",
    "            \"llm\": keys[1],\n",
    "            \"R(1-2)\":  round((group[\"R(1-2)\"]  == \"Equivalence\").mean(), 4),\n",
    "            \"R(1-3)\":  round((group[\"R(1-3)\"]  == \"Contains\").mean(),    4),\n",
    "            \"R(1-4)\":  round((group[\"R(1-4)\"]  == \"Contains\").mean(),    4),\n",
    "            \"R(3-4)\":  round((group[\"R(3-4)\"]  == \"Disjoint\").mean(),    4),\n",
    "            \"R(1-34)\": round((group[\"R(1-34)\"] == \"Equivalence\").mean(), 4),\n",
    "        }\n",
    "        df_relation_summery = pd.concat([df_relation_summery, pd.DataFrame([row])], ignore_index=True)\n",
    "\n",
    "    group_keys = [\"llm\"]\n",
    "    grouped = df_relation.groupby(group_keys)\n",
    "\n",
    "    for key, group in grouped: \n",
    "        row = {\n",
    "            \"llm\": key[0],\n",
    "            \"dataset\": \"overall\",\n",
    "            \"R(1-2)\":  round((group[\"R(1-2)\"]  == \"Equivalence\").mean(), 4),\n",
    "            \"R(1-3)\":  round((group[\"R(1-3)\"]  == \"Contains\").mean(),    4),\n",
    "            \"R(1-4)\":  round((group[\"R(1-4)\"]  == \"Contains\").mean(),    4),\n",
    "            \"R(3-4)\":  round((group[\"R(3-4)\"]  == \"Disjoint\").mean(),    4),\n",
    "            \"R(1-34)\": round((group[\"R(1-34)\"] == \"Equivalence\").mean(), 4),\n",
    "        }\n",
    "        df_relation_summery = pd.concat([df_relation_summery, pd.DataFrame([row])], ignore_index=True)\n",
    "    return df_relation_summery"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "id": "7ca06a57",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/pp/f9hxx2kn0vbfmjhhcspmjlt40000gq/T/ipykernel_75958/3824595000.py:19: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
      "  df_relation_summery = pd.concat([df_relation_summery, pd.DataFrame([row])], ignore_index=True)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>llm</th>\n",
       "      <th>R(1-2)</th>\n",
       "      <th>R(1-3)</th>\n",
       "      <th>R(1-4)</th>\n",
       "      <th>R(3-4)</th>\n",
       "      <th>R(1-34)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8733</td>\n",
       "      <td>0.9800</td>\n",
       "      <td>0.9800</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.9467</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9200</td>\n",
       "      <td>0.9867</td>\n",
       "      <td>0.9867</td>\n",
       "      <td>0.9467</td>\n",
       "      <td>0.9133</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8867</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.9867</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.9733</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>overall</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8933</td>\n",
       "      <td>0.9800</td>\n",
       "      <td>0.9844</td>\n",
       "      <td>0.9644</td>\n",
       "      <td>0.9444</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     dataset    llm  R(1-2)  R(1-3)  R(1-4)  R(3-4)  R(1-34)\n",
       "0     qawiki  gpt-5  0.8733  0.9800  0.9800  0.9733   0.9467\n",
       "1    spinach  gpt-5  0.9200  0.9867  0.9867  0.9467   0.9133\n",
       "2  synthetic  gpt-5  0.8867  0.9733  0.9867  0.9733   0.9733\n",
       "3    overall  gpt-5  0.8933  0.9800  0.9844  0.9644   0.9444"
      ]
     },
     "execution_count": 128,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summarize df_relation with relation_summary() and display the resulting table.\n",
    "df_relation_summery = relation_summary(df_relation)\n",
    "df_relation_summery"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "efcd184a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "CANONICAL_LABELS = [\n",
    "    \"Equivalence\", \"Contains\", \"ContainedBy\", \"Overlap\", \"Disjoint\", \"Unknown\", \"Else\"\n",
    "]\n",
    "\n",
    "GT = {\n",
    "    \"R(1-2)\":  \"Equivalence\",\n",
    "    \"R(1-3)\":  \"Contains\",\n",
    "    \"R(1-4)\":  \"Contains\",\n",
    "    \"R(3-4)\":  \"Disjoint\",\n",
    "    \"R(1-34)\": \"Equivalence\",\n",
    "}\n",
    "\n",
    "\n",
    "def _normalize_pred(x: object) -> str:\n",
    "    if pd.isna(x):\n",
    "        return \"Unknown\"\n",
    "    s = str(x).strip()\n",
    "    if s in CANONICAL_LABELS:\n",
    "        return s\n",
    "    return \"Else\"\n",
    "\n",
    "def per_model_confusions(\n",
    "    df_relation: pd.DataFrame,\n",
    "    relation_cols=None,\n",
    "    include_overall: bool = True,\n",
    "    round_digits: int = 4,\n",
    "):\n",
    "    \"\"\"\n",
    "    Build complete confusion matrices per (llm, dataset) and per relation column.\n",
    "    Adds an 'overall' dataset per llm if include_overall=True.\n",
    "    \n",
    "    Returns\n",
    "    -------\n",
    "    cms_counts : pd.DataFrame\n",
    "        MultiIndex rows: (llm, dataset, relation, True)\n",
    "        Columns: Equivalence, Contains, ContainedBy, Overlap, Disjoint, Unknown, Else (counts)\n",
    "    cms_ratio : pd.DataFrame\n",
    "        Same shape, row-normalized ratios.\n",
    "    \"\"\"\n",
    "    if relation_cols is None:\n",
    "        relation_cols = list(GT.keys())\n",
    "\n",
    "    needed = {\"dataset\", \"llm\", *relation_cols}\n",
    "    missing = needed - set(df_relation.columns)\n",
    "    if missing:\n",
    "        raise ValueError(f\"df_relation missing columns: {missing}\")\n",
    "\n",
    "    rows_counts, rows_ratio, idx = [], [], []\n",
    "\n",
    "    # 1) Per (llm, dataset)\n",
    "    for (llm, dataset), group in df_relation.groupby([\"llm\", \"dataset\"], dropna=False):\n",
    "        n_group = len(group)\n",
    "        for rel in relation_cols:\n",
    "            truth = GT[rel]\n",
    "            y_pred = group[rel].map(_normalize_pred)\n",
    "            counts = y_pred.value_counts()\n",
    "            row_counts = [int(counts.get(lbl, 0)) for lbl in CANONICAL_LABELS]\n",
    "            row_ratio = [(c / n_group) if n_group > 0 else 0.0 for c in row_counts]\n",
    "            rows_counts.append(row_counts)\n",
    "            rows_ratio.append(row_ratio)\n",
    "            idx.append((llm, dataset, rel, truth))\n",
    "\n",
    "    # 2) Per llm (overall across datasets)\n",
    "    if include_overall:\n",
    "        for llm, group in df_relation.groupby(\"llm\", dropna=False):\n",
    "            n_group = len(group)\n",
    "            for rel in relation_cols:\n",
    "                truth = GT[rel]\n",
    "                y_pred = group[rel].map(_normalize_pred)\n",
    "                counts = y_pred.value_counts()\n",
    "                row_counts = [int(counts.get(lbl, 0)) for lbl in CANONICAL_LABELS]\n",
    "                row_ratio = [(c / n_group) if n_group > 0 else 0.0 for c in row_counts]\n",
    "                rows_counts.append(row_counts)\n",
    "                rows_ratio.append(row_ratio)\n",
    "                idx.append((llm, \"overall\", rel, truth))\n",
    "\n",
    "    index = pd.MultiIndex.from_tuples(idx, names=[\"llm\", \"dataset\", \"relation\", \"True\"])\n",
    "    cms_counts = pd.DataFrame(rows_counts, index=index, columns=CANONICAL_LABELS)\n",
    "    cms_ratio  = pd.DataFrame(rows_ratio,  index=index, columns=CANONICAL_LABELS)\n",
    "    if round_digits is not None:\n",
    "        cms_ratio = cms_ratio.round(round_digits)\n",
    "\n",
    "    return cms_counts, cms_ratio\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "15705135",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Reuse your existing constants if already defined\n",
    "CANONICAL_LABELS = [\n",
    "    \"Equivalence\", \"Contains\", \"ContainedBy\", \"Overlap\", \"Disjoint\", \"Unknown\", \"Else\"\n",
    "]\n",
    "\n",
    "def build_confusion_table(cms_counts: pd.DataFrame,\n",
    "                          cms_ratio: pd.DataFrame,\n",
    "                          round_digits: int = 4) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Create one tidy table:\n",
    "      llm | dataset | relation | True | Accuracy | Size | Equivalence | Contains | ... | Else\n",
    "    where each label column is 'ratio(count)' and Accuracy is for the True label as 'ratio(count)'.\n",
    "    \"\"\"\n",
    "    records = []\n",
    "    for idx in cms_counts.index:\n",
    "        llm, dataset, relation, true_label = idx\n",
    "        counts_row = cms_counts.loc[idx]\n",
    "        ratio_row  = cms_ratio.loc[idx]\n",
    "\n",
    "        N = int(counts_row.sum())\n",
    "        acc_ratio = float(ratio_row.get(true_label, 0.0))\n",
    "        acc_count = int(counts_row.get(true_label, 0))\n",
    "\n",
    "        row = {\n",
    "            \"llm\": llm,\n",
    "            \"dataset\": dataset,\n",
    "            \"relation\": relation,\n",
    "            \"True\": true_label,\n",
    "            \"Accuracy\": f\"{acc_ratio:.{round_digits}f}({acc_count})\",\n",
    "            \"Size\": N,\n",
    "        }\n",
    "\n",
    "        # Add each predicted label as ratio(count)\n",
    "        for lbl in CANONICAL_LABELS:\n",
    "            r = float(ratio_row.get(lbl, 0.0))\n",
    "            c = int(counts_row.get(lbl, 0))\n",
    "            row[lbl] = f\"{r:.{round_digits}f}({c})\"\n",
    "\n",
    "        records.append(row)\n",
    "\n",
    "    out = pd.DataFrame.from_records(records)\n",
    "    # nice ordering\n",
    "    cols = [\"llm\", \"dataset\", \"relation\", \"True\", \"Accuracy\", \"Size\"] + CANONICAL_LABELS\n",
    "    return out[cols]\n",
    "\n",
    "\n",
    "def confusion_table_from_df(df_relation: pd.DataFrame,\n",
    "                            relation_cols=None,\n",
    "                            include_overall: bool = True,\n",
    "                            round_digits: int = 4) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Convenience wrapper: calls your per_model_confusions(...) then builds the table.\n",
    "    \"\"\"\n",
    "    # uses the per_model_confusions you already have\n",
    "    cms_counts, cms_ratio = per_model_confusions(\n",
    "        df_relation,\n",
    "        relation_cols=relation_cols,\n",
    "        include_overall=include_overall,\n",
    "        round_digits=round_digits,\n",
    "    )\n",
    "    return build_confusion_table(cms_counts, cms_ratio, round_digits)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "a51cea33",
   "metadata": {},
   "outputs": [],
   "source": [
    "# From your df_relation:\n",
    "table = confusion_table_from_df(df_relation, include_overall=True, round_digits=4)\n",
    "\n",
    "# Save if you want:\n",
    "table.to_csv(\"../output/relation_summary.csv\", index=False)\n",
    "table.to_excel(\"../output/relation_summary.xlsx\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "cb589167",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>llm</th>\n",
       "      <th>dataset</th>\n",
       "      <th>relation</th>\n",
       "      <th>True</th>\n",
       "      <th>Accuracy</th>\n",
       "      <th>Size</th>\n",
       "      <th>Equivalence</th>\n",
       "      <th>Contains</th>\n",
       "      <th>ContainedBy</th>\n",
       "      <th>Overlap</th>\n",
       "      <th>Disjoint</th>\n",
       "      <th>Unknown</th>\n",
       "      <th>Else</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>R(1-2)</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>0.9667(145)</td>\n",
       "      <td>150</td>\n",
       "      <td>0.9667(145)</td>\n",
       "      <td>0.0133(2)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0200(3)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>R(1-3)</td>\n",
       "      <td>Contains</td>\n",
       "      <td>0.8667(130)</td>\n",
       "      <td>150</td>\n",
       "      <td>0.0067(1)</td>\n",
       "      <td>0.8667(130)</td>\n",
       "      <td>0.1067(16)</td>\n",
       "      <td>0.0200(3)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>R(1-4)</td>\n",
       "      <td>Contains</td>\n",
       "      <td>0.1600(24)</td>\n",
       "      <td>150</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.1600(24)</td>\n",
       "      <td>0.0267(4)</td>\n",
       "      <td>0.7267(109)</td>\n",
       "      <td>0.0867(13)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>R(3-4)</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>0.9400(141)</td>\n",
       "      <td>150</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0067(1)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0533(8)</td>\n",
       "      <td>0.9400(141)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>gemini-2.0-flash</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>R(1-34)</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>0.8400(126)</td>\n",
       "      <td>150</td>\n",
       "      <td>0.8400(126)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0933(14)</td>\n",
       "      <td>0.0667(10)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>235</th>\n",
       "      <td>o3</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(1-2)</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>0.9133(411)</td>\n",
       "      <td>450</td>\n",
       "      <td>0.9133(411)</td>\n",
       "      <td>0.0356(16)</td>\n",
       "      <td>0.0378(17)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0022(1)</td>\n",
       "      <td>0.0111(5)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>236</th>\n",
       "      <td>o3</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(1-3)</td>\n",
       "      <td>Contains</td>\n",
       "      <td>0.9733(438)</td>\n",
       "      <td>450</td>\n",
       "      <td>0.0111(5)</td>\n",
       "      <td>0.9733(438)</td>\n",
       "      <td>0.0022(1)</td>\n",
       "      <td>0.0022(1)</td>\n",
       "      <td>0.0022(1)</td>\n",
       "      <td>0.0089(4)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>237</th>\n",
       "      <td>o3</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(1-4)</td>\n",
       "      <td>Contains</td>\n",
       "      <td>0.9733(438)</td>\n",
       "      <td>450</td>\n",
       "      <td>0.0111(5)</td>\n",
       "      <td>0.9733(438)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0044(2)</td>\n",
       "      <td>0.0089(4)</td>\n",
       "      <td>0.0022(1)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>238</th>\n",
       "      <td>o3</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(3-4)</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>0.9467(426)</td>\n",
       "      <td>450</td>\n",
       "      <td>0.0044(2)</td>\n",
       "      <td>0.0067(3)</td>\n",
       "      <td>0.0044(2)</td>\n",
       "      <td>0.0200(9)</td>\n",
       "      <td>0.9467(426)</td>\n",
       "      <td>0.0178(8)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>239</th>\n",
       "      <td>o3</td>\n",
       "      <td>overall</td>\n",
       "      <td>R(1-34)</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>0.9356(421)</td>\n",
       "      <td>450</td>\n",
       "      <td>0.9356(421)</td>\n",
       "      <td>0.0200(9)</td>\n",
       "      <td>0.0356(16)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "      <td>0.0044(2)</td>\n",
       "      <td>0.0044(2)</td>\n",
       "      <td>0.0000(0)</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>240 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  llm  dataset relation         True     Accuracy  Size  \\\n",
       "0    gemini-2.0-flash   qawiki   R(1-2)  Equivalence  0.9667(145)   150   \n",
       "1    gemini-2.0-flash   qawiki   R(1-3)     Contains  0.8667(130)   150   \n",
       "2    gemini-2.0-flash   qawiki   R(1-4)     Contains   0.1600(24)   150   \n",
       "3    gemini-2.0-flash   qawiki   R(3-4)     Disjoint  0.9400(141)   150   \n",
       "4    gemini-2.0-flash   qawiki  R(1-34)  Equivalence  0.8400(126)   150   \n",
       "..                ...      ...      ...          ...          ...   ...   \n",
       "235                o3  overall   R(1-2)  Equivalence  0.9133(411)   450   \n",
       "236                o3  overall   R(1-3)     Contains  0.9733(438)   450   \n",
       "237                o3  overall   R(1-4)     Contains  0.9733(438)   450   \n",
       "238                o3  overall   R(3-4)     Disjoint  0.9467(426)   450   \n",
       "239                o3  overall  R(1-34)  Equivalence  0.9356(421)   450   \n",
       "\n",
       "     Equivalence     Contains ContainedBy      Overlap     Disjoint  \\\n",
       "0    0.9667(145)    0.0133(2)   0.0000(0)    0.0200(3)    0.0000(0)   \n",
       "1      0.0067(1)  0.8667(130)  0.1067(16)    0.0200(3)    0.0000(0)   \n",
       "2      0.0000(0)   0.1600(24)   0.0267(4)  0.7267(109)   0.0867(13)   \n",
       "3      0.0000(0)    0.0067(1)   0.0000(0)    0.0533(8)  0.9400(141)   \n",
       "4    0.8400(126)    0.0000(0)   0.0000(0)   0.0933(14)   0.0667(10)   \n",
       "..           ...          ...         ...          ...          ...   \n",
       "235  0.9133(411)   0.0356(16)  0.0378(17)    0.0000(0)    0.0022(1)   \n",
       "236    0.0111(5)  0.9733(438)   0.0022(1)    0.0022(1)    0.0022(1)   \n",
       "237    0.0111(5)  0.9733(438)   0.0000(0)    0.0044(2)    0.0089(4)   \n",
       "238    0.0044(2)    0.0067(3)   0.0044(2)    0.0200(9)  0.9467(426)   \n",
       "239  0.9356(421)    0.0200(9)  0.0356(16)    0.0000(0)    0.0044(2)   \n",
       "\n",
       "       Unknown       Else  \n",
       "0    0.0000(0)  0.0000(0)  \n",
       "1    0.0000(0)  0.0000(0)  \n",
       "2    0.0000(0)  0.0000(0)  \n",
       "3    0.0000(0)  0.0000(0)  \n",
       "4    0.0000(0)  0.0000(0)  \n",
       "..         ...        ...  \n",
       "235  0.0111(5)  0.0000(0)  \n",
       "236  0.0089(4)  0.0000(0)  \n",
       "237  0.0022(1)  0.0000(0)  \n",
       "238  0.0178(8)  0.0000(0)  \n",
       "239  0.0044(2)  0.0000(0)  \n",
       "\n",
       "[240 rows x 13 columns]"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "2cb10149",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Q_ID</th>\n",
       "      <th>dataset</th>\n",
       "      <th>llm</th>\n",
       "      <th>R(1-2)</th>\n",
       "      <th>R(1-3)</th>\n",
       "      <th>R(1-4)</th>\n",
       "      <th>R(3-4)</th>\n",
       "      <th>R(1-34)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>spinach</td>\n",
       "      <td>grok-3-mini</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>spinach</td>\n",
       "      <td>grok-3-mini</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>spinach</td>\n",
       "      <td>grok-3-mini</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>spinach</td>\n",
       "      <td>grok-3-mini</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>spinach</td>\n",
       "      <td>grok-3-mini</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5373</th>\n",
       "      <td>145</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5374</th>\n",
       "      <td>146</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Overlap</td>\n",
       "      <td>ContainedBy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5375</th>\n",
       "      <td>147</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5376</th>\n",
       "      <td>148</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5377</th>\n",
       "      <td>149</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gemini-2.5-pro</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5378 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Q_ID  dataset             llm       R(1-2)    R(1-3)    R(1-4)    R(3-4)  \\\n",
       "0       0  spinach     grok-3-mini  Equivalence  Contains  Contains  Disjoint   \n",
       "1       1  spinach     grok-3-mini  Equivalence  Contains  Contains  Disjoint   \n",
       "2       2  spinach     grok-3-mini  Equivalence  Contains  Contains  Disjoint   \n",
       "3       3  spinach     grok-3-mini  Equivalence  Contains  Contains  Disjoint   \n",
       "4       4  spinach     grok-3-mini     Contains  Contains  Contains  Disjoint   \n",
       "...   ...      ...             ...          ...       ...       ...       ...   \n",
       "5373  145   qawiki  gemini-2.5-pro  Equivalence  Contains  Contains  Disjoint   \n",
       "5374  146   qawiki  gemini-2.5-pro  Equivalence  Contains  Contains   Overlap   \n",
       "5375  147   qawiki  gemini-2.5-pro  Equivalence  Contains  Contains  Disjoint   \n",
       "5376  148   qawiki  gemini-2.5-pro  Equivalence  Contains  Contains  Disjoint   \n",
       "5377  149   qawiki  gemini-2.5-pro  Equivalence  Contains  Contains  Disjoint   \n",
       "\n",
       "          R(1-34)  \n",
       "0     Equivalence  \n",
       "1     Equivalence  \n",
       "2     Equivalence  \n",
       "3     Equivalence  \n",
       "4     Equivalence  \n",
       "...           ...  \n",
       "5373  Equivalence  \n",
       "5374  ContainedBy  \n",
       "5375  Equivalence  \n",
       "5376  Equivalence  \n",
       "5377  Equivalence  \n",
       "\n",
       "[5378 rows x 8 columns]"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_relation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af8668ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "\n",
    "relation_file_format = datetime.datetime.now().strftime(\"relations_%Y-%m-%d_%H-%M.csv\")\n",
    "summary_file_format = datetime.datetime.now().strftime(\"relation_summary_%Y-%m-%d_%H-%M.csv\")\n",
    "summary_file_format_excel = datetime.datetime.now().strftime(\"relation_summary_%Y-%m-%d_%H-%M.xlsx\")\n",
    "output_folder = \"../output/\"\n",
    "\n",
    "df_relation.to_csv(os.path.join(output_folder, relation_file_format), index=False)\n",
    "df_relation_summery.to_csv(os.path.join(output_folder, summary_file_format), index=False)\n",
    "df_relation_summery.to_excel(\"../output/relation_summary.xlsx\", index=False)\n",
    "df_relation_summery.to_excel(os.path.join(output_folder, summary_file_format_excel), index=False)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a8f99aaf",
   "metadata": {},
   "source": [
    "### Relation Identification and Consistency? \n",
    "\n",
    "for each of question in re-classification, there is the relation-classification. \n",
    "\n",
    "Add more columns [R(1-2), R(1-3), R(1-4), R(3-4),R(1-34)] in df_analysis to show the identified relation in re-classification. \n",
    "\n",
    "Maybe also add the initial relation in zero-shot at this column. \n",
    "\n",
    "Based on df_analysis do the summary for both zero-shot and re-classication in terms of relation and consistency. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "id": "8d57653b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Q_ID</th>\n",
       "      <th>dataset</th>\n",
       "      <th>llm</th>\n",
       "      <th>R(1-2)</th>\n",
       "      <th>R(1-3)</th>\n",
       "      <th>R(1-34)</th>\n",
       "      <th>action</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>contains</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>contains</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>contains</td>\n",
       "      <td>containedby</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>contains</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>contains</td>\n",
       "      <td>contains</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>445</th>\n",
       "      <td>146</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>contains</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>446</th>\n",
       "      <td>147</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>contains</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>447</th>\n",
       "      <td>148</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>contains</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>448</th>\n",
       "      <td>149</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>contains</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>449</th>\n",
       "      <td>142</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>contains</td>\n",
       "      <td>equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>450 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    Q_ID  dataset    llm       R(1-2)    R(1-3)      R(1-34)          action\n",
       "0      0  spinach  gpt-5  equivalence  contains  equivalence  classification\n",
       "1      1  spinach  gpt-5  equivalence  contains  equivalence  classification\n",
       "2      2  spinach  gpt-5  equivalence  contains  containedby  classification\n",
       "3      3  spinach  gpt-5  equivalence  contains  equivalence  classification\n",
       "4      4  spinach  gpt-5     contains  contains  equivalence  classification\n",
       "..   ...      ...    ...          ...       ...          ...             ...\n",
       "445  146   qawiki  gpt-5  equivalence  contains  equivalence  classification\n",
       "446  147   qawiki  gpt-5  equivalence  contains  equivalence  classification\n",
       "447  148   qawiki  gpt-5  equivalence  contains  equivalence  classification\n",
       "448  149   qawiki  gpt-5  equivalence  contains  equivalence  classification\n",
       "449  142   qawiki  gpt-5  equivalence  contains  equivalence  classification\n",
       "\n",
       "[450 rows x 7 columns]"
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Gather the raw (un-normalized) relation labels from every matching JSON file\n",
    "# under root_dir into one row per (Q_ID, dataset, llm), with one column per task.\n",
    "datasets = [\"spinach\", \"qawiki\", 'synthetic']\n",
    "llms = ['gpt-5']\n",
    "tasks = ['equal', 'sup-sub', \"minus\"]\n",
    "# os.path.abspath(__name__) is resolved against the current working directory,\n",
    "# so root_dir is \"<parent of cwd>/data/answers/rel_classification_and_questions/\".\n",
    "root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__name__))) + \"/data/answers/rel_classification_and_questions/\"\n",
    "# Every *.json file below root_dir whose name does not start with \"Q\".\n",
    "json_files = [\n",
    "    os.path.join(root, file)\n",
    "    for root, _, files in os.walk(root_dir)\n",
    "    for file in files\n",
    "    if not file.startswith(\"Q\") and file.endswith(\".json\")\n",
    "]\n",
    "\n",
    "task_to_col = {\n",
    "    \"equal\":   \"R(1-2)\",\n",
    "    \"sup-sub\": \"R(1-3)\",\n",
    "    \"minus\":   \"R(1-34)\",\n",
    "}\n",
    "\n",
    "# One record per full (Q_ID, dataset, llm) key. Looking rows up by the whole key\n",
    "# (instead of testing each column's membership separately) guarantees a label is\n",
    "# never dropped because its Q_ID only exists for a different dataset or LLM.\n",
    "rows_by_key = {}\n",
    "for file in json_files:\n",
    "    # Dataset, LLM and task are read from the \"/\"- and \"_\"-separated path tokens.\n",
    "    elements = file.replace(\"_\", \"/\").replace(\".json\", \"\").split(\"/\")\n",
    "    dataset = next((d for d in datasets if d in elements), None)\n",
    "    llm = next((l for l in llms if l in elements), None)\n",
    "    task = next((t for t in tasks if t in elements), None)\n",
    "    col = task_to_col.get(task)\n",
    "\n",
    "    # Skip files whose path does not name a known dataset, LLM and task.\n",
    "    if not all([dataset, llm, task]):\n",
    "        continue\n",
    "\n",
    "    with open(file, \"r\", encoding=\"utf-8\") as f:\n",
    "        data = json.load(f)\n",
    "\n",
    "    for qid, rel in data.items():\n",
    "        row = rows_by_key.setdefault(\n",
    "            (qid, dataset, llm),\n",
    "            {\"Q_ID\": qid, \"dataset\": dataset, \"llm\": llm},\n",
    "        )\n",
    "        row[col] = rel\n",
    "\n",
    "# Build the frame once instead of concatenating inside the loop; a task with no\n",
    "# file for a given key is left as NaN.\n",
    "df_relation_clf = pd.DataFrame(\n",
    "    list(rows_by_key.values()),\n",
    "    columns=[\"Q_ID\", \"dataset\", \"llm\", \"R(1-2)\", \"R(1-3)\", \"R(1-34)\"],\n",
    ")\n",
    "df_relation_clf[\"action\"] = \"classification\"\n",
    "df_relation_clf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "id": "c0900616",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import math\n",
    "import unicodedata\n",
    "import pandas as pd\n",
    "from typing import Any, Iterable\n",
    "\n",
    "# Closed set of labels that normalize_relation is documented to return.\n",
    "CANONICAL_LABELS = [\n",
    "    \"Equivalence\",\n",
    "    \"Contains\",\n",
    "    \"ContainedBy\",\n",
    "    \"Overlap\",\n",
    "    \"Disjoint\",\n",
    "    \"Unknown\",\n",
    "    \"Else\",\n",
    "]\n",
    "\n",
    "# --- helpers ---------------------------------------------------------------\n",
    "\n",
    "# Case-folded answers that mean \"no label given\"; _is_unknown_like tests the\n",
    "# cleaned prediction text for exact membership in this set.\n",
    "_UNKNOWN_TOKENS = {\n",
    "    \"unknown\",\"unk\",\"n/a\",\"na\",\"none\",\"null\",\"nil\",\"idk\",\n",
    "    # Both the ASCII (') and the typographic (’) apostrophe are listed, because\n",
    "    # NFKC normalization keeps them as distinct characters.\n",
    "    \"don't know\",\"don’t know\",\"dont know\",\n",
    "    \"cannot determine\",\"can't determine\",\"can’t determine\",\"cant determine\",\n",
    "    \"unsure\",\"uncertain\",\"not sure\",\"not given\",\"not specified\",\"ambiguous\"\n",
    "}\n",
    "\n",
    "def _first_nonempty_str(it: Iterable[Any]) -> str | None:\n",
    "    \"\"\"Return the stripped str() of the first item that is neither None nor blank, else None.\"\"\"\n",
    "    # Lazily stringify the non-None items so iteration stops at the first hit.\n",
    "    stripped = (str(item).strip() for item in it if item is not None)\n",
    "    return next((text for text in stripped if text), None)\n",
    "\n",
    "def _pick_from_dict(d: dict) -> str | None:\n",
    "    \"\"\"\n",
    "    Pull a label out of a dict-shaped prediction.\n",
    "\n",
    "    Resolution order:\n",
    "      1. the value stored under \"label\", \"relation\", \"pred\" or \"class\" (first\n",
    "         of these keys whose value is a str/int/float), as a string;\n",
    "      2. otherwise the key with the largest finite numeric value, reading the\n",
    "         dict as {label: score};\n",
    "      3. otherwise the first key, as a string;\n",
    "      4. None when the dict is empty.\n",
    "    \"\"\"\n",
    "    # common shapes: {\"label\": \"...\"} or {\"relation\": \"...\"} or {label: score, ...}\n",
    "    for k in (\"label\",\"relation\",\"pred\",\"class\"):\n",
    "        if k in d and isinstance(d[k], (str, int, float)):\n",
    "            return str(d[k])\n",
    "\n",
    "    # try best-score key if numeric\n",
    "    scores = {}\n",
    "    for k, v in d.items():\n",
    "        # bool is an int subclass but is a flag, not a score\n",
    "        if isinstance(v, bool) or not isinstance(v, (int, float, str)):\n",
    "            continue\n",
    "        try:\n",
    "            # float() also accepts scientific notation such as 1e-05 / \"1e-05\"\n",
    "            score = float(v)\n",
    "        except (ValueError, OverflowError):\n",
    "            # one non-numeric value must not discard the other scores\n",
    "            continue\n",
    "        # NaN / inf cannot be ranked meaningfully\n",
    "        if math.isfinite(score):\n",
    "            scores[k] = score\n",
    "    if scores:\n",
    "        return max(scores, key=scores.get)\n",
    "\n",
    "    # else first key\n",
    "    if d:\n",
    "        return str(next(iter(d.keys())))\n",
    "    return None\n",
    "\n",
    "def _clean_text(s: str) -> str:\n",
    "    \"\"\"\n",
    "    Normalize raw prediction text before pattern matching.\n",
    "\n",
    "    Applies Unicode NFKC normalization, removes parenthesized fragments,\n",
    "    rewrites every run of '-' / '_' as a single '-', collapses whitespace\n",
    "    runs to one space and strips the ends.\n",
    "    \"\"\"\n",
    "    text = unicodedata.normalize(\"NFKC\", s)\n",
    "    substitutions = (\n",
    "        (r\"\\(.*?\\)\", \"\"),   # \"Equivalence (0.91)\" -> \"Equivalence \"\n",
    "        (r\"[-_]+\", \"-\"),     # \"contained__by\" -> \"contained-by\"\n",
    "        (r\"\\s+\", \" \"),      # any whitespace run -> one space\n",
    "    )\n",
    "    for pattern, replacement in substitutions:\n",
    "        text = re.sub(pattern, replacement, text)\n",
    "    return text.strip()\n",
    "\n",
    "def _is_unknown_like(s_lower: str) -> bool:\n",
    "    \"\"\"Return True when s_lower is exactly one of the entries in _UNKNOWN_TOKENS.\"\"\"\n",
    "    is_listed = s_lower in _UNKNOWN_TOKENS\n",
    "    return is_listed\n",
    "\n",
    "# --- main normalizer -------------------------------------------------------\n",
    "\n",
    "def normalize_relation(pred: Any) -> str:\n",
    "    \"\"\"\n",
    "    Map various prediction shapes/phrases/symbols into canonical relation labels.\n",
    "\n",
    "    Input handling:\n",
    "      - None, float NaN, pd.NA and blank text -> \"Unknown\"\n",
    "      - list/tuple -> first non-None element with a non-blank string form\n",
    "      - dict -> label chosen by _pick_from_dict\n",
    "      - anything else -> its str() form\n",
    "\n",
    "    The text is cleaned by _clean_text, case-folded, and checked in this order:\n",
    "    exact canonical label, unknown token, negation guards, Equivalence,\n",
    "    Contains, ContainedBy, Disjoint, Overlap, a late \"unknown\" substring\n",
    "    check, and finally \"Else\".\n",
    "\n",
    "    Returns one of CANONICAL_LABELS.\n",
    "    \"\"\"\n",
    "    # None / NaN / pd.NA -> Unknown\n",
    "    if pred is None or pred is pd.NA or (isinstance(pred, float) and math.isnan(pred)):\n",
    "        return \"Unknown\"\n",
    "\n",
    "    # list/tuple: pick first sensible string\n",
    "    if isinstance(pred, (list, tuple)):\n",
    "        s = _first_nonempty_str(pred)\n",
    "        if not s:\n",
    "            return \"Unknown\"\n",
    "        pred = s\n",
    "\n",
    "    # dict: try to extract label\n",
    "    if isinstance(pred, dict):\n",
    "        s = _pick_from_dict(pred)\n",
    "        if not s:\n",
    "            return \"Unknown\"\n",
    "        pred = s\n",
    "\n",
    "    # now string\n",
    "    s_raw = str(pred)\n",
    "    s = _clean_text(s_raw)\n",
    "    s_lower = s.casefold()\n",
    "\n",
    "    # nothing left after cleaning (blank input, or only a parenthetical) -> Unknown\n",
    "    if not s:\n",
    "        return \"Unknown\"\n",
    "\n",
    "    # quick exact canonical pass\n",
    "    if s in CANONICAL_LABELS:\n",
    "        return s\n",
    "\n",
    "    # unknown-likes\n",
    "    if _is_unknown_like(s_lower):\n",
    "        return \"Unknown\"\n",
    "\n",
    "    # _clean_text rewrites every run of \"_\" / \"-\" as a single \"-\", so the word\n",
    "    # separators inside a phrase accept \"-\" as well as whitespace. With\n",
    "    # whitespace-only separators \"contained_by\" fell through to \"Else\" and\n",
    "    # \"no_overlap\" was caught by the bare \"overlap\" pattern.\n",
    "    def _found(pattern: str) -> bool:\n",
    "        return re.search(pattern, s_lower) is not None\n",
    "\n",
    "    # guard: special negation patterns first (so \"not disjoint\" -> Overlap)\n",
    "    if _found(r\"\\bnot[-\\s]+disjoint\\b\") or _found(r\"\\bnon[-\\s]?disjoint\\b\"):\n",
    "        return \"Overlap\"\n",
    "    if _found(r\"\\bnot[-\\s]+overlap(ping)?\\b\"):\n",
    "        return \"Disjoint\"\n",
    "\n",
    "    # --- detect by symbols/phrases ---\n",
    "    # Equivalence\n",
    "    if \"≡\" in s or \"↔\" in s or any(_found(p) for p in (\n",
    "        r\"\\beq(uiv(alent|alence)?)\\b\",\n",
    "        r\"\\bequal(s)?\\b\",\n",
    "        r\"\\bsame(\\s+set)?\\b\",\n",
    "        r\"a\\s*=\\s*b\",\n",
    "    )):\n",
    "        return \"Equivalence\"\n",
    "\n",
    "    # Contains (A ⊃ B; superset; includes)\n",
    "    if \"⊃\" in s or \"⊇\" in s or any(_found(p) for p in (\n",
    "        r\"\\bsuper[-\\s]*set\\b\",\n",
    "        r\"\\bsuperset[-\\s]+of\\b\",\n",
    "        r\"\\bcontain(s|ment)?\\b\",\n",
    "        r\"\\bincludes?\\b\",\n",
    "        r\"\\b(a|set\\s*a)?\\s*includes?\\s*(b|set\\s*b)\\b\",\n",
    "    )):\n",
    "        return \"Contains\"\n",
    "\n",
    "    # ContainedBy (A ⊂ B; subset; contained by; is in)\n",
    "    if \"⊂\" in s or \"⊆\" in s or any(_found(p) for p in (\n",
    "        r\"\\bsub[-\\s]*set\\b\",\n",
    "        r\"\\bsubset[-\\s]+of\\b\",\n",
    "        r\"\\bcontained[-\\s]*by\\b\",\n",
    "        r\"\\bis[-\\s]+in\\b\",\n",
    "        r\"\\bbelongs[-\\s]+to\\b\",\n",
    "    )):\n",
    "        return \"ContainedBy\"\n",
    "\n",
    "    # Disjoint (A ∩ B = ∅; no overlap) -- checked before Overlap so that\n",
    "    # \"no overlap\" is not swallowed by the plain \"overlap\" pattern\n",
    "    if (\"∩\" in s and (\"∅\" in s or \"= 0\" in s_lower)) or any(_found(p) for p in (\n",
    "        r\"\\bdis[-\\s]?joint\\b\",\n",
    "        r\"\\bno[-\\s]+(overlap|intersection)\\b\",\n",
    "        r\"\\bmutual(ly)?[-\\s]+exclusive\\b\",\n",
    "        r\"\\bnon[-\\s]?overlap(ping)?\\b\",\n",
    "    )):\n",
    "        return \"Disjoint\"\n",
    "\n",
    "    # Overlap (A ∩ B ≠ ∅; intersect; partial overlap)\n",
    "    if (\"∩\" in s and (\"≠\" in s or \"!= \" in s_lower)) or any(_found(p) for p in (\n",
    "        r\"\\boverlap(ping)?\\b\",\n",
    "        r\"\\bintersect(s|ion)?\\b\",\n",
    "        r\"\\b(non[-\\s]?empty|some)[-\\s]+intersection\\b\",\n",
    "        r\"\\bshare(s)?[-\\s]+(elements|items|members)\\b\",\n",
    "    )):\n",
    "        return \"Overlap\"\n",
    "\n",
    "    # If the string literally says \"unknown\" in any decorative way, catch it late too\n",
    "    if \"unknown\" in s_lower:\n",
    "        return \"Unknown\"\n",
    "\n",
    "    # Otherwise:\n",
    "    return \"Else\"\n",
    "\n",
    "# --- convenience wrappers ---------------------------------------------------\n",
    "\n",
    "def normalize_relation_series(s: pd.Series) -> pd.Series:\n",
    "    \"\"\"Apply normalize_relation to every element of s and return the resulting Series.\"\"\"\n",
    "    normalized = s.apply(normalize_relation)\n",
    "    return normalized\n",
    "\n",
    "def normalize_relation_cols(df: pd.DataFrame, cols: list[str], inplace: bool = False, suffix: str = \"_norm\") -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Run normalize_relation over several columns of df.\n",
    "\n",
    "    Args:\n",
    "        df: frame holding the relation columns.\n",
    "        cols: names of the columns to normalize, processed in order.\n",
    "        inplace: when True, df itself is modified and each column is\n",
    "            overwritten; when False, a copy is modified and each result is\n",
    "            stored under the column name plus suffix.\n",
    "        suffix: appended to the column name when inplace is False.\n",
    "\n",
    "    Returns:\n",
    "        The modified frame (df itself when inplace is True, otherwise the copy).\n",
    "    \"\"\"\n",
    "    result = df if inplace else df.copy()\n",
    "    for name in cols:\n",
    "        destination = name if inplace else name + suffix\n",
    "        result[destination] = result[name].apply(normalize_relation)\n",
    "    return result\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "id": "52a0de01",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Q_ID</th>\n",
       "      <th>dataset</th>\n",
       "      <th>llm</th>\n",
       "      <th>R(1-2)</th>\n",
       "      <th>R(1-3)</th>\n",
       "      <th>R(1-34)</th>\n",
       "      <th>action</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>ContainedBy</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>ContainedBy</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>100</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>101</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>445</th>\n",
       "      <td>95</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>446</th>\n",
       "      <td>96</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>447</th>\n",
       "      <td>97</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>448</th>\n",
       "      <td>98</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>449</th>\n",
       "      <td>99</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>classification</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>450 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    Q_ID    dataset    llm       R(1-2)       R(1-3)      R(1-34)  \\\n",
       "0      0     qawiki  gpt-5  Equivalence     Contains  Equivalence   \n",
       "1      1     qawiki  gpt-5  Equivalence     Contains  Equivalence   \n",
       "2     10     qawiki  gpt-5  ContainedBy  Equivalence  ContainedBy   \n",
       "3    100     qawiki  gpt-5  Equivalence     Contains  Equivalence   \n",
       "4    101     qawiki  gpt-5  Equivalence     Contains  Equivalence   \n",
       "..   ...        ...    ...          ...          ...          ...   \n",
       "445   95  synthetic  gpt-5  Equivalence     Contains  Equivalence   \n",
       "446   96  synthetic  gpt-5  Equivalence     Contains  Equivalence   \n",
       "447   97  synthetic  gpt-5  Equivalence     Disjoint  Equivalence   \n",
       "448   98  synthetic  gpt-5  Equivalence     Contains  Equivalence   \n",
       "449   99  synthetic  gpt-5  Equivalence     Contains  Equivalence   \n",
       "\n",
       "             action  \n",
       "0    classification  \n",
       "1    classification  \n",
       "2    classification  \n",
       "3    classification  \n",
       "4    classification  \n",
       "..              ...  \n",
       "445  classification  \n",
       "446  classification  \n",
       "447  classification  \n",
       "448  classification  \n",
       "449  classification  \n",
       "\n",
       "[450 rows x 7 columns]"
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "\n",
    "# Path tokens recognised when an answer file's path is parsed.\n",
    "datasets = [\n",
    "    \"spinach\",\n",
    "    \"qawiki\",\n",
    "    \"synthetic\",\n",
    "]\n",
    "llms = [\"gpt-5\"]\n",
    "tasks = [\n",
    "    \"equal\",\n",
    "    \"sup-sub\",\n",
    "    \"minus\",\n",
    "]\n",
    "\n",
    "# Output column that stores the predictions of each task.\n",
    "task_to_col = {\"equal\": \"R(1-2)\", \"sup-sub\": \"R(1-3)\", \"minus\": \"R(1-34)\"}\n",
    "\n",
    "def load_relation_clf(root_dir: str | None = None) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Load the LLM relation-classification answers into one normalized table.\n",
    "\n",
    "    Walks root_dir for *.json files whose name does not start with \"Q\", reads\n",
    "    dataset / LLM / task from the path tokens (matched against the module-level\n",
    "    datasets, llms and tasks lists) and stores every {Q_ID: prediction} entry,\n",
    "    passed through normalize_relation, in the column given by task_to_col.\n",
    "\n",
    "    Args:\n",
    "        root_dir: Directory to search. When None, falls back to\n",
    "            \"<parent of cwd>/data/answers/rel_classification_and_questions\".\n",
    "\n",
    "    Returns:\n",
    "        pd.DataFrame with columns Q_ID, dataset, llm, R(1-2), R(1-3), R(1-34)\n",
    "        and action (always \"classification\"), one row per (Q_ID, dataset, llm),\n",
    "        sorted by llm, dataset, Q_ID. When no file matches, the frame is empty\n",
    "        but still has these columns.\n",
    "    \"\"\"\n",
    "    # Resolve default root (os.walk(None) would raise a TypeError)\n",
    "    if root_dir is None:\n",
    "        root_dir = Path.cwd().parent / \"data\" / \"answers\" / \"rel_classification_and_questions\"\n",
    "\n",
    "    # Find JSON files (exclude those starting with 'Q'); sorted so that the\n",
    "    # result does not depend on the directory order os.walk happens to yield\n",
    "    json_files = sorted(\n",
    "        os.path.join(root, file)\n",
    "        for root, _, files in os.walk(root_dir)\n",
    "        for file in files\n",
    "        if not file.startswith(\"Q\") and file.endswith(\".json\")\n",
    "    )\n",
    "    rows_map: dict[tuple[str, str, str], dict] = {}\n",
    "\n",
    "    for file in json_files:\n",
    "        # Tokenize on \"/\" and \"_\"; os.sep is mapped to \"/\" first so the same\n",
    "        # tokens come out on platforms whose path separator is not \"/\"\n",
    "        parts = file.replace(os.sep, \"/\").replace(\"_\", \"/\").replace(\".json\", \"\").split(\"/\")\n",
    "        dataset = next((d for d in datasets if d in parts), None)\n",
    "        llm     = next((l for l in llms     if l in parts), None)\n",
    "        task    = next((t for t in tasks    if t in parts), None)\n",
    "        col     = task_to_col.get(task)\n",
    "\n",
    "        # Skip files whose path does not name a known dataset, LLM and task\n",
    "        if not (dataset and llm and col):\n",
    "            continue\n",
    "\n",
    "        with open(file, \"r\", encoding=\"utf-8\") as f:\n",
    "            data = json.load(f)\n",
    "\n",
    "        for qid, rel in data.items():\n",
    "            # allow value to be list/tuple or scalar\n",
    "            pred = rel[0] if isinstance(rel, (list, tuple)) and len(rel) else rel\n",
    "\n",
    "            key = (qid, dataset, llm)\n",
    "            row = rows_map.setdefault(key, {\"Q_ID\": qid, \"dataset\": dataset, \"llm\": llm})\n",
    "            row[col] = normalize_relation(pred)\n",
    "\n",
    "    # Materialize dataframe\n",
    "    id_cols = [\"Q_ID\", \"dataset\", \"llm\"]\n",
    "    rel_cols = [\"R(1-2)\", \"R(1-3)\", \"R(1-34)\"]\n",
    "    df = pd.DataFrame(list(rows_map.values()))\n",
    "    # Add whatever is missing (a task without files, or every column when no\n",
    "    # file matched at all) so the column selection below cannot raise KeyError\n",
    "    for c in id_cols + rel_cols:\n",
    "        if c not in df.columns:\n",
    "            df[c] = pd.NA\n",
    "\n",
    "    # assign() returns a new frame, so nothing is written into a column slice\n",
    "    df = df[id_cols + rel_cols].assign(action=\"classification\")\n",
    "    # NOTE: Q_ID values are JSON object keys (strings), so this ordering is\n",
    "    # lexicographic: \"10\" sorts before \"2\"\n",
    "    return df.sort_values([\"llm\", \"dataset\", \"Q_ID\"]).reset_index(drop=True)\n",
    "\n",
    "# Build the normalized table; root_dir is not set in this cell and must already\n",
    "# be defined by an earlier cell.\n",
    "df_relation_clf = load_relation_clf(root_dir=root_dir)\n",
    "df_relation_clf\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "id": "02bf6a5c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Q_ID</th>\n",
       "      <th>dataset</th>\n",
       "      <th>llm</th>\n",
       "      <th>R(1-2)</th>\n",
       "      <th>R(1-3)</th>\n",
       "      <th>R(1-4)</th>\n",
       "      <th>R(3-4)</th>\n",
       "      <th>R(1-34)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Overlap</td>\n",
       "      <td>ContainedBy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>445</th>\n",
       "      <td>145</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>446</th>\n",
       "      <td>146</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Overlap</td>\n",
       "      <td>ContainedBy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>447</th>\n",
       "      <td>147</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>448</th>\n",
       "      <td>148</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>449</th>\n",
       "      <td>149</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "      <td>Equivalence</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>450 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    Q_ID  dataset    llm       R(1-2)    R(1-3)    R(1-4)    R(3-4)  \\\n",
       "0      0  spinach  gpt-5  Equivalence  Contains  Contains  Disjoint   \n",
       "1      1  spinach  gpt-5  Equivalence  Contains  Contains  Disjoint   \n",
       "2      2  spinach  gpt-5  Equivalence  Contains  Contains   Overlap   \n",
       "3      3  spinach  gpt-5  Equivalence  Contains  Contains  Disjoint   \n",
       "4      4  spinach  gpt-5     Contains  Contains  Contains  Disjoint   \n",
       "..   ...      ...    ...          ...       ...       ...       ...   \n",
       "445  145   qawiki  gpt-5  Equivalence  Contains  Contains  Disjoint   \n",
       "446  146   qawiki  gpt-5  Equivalence  Contains  Contains   Overlap   \n",
       "447  147   qawiki  gpt-5  Equivalence  Contains  Contains  Disjoint   \n",
       "448  148   qawiki  gpt-5  Equivalence  Contains  Contains  Disjoint   \n",
       "449  149   qawiki  gpt-5  Equivalence  Contains  Contains  Disjoint   \n",
       "\n",
       "         R(1-34)  \n",
       "0    Equivalence  \n",
       "1    Equivalence  \n",
       "2    ContainedBy  \n",
       "3    Equivalence  \n",
       "4    Equivalence  \n",
       "..           ...  \n",
       "445  Equivalence  \n",
       "446  ContainedBy  \n",
       "447  Equivalence  \n",
       "448  Equivalence  \n",
       "449  Equivalence  \n",
       "\n",
       "[450 rows x 8 columns]"
      ]
     },
     "execution_count": 136,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display df_relation (not defined in this cell; presumably built by an earlier cell).\n",
    "df_relation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "id": "af0f36e4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Q_ID</th>\n",
       "      <th>action</th>\n",
       "      <th>dataset</th>\n",
       "      <th>llm</th>\n",
       "      <th>?A1=A2</th>\n",
       "      <th>?A1=A3+A4</th>\n",
       "      <th>?A1&gt;A3</th>\n",
       "      <th>?A1&gt;A4</th>\n",
       "      <th>?A3∅A4</th>\n",
       "      <th>J(A1-A2)</th>\n",
       "      <th>...</th>\n",
       "      <th>A2</th>\n",
       "      <th>A3</th>\n",
       "      <th>A4</th>\n",
       "      <th>A1*</th>\n",
       "      <th>A1**</th>\n",
       "      <th>R(1-2)</th>\n",
       "      <th>R(1-3)</th>\n",
       "      <th>R(1-34)</th>\n",
       "      <th>R(1-4)</th>\n",
       "      <th>R(3-4)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>classification</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>[Venezuela, Guyana, Brazil]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[Guyana, Brazil]</td>\n",
       "      <td>[Venezuela, Guyana, Brazil]</td>\n",
       "      <td>[Venezuela, Guyana, Brazil]</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>classification</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>[At Last the 1948 Show, Fawlty Towers, Monty P...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[At Last the 1948 Show, Fawlty Towers, Monty P...</td>\n",
       "      <td>[At Last the 1948 Show, Fawlty Towers, Monty P...</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>classification</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>[The Hateful Eight, Django Unchained, Kill Bil...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[The Hateful Eight, Django Unchained, Kill Bil...</td>\n",
       "      <td>[The Hateful Eight, Django Unchained, Kill Bil...</td>\n",
       "      <td>[The Hateful Eight, Django Unchained, Kill Bil...</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>fixing</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>[Suriname, Guyana, Brazil, Colombia, Venezuela]</td>\n",
       "      <td>[Colombia, Venezuela]</td>\n",
       "      <td>[Suriname, Guyana, Brazil]</td>\n",
       "      <td>[Colombia, Venezuela, Guyana, Brazil]</td>\n",
       "      <td>[Suriname, Guyana, Brazil, Colombia, Venezuela]</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>fixing</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>...</td>\n",
       "      <td>[At Last the 1948 Show, Fawlty Towers, Monty P...</td>\n",
       "      <td>[At Last the 1948 Show, Monty Python's Fliegen...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[At Last the 1948 Show, Monty Python's Fliegen...</td>\n",
       "      <td>[At Last the 1948 Show, Fawlty Towers, Monty P...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1795</th>\n",
       "      <td>99</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>[Indiana, Missouri, Wisconsin, Iowa, Kentucky]</td>\n",
       "      <td>[Indiana, Wisconsin]</td>\n",
       "      <td>[Missouri, Iowa, Kentucky]</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1796</th>\n",
       "      <td>99</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>[black, red]</td>\n",
       "      <td>[yellow, red]</td>\n",
       "      <td>[black]</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1797</th>\n",
       "      <td>99</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>[idk]</td>\n",
       "      <td>[idk]</td>\n",
       "      <td>[idk]</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1798</th>\n",
       "      <td>99</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>[Indiana, Missouri, Wisconsin, Iowa, Kentucky]</td>\n",
       "      <td>[Indiana, Wisconsin]</td>\n",
       "      <td>[Missouri, Iowa, Kentucky]</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1799</th>\n",
       "      <td>99</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>...</td>\n",
       "      <td>[black, red]</td>\n",
       "      <td>[yellow, red]</td>\n",
       "      <td>[black]</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Equivalence</td>\n",
       "      <td>Contains</td>\n",
       "      <td>Disjoint</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1800 rows × 29 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Q_ID          action    dataset    llm  ?A1=A2  ?A1=A3+A4  ?A1>A3  \\\n",
       "0       0  classification     qawiki  gpt-5       1          1       1   \n",
       "1       0  classification    spinach  gpt-5       1          1       1   \n",
       "2       0  classification  synthetic  gpt-5       1          1       1   \n",
       "3       0          fixing     qawiki  gpt-5       1          1       1   \n",
       "4       0          fixing    spinach  gpt-5       0          1       1   \n",
       "...   ...             ...        ...    ...     ...        ...     ...   \n",
       "1795   99        wikidata    spinach  gpt-5       1          1       1   \n",
       "1796   99        wikidata  synthetic  gpt-5       1          0       0   \n",
       "1797   99       zero-shot     qawiki  gpt-5       1          1       1   \n",
       "1798   99       zero-shot    spinach  gpt-5       1          1       1   \n",
       "1799   99       zero-shot  synthetic  gpt-5       0          1       1   \n",
       "\n",
       "      ?A1>A4  ?A3∅A4  J(A1-A2)  ...  \\\n",
       "0          1       1       1.0  ...   \n",
       "1          1       1       1.0  ...   \n",
       "2          1       1       1.0  ...   \n",
       "3          1       1       1.0  ...   \n",
       "4          1       1    0.6667  ...   \n",
       "...      ...     ...       ...  ...   \n",
       "1795       1       1       1.0  ...   \n",
       "1796       1       1       1.0  ...   \n",
       "1797       1       0       1.0  ...   \n",
       "1798       1       1       1.0  ...   \n",
       "1799       1       1    0.6667  ...   \n",
       "\n",
       "                                                     A2  \\\n",
       "0                           [Venezuela, Guyana, Brazil]   \n",
       "1     [At Last the 1948 Show, Fawlty Towers, Monty P...   \n",
       "2     [The Hateful Eight, Django Unchained, Kill Bil...   \n",
       "3       [Suriname, Guyana, Brazil, Colombia, Venezuela]   \n",
       "4     [At Last the 1948 Show, Fawlty Towers, Monty P...   \n",
       "...                                                 ...   \n",
       "1795     [Indiana, Missouri, Wisconsin, Iowa, Kentucky]   \n",
       "1796                                       [black, red]   \n",
       "1797                                              [idk]   \n",
       "1798     [Indiana, Missouri, Wisconsin, Iowa, Kentucky]   \n",
       "1799                                       [black, red]   \n",
       "\n",
       "                                                     A3  \\\n",
       "0                                                    []   \n",
       "1                                                    []   \n",
       "2                                                    []   \n",
       "3                                 [Colombia, Venezuela]   \n",
       "4     [At Last the 1948 Show, Monty Python's Fliegen...   \n",
       "...                                                 ...   \n",
       "1795                               [Indiana, Wisconsin]   \n",
       "1796                                      [yellow, red]   \n",
       "1797                                              [idk]   \n",
       "1798                               [Indiana, Wisconsin]   \n",
       "1799                                      [yellow, red]   \n",
       "\n",
       "                                                     A4  \\\n",
       "0                                      [Guyana, Brazil]   \n",
       "1                                                    []   \n",
       "2     [The Hateful Eight, Django Unchained, Kill Bil...   \n",
       "3                            [Suriname, Guyana, Brazil]   \n",
       "4                                                    []   \n",
       "...                                                 ...   \n",
       "1795                         [Missouri, Iowa, Kentucky]   \n",
       "1796                                            [black]   \n",
       "1797                                              [idk]   \n",
       "1798                         [Missouri, Iowa, Kentucky]   \n",
       "1799                                            [black]   \n",
       "\n",
       "                                                    A1*  \\\n",
       "0                           [Venezuela, Guyana, Brazil]   \n",
       "1     [At Last the 1948 Show, Fawlty Towers, Monty P...   \n",
       "2     [The Hateful Eight, Django Unchained, Kill Bil...   \n",
       "3                 [Colombia, Venezuela, Guyana, Brazil]   \n",
       "4     [At Last the 1948 Show, Monty Python's Fliegen...   \n",
       "...                                                 ...   \n",
       "1795                                               <NA>   \n",
       "1796                                               <NA>   \n",
       "1797                                               <NA>   \n",
       "1798                                               <NA>   \n",
       "1799                                               <NA>   \n",
       "\n",
       "                                                   A1**       R(1-2)  \\\n",
       "0                           [Venezuela, Guyana, Brazil]  Equivalence   \n",
       "1     [At Last the 1948 Show, Fawlty Towers, Monty P...  Equivalence   \n",
       "2     [The Hateful Eight, Django Unchained, Kill Bil...  Equivalence   \n",
       "3       [Suriname, Guyana, Brazil, Colombia, Venezuela]         <NA>   \n",
       "4     [At Last the 1948 Show, Fawlty Towers, Monty P...         <NA>   \n",
       "...                                                 ...          ...   \n",
       "1795                                               <NA>         <NA>   \n",
       "1796                                               <NA>         <NA>   \n",
       "1797                                               <NA>  Equivalence   \n",
       "1798                                               <NA>  Equivalence   \n",
       "1799                                               <NA>  Equivalence   \n",
       "\n",
       "        R(1-3)      R(1-34)    R(1-4)    R(3-4)  \n",
       "0     Contains  Equivalence      <NA>      <NA>  \n",
       "1     Contains  Equivalence      <NA>      <NA>  \n",
       "2     Contains  Equivalence      <NA>      <NA>  \n",
       "3         <NA>         <NA>      <NA>      <NA>  \n",
       "4         <NA>         <NA>      <NA>      <NA>  \n",
       "...        ...          ...       ...       ...  \n",
       "1795      <NA>         <NA>      <NA>      <NA>  \n",
       "1796      <NA>         <NA>      <NA>      <NA>  \n",
       "1797  Contains  Equivalence  Contains  Disjoint  \n",
       "1798  Contains  Equivalence  Contains  Disjoint  \n",
       "1799  Contains  Equivalence  Contains  Disjoint  \n",
       "\n",
       "[1800 rows x 29 columns]"
      ]
     },
     "execution_count": 151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "def merge_relations_by_action(df_analysis, df_relation, df_relation_clf):\n",
    "    keys = [\"Q_ID\", \"dataset\", \"llm\"]\n",
    "    rel_cols = [\"R(1-2)\", \"R(1-3)\", \"R(1-4)\", \"R(3-4)\", \"R(1-34)\"]\n",
    "\n",
    "    # Ensure target columns exist in df_analysis\n",
    "    for c in rel_cols:\n",
    "        if c not in df_analysis.columns:\n",
    "            df_analysis[c] = pd.NA\n",
    "\n",
    "    # Deduplicate right tables on keys\n",
    "    df_rel = df_relation[keys + rel_cols].drop_duplicates(subset=keys)\n",
    "\n",
    "    # df_relation_clf may have only a subset of rel_cols; align to full set\n",
    "    clf_cols = [c for c in rel_cols if c in df_relation_clf.columns]\n",
    "    df_rel_clf_aligned = (\n",
    "        df_relation_clf[keys + clf_cols]\n",
    "        .drop_duplicates(subset=keys)\n",
    "        .reindex(columns=keys + rel_cols)  # add missing relation cols as NaN\n",
    "    )\n",
    "\n",
    "    # Fill zero-shot rows from df_relation\n",
    "    m_zs = df_analysis[\"action\"].eq(\"zero-shot\")\n",
    "    if m_zs.any():\n",
    "        zs_merge = df_analysis.loc[m_zs, keys].merge(df_rel, on=keys, how=\"left\")\n",
    "        df_analysis.loc[m_zs, rel_cols] = zs_merge[rel_cols].values\n",
    "\n",
    "    # Fill classification rows from df_relation_clf\n",
    "    m_cls = df_analysis[\"action\"].eq(\"classification\")\n",
    "    if m_cls.any():\n",
    "        cls_merge = df_analysis.loc[m_cls, keys].merge(df_rel_clf_aligned, on=keys, how=\"left\")\n",
    "        df_analysis.loc[m_cls, rel_cols] = cls_merge[rel_cols].values\n",
    "    df_analysis =  df_analysis.replace({None: pd.NA}).convert_dtypes()\n",
    "    return df_analysis\n",
    "\n",
    "# Usage: rebind df_analysis to the frame returned by the merge, then display it\n",
    "df_analysis = merge_relations_by_action(df_analysis, df_relation, df_relation_clf)\n",
    "df_analysis\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "id": "23654e1c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Q_ID', 'action', 'dataset', 'llm', '?A1=A2', '?A1=A3+A4', '?A1>A3',\n",
       "       '?A1>A4', '?A3∅A4', 'J(A1-A2)', 'J(A1-A34)', 'J(A1-A1*)', 'J(A1-A1**)',\n",
       "       'J(A1*-A1**)', 'Q1', 'Q2', 'Q3', 'Q4', 'A1', 'A2', 'A3', 'A4', 'A1*',\n",
       "       'A1**', 'R(1-2)', 'R(1-3)', 'R(1-34)', 'R(1-4)', 'R(3-4)'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_analysis.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "836f7b4a",
   "metadata": {},
   "source": [
    "#### Analysis: Relation Correctness vs. Consistency (and vice versa)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 234,
   "id": "7563188e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>action</th>\n",
       "      <th>llm</th>\n",
       "      <th>?A1=A2</th>\n",
       "      <th>?A1=A3+A4</th>\n",
       "      <th>?A1&gt;A3</th>\n",
       "      <th>?A1&gt;A4</th>\n",
       "      <th>?A3∅A4</th>\n",
       "      <th>?A1=A1*</th>\n",
       "      <th>?A1=A1**</th>\n",
       "      <th>...</th>\n",
       "      <th>J(A1-A34)</th>\n",
       "      <th>J(A1-A1*)</th>\n",
       "      <th>J(A1-A1**)</th>\n",
       "      <th>J(A1*-A1**)</th>\n",
       "      <th>idk_A1</th>\n",
       "      <th>idk_A2</th>\n",
       "      <th>idk_A3</th>\n",
       "      <th>idk_A4</th>\n",
       "      <th>?A1=A1(ave)</th>\n",
       "      <th>J_A1_ave</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9133</td>\n",
       "      <td>0.9267</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.5200</td>\n",
       "      <td>0.5267</td>\n",
       "      <td>0.6067</td>\n",
       "      <td>...</td>\n",
       "      <td>0.9644</td>\n",
       "      <td>0.7286</td>\n",
       "      <td>0.7324</td>\n",
       "      <td>0.7460</td>\n",
       "      <td>0.4133</td>\n",
       "      <td>0.4067</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.5333</td>\n",
       "      <td>0.5800</td>\n",
       "      <td>0.7357</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8933</td>\n",
       "      <td>0.8933</td>\n",
       "      <td>0.9533</td>\n",
       "      <td>0.9867</td>\n",
       "      <td>0.6200</td>\n",
       "      <td>0.7067</td>\n",
       "      <td>0.7067</td>\n",
       "      <td>...</td>\n",
       "      <td>0.9191</td>\n",
       "      <td>0.8220</td>\n",
       "      <td>0.8343</td>\n",
       "      <td>0.8067</td>\n",
       "      <td>0.5133</td>\n",
       "      <td>0.5267</td>\n",
       "      <td>0.5067</td>\n",
       "      <td>0.5800</td>\n",
       "      <td>0.7000</td>\n",
       "      <td>0.8210</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8200</td>\n",
       "      <td>0.6400</td>\n",
       "      <td>0.7867</td>\n",
       "      <td>0.7667</td>\n",
       "      <td>0.5000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.7825</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.6600</td>\n",
       "      <td>0.6467</td>\n",
       "      <td>0.5400</td>\n",
       "      <td>0.5933</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.6800</td>\n",
       "      <td>0.5667</td>\n",
       "      <td>0.7267</td>\n",
       "      <td>0.7400</td>\n",
       "      <td>0.5867</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.7421</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.5467</td>\n",
       "      <td>0.5267</td>\n",
       "      <td>0.4400</td>\n",
       "      <td>0.5267</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>spinach</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9333</td>\n",
       "      <td>0.8600</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.7000</td>\n",
       "      <td>0.6200</td>\n",
       "      <td>0.6067</td>\n",
       "      <td>...</td>\n",
       "      <td>0.9308</td>\n",
       "      <td>0.7761</td>\n",
       "      <td>0.7870</td>\n",
       "      <td>0.7876</td>\n",
       "      <td>0.3000</td>\n",
       "      <td>0.2933</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.3867</td>\n",
       "      <td>0.6133</td>\n",
       "      <td>0.7836</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>spinach</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9400</td>\n",
       "      <td>0.8400</td>\n",
       "      <td>0.9400</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.7800</td>\n",
       "      <td>0.7267</td>\n",
       "      <td>0.6200</td>\n",
       "      <td>...</td>\n",
       "      <td>0.8977</td>\n",
       "      <td>0.8527</td>\n",
       "      <td>0.7902</td>\n",
       "      <td>0.8160</td>\n",
       "      <td>0.3533</td>\n",
       "      <td>0.3533</td>\n",
       "      <td>0.3333</td>\n",
       "      <td>0.3867</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.8196</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>spinach</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.5867</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.6200</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.7381</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.4533</td>\n",
       "      <td>0.5000</td>\n",
       "      <td>0.3733</td>\n",
       "      <td>0.4600</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>spinach</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.6333</td>\n",
       "      <td>0.4600</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.6733</td>\n",
       "      <td>0.7133</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.6965</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.3800</td>\n",
       "      <td>0.3667</td>\n",
       "      <td>0.3000</td>\n",
       "      <td>0.3400</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8933</td>\n",
       "      <td>0.8867</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.8867</td>\n",
       "      <td>0.6000</td>\n",
       "      <td>0.5867</td>\n",
       "      <td>...</td>\n",
       "      <td>0.9285</td>\n",
       "      <td>0.8146</td>\n",
       "      <td>0.7942</td>\n",
       "      <td>0.8114</td>\n",
       "      <td>0.0933</td>\n",
       "      <td>0.0933</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.1667</td>\n",
       "      <td>0.6000</td>\n",
       "      <td>0.8067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8800</td>\n",
       "      <td>0.9000</td>\n",
       "      <td>0.9667</td>\n",
       "      <td>0.9933</td>\n",
       "      <td>0.9267</td>\n",
       "      <td>0.5667</td>\n",
       "      <td>0.5600</td>\n",
       "      <td>...</td>\n",
       "      <td>0.9482</td>\n",
       "      <td>0.7964</td>\n",
       "      <td>0.7859</td>\n",
       "      <td>0.7853</td>\n",
       "      <td>0.1267</td>\n",
       "      <td>0.1333</td>\n",
       "      <td>0.1200</td>\n",
       "      <td>0.1467</td>\n",
       "      <td>0.5578</td>\n",
       "      <td>0.7892</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.5400</td>\n",
       "      <td>0.7333</td>\n",
       "      <td>0.7800</td>\n",
       "      <td>0.8667</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.7549</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.2133</td>\n",
       "      <td>0.2333</td>\n",
       "      <td>0.1200</td>\n",
       "      <td>0.2067</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.5533</td>\n",
       "      <td>0.3933</td>\n",
       "      <td>0.6067</td>\n",
       "      <td>0.6467</td>\n",
       "      <td>0.8667</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.6867</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.1533</td>\n",
       "      <td>0.1267</td>\n",
       "      <td>0.0467</td>\n",
       "      <td>0.1333</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>overall</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9133</td>\n",
       "      <td>0.8911</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.7022</td>\n",
       "      <td>0.5822</td>\n",
       "      <td>0.6000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.9412</td>\n",
       "      <td>0.7731</td>\n",
       "      <td>0.7712</td>\n",
       "      <td>0.7817</td>\n",
       "      <td>0.2689</td>\n",
       "      <td>0.2644</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.3622</td>\n",
       "      <td>0.5978</td>\n",
       "      <td>0.7753</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>overall</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9044</td>\n",
       "      <td>0.8778</td>\n",
       "      <td>0.9533</td>\n",
       "      <td>0.9844</td>\n",
       "      <td>0.7756</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.6289</td>\n",
       "      <td>...</td>\n",
       "      <td>0.9217</td>\n",
       "      <td>0.8237</td>\n",
       "      <td>0.8035</td>\n",
       "      <td>0.8027</td>\n",
       "      <td>0.3311</td>\n",
       "      <td>0.3378</td>\n",
       "      <td>0.3200</td>\n",
       "      <td>0.3711</td>\n",
       "      <td>0.6415</td>\n",
       "      <td>0.8100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>overall</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.7467</td>\n",
       "      <td>0.5889</td>\n",
       "      <td>0.7578</td>\n",
       "      <td>0.7667</td>\n",
       "      <td>0.6622</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.7585</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.4422</td>\n",
       "      <td>0.4600</td>\n",
       "      <td>0.3444</td>\n",
       "      <td>0.4200</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>overall</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.6222</td>\n",
       "      <td>0.4733</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.6867</td>\n",
       "      <td>0.7222</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.7084</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.3600</td>\n",
       "      <td>0.3400</td>\n",
       "      <td>0.2622</td>\n",
       "      <td>0.3333</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>16 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      dataset          action    llm  ?A1=A2  ?A1=A3+A4  ?A1>A3  ?A1>A4  \\\n",
       "0      qawiki  classification  gpt-5  0.9133     0.9267  1.0000  0.9733   \n",
       "1      qawiki          fixing  gpt-5  0.8933     0.8933  0.9533  0.9867   \n",
       "2      qawiki        wikidata  gpt-5  0.8200     0.6400  0.7867  0.7667   \n",
       "3      qawiki       zero-shot  gpt-5  0.6800     0.5667  0.7267  0.7400   \n",
       "4     spinach  classification  gpt-5  0.9333     0.8600  1.0000  0.9733   \n",
       "5     spinach          fixing  gpt-5  0.9400     0.8400  0.9400  0.9733   \n",
       "6     spinach        wikidata  gpt-5  0.7533     0.5867  0.7533  0.7533   \n",
       "7     spinach       zero-shot  gpt-5  0.6333     0.4600  0.6667  0.6733   \n",
       "8   synthetic  classification  gpt-5  0.8933     0.8867  1.0000  0.9733   \n",
       "9   synthetic          fixing  gpt-5  0.8800     0.9000  0.9667  0.9933   \n",
       "10  synthetic        wikidata  gpt-5  0.6667     0.5400  0.7333  0.7800   \n",
       "11  synthetic       zero-shot  gpt-5  0.5533     0.3933  0.6067  0.6467   \n",
       "12    overall  classification  gpt-5  0.9133     0.8911  1.0000  0.9733   \n",
       "13    overall          fixing  gpt-5  0.9044     0.8778  0.9533  0.9844   \n",
       "14    overall        wikidata  gpt-5  0.7467     0.5889  0.7578  0.7667   \n",
       "15    overall       zero-shot  gpt-5  0.6222     0.4733  0.6667  0.6867   \n",
       "\n",
       "    ?A3∅A4  ?A1=A1*  ?A1=A1**  ...  J(A1-A34)  J(A1-A1*)  J(A1-A1**)  \\\n",
       "0   0.5200   0.5267    0.6067  ...     0.9644     0.7286      0.7324   \n",
       "1   0.6200   0.7067    0.7067  ...     0.9191     0.8220      0.8343   \n",
       "2   0.5000      NaN       NaN  ...     0.7825        NaN         NaN   \n",
       "3   0.5867      NaN       NaN  ...     0.7421        NaN         NaN   \n",
       "4   0.7000   0.6200    0.6067  ...     0.9308     0.7761      0.7870   \n",
       "5   0.7800   0.7267    0.6200  ...     0.8977     0.8527      0.7902   \n",
       "6   0.6200      NaN       NaN  ...     0.7381        NaN         NaN   \n",
       "7   0.7133      NaN       NaN  ...     0.6965        NaN         NaN   \n",
       "8   0.8867   0.6000    0.5867  ...     0.9285     0.8146      0.7942   \n",
       "9   0.9267   0.5667    0.5600  ...     0.9482     0.7964      0.7859   \n",
       "10  0.8667      NaN       NaN  ...     0.7549        NaN         NaN   \n",
       "11  0.8667      NaN       NaN  ...     0.6867        NaN         NaN   \n",
       "12  0.7022   0.5822    0.6000  ...     0.9412     0.7731      0.7712   \n",
       "13  0.7756   0.6667    0.6289  ...     0.9217     0.8237      0.8035   \n",
       "14  0.6622      NaN       NaN  ...     0.7585        NaN         NaN   \n",
       "15  0.7222      NaN       NaN  ...     0.7084        NaN         NaN   \n",
       "\n",
       "    J(A1*-A1**)  idk_A1  idk_A2  idk_A3  idk_A4  ?A1=A1(ave)  J_A1_ave  \n",
       "0        0.7460  0.4133  0.4067  1.0000  0.5333       0.5800    0.7357  \n",
       "1        0.8067  0.5133  0.5267  0.5067  0.5800       0.7000    0.8210  \n",
       "2           NaN  0.6600  0.6467  0.5400  0.5933          NaN       NaN  \n",
       "3           NaN  0.5467  0.5267  0.4400  0.5267          NaN       NaN  \n",
       "4        0.7876  0.3000  0.2933  1.0000  0.3867       0.6133    0.7836  \n",
       "5        0.8160  0.3533  0.3533  0.3333  0.3867       0.6667    0.8196  \n",
       "6           NaN  0.4533  0.5000  0.3733  0.4600          NaN       NaN  \n",
       "7           NaN  0.3800  0.3667  0.3000  0.3400          NaN       NaN  \n",
       "8        0.8114  0.0933  0.0933  1.0000  0.1667       0.6000    0.8067  \n",
       "9        0.7853  0.1267  0.1333  0.1200  0.1467       0.5578    0.7892  \n",
       "10          NaN  0.2133  0.2333  0.1200  0.2067          NaN       NaN  \n",
       "11          NaN  0.1533  0.1267  0.0467  0.1333          NaN       NaN  \n",
       "12       0.7817  0.2689  0.2644  1.0000  0.3622       0.5978    0.7753  \n",
       "13       0.8027  0.3311  0.3378  0.3200  0.3711       0.6415    0.8100  \n",
       "14          NaN  0.4422  0.4600  0.3444  0.4200          NaN       NaN  \n",
       "15          NaN  0.3600  0.3400  0.2622  0.3333          NaN       NaN  \n",
       "\n",
       "[16 rows x 22 columns]"
      ]
     },
     "execution_count": 234,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Aggregate df_analysis with summary() and display the resulting table.\n",
    "df_summary = summary(df_analysis)\n",
    "df_summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "id": "2e78ecbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "def update_summary_by_relations(\n",
    "    df_analysis: pd.DataFrame,\n",
    "    df_summary: pd.DataFrame,\n",
    "    task: str = \"zero-shot\",\n",
    "    # Ground truth labels per relation\n",
    "    relation_truths: dict[str, str] | None = None,\n",
    "    # Metrics to average per relation: {relation: [(metric_column_name, output_label_prefix or None), ...]}\n",
    "    # If output_label_prefix is None, we use the metric column name directly and append (+)/(-).\n",
    "    metric_spec: dict[str, list[tuple[str, str | None]]] | None = None,\n",
    ") -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Add per-relation positive/negative metric means to a copy of df_summary.\n",
    "\n",
    "    Rows of df_analysis whose 'action' equals `task` are grouped by\n",
    "    (dataset, llm) and, for the pooled 'overall' summary rows, by llm alone.\n",
    "    Within each group and each relation column the rows are split into\n",
    "    positives (relation value == ground-truth label) and negatives\n",
    "    (relation value != label; for NumPy-backed columns this includes missing\n",
    "    values, since NaN != label). The mean of every configured metric over\n",
    "    each half is written to the summary row(s) matching\n",
    "    (dataset, llm, action == task).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_analysis : pd.DataFrame\n",
    "        Per-row results with 'action', 'dataset' and 'llm' columns. Relation\n",
    "        or metric columns that are absent are skipped silently.\n",
    "    df_summary : pd.DataFrame\n",
    "        Summary table with 'dataset', 'action' and 'llm' columns. It is not\n",
    "        modified; a copy is returned.\n",
    "    task : str\n",
    "        Value of 'action' to process.\n",
    "    relation_truths : dict[str, str] | None\n",
    "        Maps relation column -> ground-truth label. None selects the\n",
    "        defaults defined below.\n",
    "    metric_spec : dict[str, list[tuple[str, str | None]]] | None\n",
    "        Maps relation column -> [(metric column, output prefix or None)].\n",
    "        With a prefix the outputs are '<prefix>+' / '<prefix>-', otherwise\n",
    "        '<metric>(+)' / '<metric>(-)'. None selects the defaults below.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.DataFrame\n",
    "        Copy of df_summary with the extra columns. Columns created here are\n",
    "        float columns holding NaN wherever no mean was computed.\n",
    "    \"\"\"\n",
    "    # Defaults\n",
    "    if relation_truths is None:\n",
    "        relation_truths = {\n",
    "            \"R(1-2)\":  \"Equivalence\",\n",
    "            \"R(1-3)\":  \"Contains\",\n",
    "            \"R(1-4)\":  \"Contains\",\n",
    "            \"R(3-4)\":  \"Disjoint\",\n",
    "            \"R(1-34)\": \"Equivalence\",\n",
    "        }\n",
    "\n",
    "    if metric_spec is None:\n",
    "        metric_spec = {\n",
    "            \"R(1-2)\":  [(\"?A1=A2\", None), (\"J(A1-A2)\", \"J(1-2)\")],\n",
    "            \"R(1-3)\":  [(\"?A1>A3\", None)],\n",
    "            \"R(1-4)\":  [(\"?A1>A4\", None), (\"J(A1-A4)\", \"J(1-4)\")],\n",
    "            \"R(3-4)\":  [(\"?A3∅A4\", None), (\"J(A3-A4)\", \"J(3-4)\")],  # natural for Disjoint: J should be near 0\n",
    "            \"R(1-34)\": [(\"?A1=A3+A4\", None), (\"J(A1-A34)\", \"J(1-34)\")],\n",
    "        }\n",
    "\n",
    "    # Work on a copy so the caller's summary is left untouched\n",
    "    out = df_summary.copy()\n",
    "    df_task = df_analysis[df_analysis[\"action\"] == task]\n",
    "    task_rows = out[\"action\"] == task\n",
    "\n",
    "    # Float NaN (not pd.NA) keeps the new columns float64 instead of object,\n",
    "    # so rounding / formatting / numeric aggregation keep working on them.\n",
    "    missing = float(\"nan\")\n",
    "\n",
    "    def set_means(mask, group):\n",
    "        \"\"\"Write the (+)/(-) means of every configured metric of `group` into out[mask].\"\"\"\n",
    "        for rel_col, truth_label in relation_truths.items():\n",
    "            # Skip relation columns that aren't present\n",
    "            if rel_col not in group.columns:\n",
    "                continue\n",
    "\n",
    "            pos = group[group[rel_col] == truth_label]\n",
    "            neg = group[group[rel_col] != truth_label]\n",
    "\n",
    "            for metric_col, prefix in metric_spec.get(rel_col, []):\n",
    "                if metric_col not in group.columns:\n",
    "                    # silently skip missing metrics\n",
    "                    continue\n",
    "\n",
    "                if prefix is None:\n",
    "                    col_pos, col_neg = f\"{metric_col}(+)\", f\"{metric_col}(-)\"\n",
    "                else:\n",
    "                    col_pos, col_neg = f\"{prefix}+\", f\"{prefix}-\"\n",
    "\n",
    "                for col, part in ((col_pos, pos), (col_neg, neg)):\n",
    "                    if col not in out.columns:\n",
    "                        out[col] = missing\n",
    "                    out.loc[mask, col] = part[metric_col].mean() if len(part) else missing\n",
    "\n",
    "    # Per (dataset, llm)\n",
    "    for (dataset, llm), group in df_task.groupby([\"dataset\", \"llm\"]):\n",
    "        set_means(task_rows & (out[\"dataset\"] == dataset) & (out[\"llm\"] == llm), group)\n",
    "\n",
    "    # Overall per llm (all datasets pooled)\n",
    "    for llm, group in df_task.groupby(\"llm\"):\n",
    "        set_means(task_rows & (out[\"dataset\"] == \"overall\") & (out[\"llm\"] == llm), group)\n",
    "\n",
    "    return out\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 204,
   "id": "9dc56d98",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(16, 32)\n",
      "Index(['dataset', 'action', 'llm', '?A1=A2', '?A1=A3+A4', '?A1>A3', '?A1>A4',\n",
      "       '?A3∅A4', 'J(A1-A2)', 'J(A1-A34)', 'J(A1-A1*)', 'J(A1-A1**)',\n",
      "       'J(A1*-A1**)', 'idk_A1', 'idk_A2', 'idk_A3', 'idk_A4', 'J_A1_ave',\n",
      "       '?A1=A2(+)', '?A1=A2(-)', 'J(1-2)+', 'J(1-2)-', '?A1>A3(+)',\n",
      "       '?A1>A3(-)', '?A1>A4(+)', '?A1>A4(-)', '?A3∅A4(+)', '?A3∅A4(-)',\n",
      "       '?A1=A3+A4(+)', '?A1=A3+A4(-)', 'J(1-34)+', 'J(1-34)-'],\n",
      "      dtype='object')\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>action</th>\n",
       "      <th>llm</th>\n",
       "      <th>?A1=A2</th>\n",
       "      <th>?A1=A3+A4</th>\n",
       "      <th>?A1&gt;A3</th>\n",
       "      <th>?A1&gt;A4</th>\n",
       "      <th>?A3∅A4</th>\n",
       "      <th>J(A1-A2)</th>\n",
       "      <th>J(A1-A34)</th>\n",
       "      <th>...</th>\n",
       "      <th>?A1&gt;A3(+)</th>\n",
       "      <th>?A1&gt;A3(-)</th>\n",
       "      <th>?A1&gt;A4(+)</th>\n",
       "      <th>?A1&gt;A4(-)</th>\n",
       "      <th>?A3∅A4(+)</th>\n",
       "      <th>?A3∅A4(-)</th>\n",
       "      <th>?A1=A3+A4(+)</th>\n",
       "      <th>?A1=A3+A4(-)</th>\n",
       "      <th>J(1-34)+</th>\n",
       "      <th>J(1-34)-</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9133</td>\n",
       "      <td>0.9267</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.52</td>\n",
       "      <td>0.9636</td>\n",
       "      <td>0.9644</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0.93007</td>\n",
       "      <td>0.857143</td>\n",
       "      <td>0.963338</td>\n",
       "      <td>0.9864</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8933</td>\n",
       "      <td>0.8933</td>\n",
       "      <td>0.9533</td>\n",
       "      <td>0.9867</td>\n",
       "      <td>0.62</td>\n",
       "      <td>0.9418</td>\n",
       "      <td>0.9191</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.82</td>\n",
       "      <td>0.64</td>\n",
       "      <td>0.7867</td>\n",
       "      <td>0.7667</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.8907</td>\n",
       "      <td>0.7825</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.68</td>\n",
       "      <td>0.5667</td>\n",
       "      <td>0.7267</td>\n",
       "      <td>0.74</td>\n",
       "      <td>0.5867</td>\n",
       "      <td>0.7996</td>\n",
       "      <td>0.7421</td>\n",
       "      <td>...</td>\n",
       "      <td>0.721088</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.741497</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.60274</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.56338</td>\n",
       "      <td>0.625</td>\n",
       "      <td>0.735675</td>\n",
       "      <td>0.855812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>spinach</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9333</td>\n",
       "      <td>0.86</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.9625</td>\n",
       "      <td>0.9308</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0.892086</td>\n",
       "      <td>0.454545</td>\n",
       "      <td>0.942163</td>\n",
       "      <td>0.787336</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>spinach</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.94</td>\n",
       "      <td>0.84</td>\n",
       "      <td>0.94</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.78</td>\n",
       "      <td>0.9592</td>\n",
       "      <td>0.8977</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>spinach</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.5867</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.62</td>\n",
       "      <td>0.8193</td>\n",
       "      <td>0.7381</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>spinach</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.6333</td>\n",
       "      <td>0.46</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.6733</td>\n",
       "      <td>0.7133</td>\n",
       "      <td>0.7885</td>\n",
       "      <td>0.6965</td>\n",
       "      <td>...</td>\n",
       "      <td>0.668919</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.675676</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.739437</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.452555</td>\n",
       "      <td>0.538462</td>\n",
       "      <td>0.702392</td>\n",
       "      <td>0.634269</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8933</td>\n",
       "      <td>0.8867</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.8867</td>\n",
       "      <td>0.9599</td>\n",
       "      <td>0.9285</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0.903448</td>\n",
       "      <td>0.4</td>\n",
       "      <td>0.939812</td>\n",
       "      <td>0.599</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.88</td>\n",
       "      <td>0.9</td>\n",
       "      <td>0.9667</td>\n",
       "      <td>0.9933</td>\n",
       "      <td>0.9267</td>\n",
       "      <td>0.9697</td>\n",
       "      <td>0.9482</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.54</td>\n",
       "      <td>0.7333</td>\n",
       "      <td>0.78</td>\n",
       "      <td>0.8667</td>\n",
       "      <td>0.8036</td>\n",
       "      <td>0.7549</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.5533</td>\n",
       "      <td>0.3933</td>\n",
       "      <td>0.6067</td>\n",
       "      <td>0.6467</td>\n",
       "      <td>0.8667</td>\n",
       "      <td>0.7869</td>\n",
       "      <td>0.6867</td>\n",
       "      <td>...</td>\n",
       "      <td>0.609589</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.648649</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.883562</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.39726</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.692465</td>\n",
       "      <td>0.474675</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>overall</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9133</td>\n",
       "      <td>0.8911</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.7022</td>\n",
       "      <td>0.962</td>\n",
       "      <td>0.9412</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0.908665</td>\n",
       "      <td>0.565217</td>\n",
       "      <td>0.948456</td>\n",
       "      <td>0.806978</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>overall</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9044</td>\n",
       "      <td>0.8778</td>\n",
       "      <td>0.9533</td>\n",
       "      <td>0.9844</td>\n",
       "      <td>0.7756</td>\n",
       "      <td>0.9569</td>\n",
       "      <td>0.9217</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>overall</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.7467</td>\n",
       "      <td>0.5889</td>\n",
       "      <td>0.7578</td>\n",
       "      <td>0.7667</td>\n",
       "      <td>0.6622</td>\n",
       "      <td>0.8379</td>\n",
       "      <td>0.7585</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>overall</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.6222</td>\n",
       "      <td>0.4733</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.6867</td>\n",
       "      <td>0.7222</td>\n",
       "      <td>0.7917</td>\n",
       "      <td>0.7084</td>\n",
       "      <td>...</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.688488</td>\n",
       "      <td>0.571429</td>\n",
       "      <td>0.741935</td>\n",
       "      <td>0.1875</td>\n",
       "      <td>0.470588</td>\n",
       "      <td>0.52</td>\n",
       "      <td>0.710102</td>\n",
       "      <td>0.679628</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>16 rows × 32 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      dataset          action    llm  ?A1=A2  ?A1=A3+A4  ?A1>A3  ?A1>A4  \\\n",
       "0      qawiki  classification  gpt-5  0.9133     0.9267     1.0  0.9733   \n",
       "1      qawiki          fixing  gpt-5  0.8933     0.8933  0.9533  0.9867   \n",
       "2      qawiki        wikidata  gpt-5    0.82       0.64  0.7867  0.7667   \n",
       "3      qawiki       zero-shot  gpt-5    0.68     0.5667  0.7267    0.74   \n",
       "4     spinach  classification  gpt-5  0.9333       0.86     1.0  0.9733   \n",
       "5     spinach          fixing  gpt-5    0.94       0.84    0.94  0.9733   \n",
       "6     spinach        wikidata  gpt-5  0.7533     0.5867  0.7533  0.7533   \n",
       "7     spinach       zero-shot  gpt-5  0.6333       0.46  0.6667  0.6733   \n",
       "8   synthetic  classification  gpt-5  0.8933     0.8867     1.0  0.9733   \n",
       "9   synthetic          fixing  gpt-5    0.88        0.9  0.9667  0.9933   \n",
       "10  synthetic        wikidata  gpt-5  0.6667       0.54  0.7333    0.78   \n",
       "11  synthetic       zero-shot  gpt-5  0.5533     0.3933  0.6067  0.6467   \n",
       "12    overall  classification  gpt-5  0.9133     0.8911     1.0  0.9733   \n",
       "13    overall          fixing  gpt-5  0.9044     0.8778  0.9533  0.9844   \n",
       "14    overall        wikidata  gpt-5  0.7467     0.5889  0.7578  0.7667   \n",
       "15    overall       zero-shot  gpt-5  0.6222     0.4733  0.6667  0.6867   \n",
       "\n",
       "    ?A3∅A4  J(A1-A2)  J(A1-A34)  ...  ?A1>A3(+)  ?A1>A3(-)  ?A1>A4(+)  \\\n",
       "0     0.52    0.9636     0.9644  ...        1.0        1.0       <NA>   \n",
       "1     0.62    0.9418     0.9191  ...       <NA>       <NA>       <NA>   \n",
       "2      0.5    0.8907     0.7825  ...       <NA>       <NA>       <NA>   \n",
       "3   0.5867    0.7996     0.7421  ...   0.721088        1.0   0.741497   \n",
       "4      0.7    0.9625     0.9308  ...        1.0        1.0       <NA>   \n",
       "5     0.78    0.9592     0.8977  ...       <NA>       <NA>       <NA>   \n",
       "6     0.62    0.8193     0.7381  ...       <NA>       <NA>       <NA>   \n",
       "7   0.7133    0.7885     0.6965  ...   0.668919        0.5   0.675676   \n",
       "8   0.8867    0.9599     0.9285  ...        1.0        1.0       <NA>   \n",
       "9   0.9267    0.9697     0.9482  ...       <NA>       <NA>       <NA>   \n",
       "10  0.8667    0.8036     0.7549  ...       <NA>       <NA>       <NA>   \n",
       "11  0.8667    0.7869     0.6867  ...   0.609589        0.5   0.648649   \n",
       "12  0.7022     0.962     0.9412  ...        1.0        1.0       <NA>   \n",
       "13  0.7756    0.9569     0.9217  ...       <NA>       <NA>       <NA>   \n",
       "14  0.6622    0.8379     0.7585  ...       <NA>       <NA>       <NA>   \n",
       "15  0.7222    0.7917     0.7084  ...   0.666667   0.666667   0.688488   \n",
       "\n",
       "    ?A1>A4(-)  ?A3∅A4(+)  ?A3∅A4(-)  ?A1=A3+A4(+)  ?A1=A3+A4(-)  J(1-34)+  \\\n",
       "0        <NA>       <NA>       <NA>       0.93007      0.857143  0.963338   \n",
       "1        <NA>       <NA>       <NA>          <NA>          <NA>      <NA>   \n",
       "2        <NA>       <NA>       <NA>          <NA>          <NA>      <NA>   \n",
       "3    0.666667    0.60274        0.0       0.56338         0.625  0.735675   \n",
       "4        <NA>       <NA>       <NA>      0.892086      0.454545  0.942163   \n",
       "5        <NA>       <NA>       <NA>          <NA>          <NA>      <NA>   \n",
       "6        <NA>       <NA>       <NA>          <NA>          <NA>      <NA>   \n",
       "7         0.5   0.739437       0.25      0.452555      0.538462  0.702392   \n",
       "8        <NA>       <NA>       <NA>      0.903448           0.4  0.939812   \n",
       "9        <NA>       <NA>       <NA>          <NA>          <NA>      <NA>   \n",
       "10       <NA>       <NA>       <NA>          <NA>          <NA>      <NA>   \n",
       "11        0.5   0.883562       0.25       0.39726          0.25  0.692465   \n",
       "12       <NA>       <NA>       <NA>      0.908665      0.565217  0.948456   \n",
       "13       <NA>       <NA>       <NA>          <NA>          <NA>      <NA>   \n",
       "14       <NA>       <NA>       <NA>          <NA>          <NA>      <NA>   \n",
       "15   0.571429   0.741935     0.1875      0.470588          0.52  0.710102   \n",
       "\n",
       "    J(1-34)-  \n",
       "0     0.9864  \n",
       "1       <NA>  \n",
       "2       <NA>  \n",
       "3   0.855812  \n",
       "4   0.787336  \n",
       "5       <NA>  \n",
       "6       <NA>  \n",
       "7   0.634269  \n",
       "8      0.599  \n",
       "9       <NA>  \n",
       "10      <NA>  \n",
       "11  0.474675  \n",
       "12  0.806978  \n",
       "13      <NA>  \n",
       "14      <NA>  \n",
       "15  0.679628  \n",
       "\n",
       "[16 rows x 32 columns]"
      ]
     },
     "execution_count": 204,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rebuild the base summary, then add the (+)/(-) relation breakdown once per task.\n",
    "df_summary = summary(df_analysis)\n",
    "\n",
    "df_summary = update_summary_by_relations(\n",
    "    df_analysis=df_analysis,\n",
    "    df_summary=df_summary,\n",
    "    task=\"zero-shot\",  # first pass: fills the rows whose action is \"zero-shot\"\n",
    "    # relation_truths & metric_spec are omitted, so the function's defaults are used\n",
    ")\n",
    "\n",
    "df_summary = update_summary_by_relations(\n",
    "    df_analysis=df_analysis,\n",
    "    df_summary=df_summary,\n",
    "    task=\"classification\",  # second pass: fills the rows whose action is \"classification\"\n",
    "    # relation_truths & metric_spec are omitted, so the function's defaults are used\n",
    ")\n",
    "print(df_summary.shape)\n",
    "print(df_summary.columns)\n",
    "df_summary\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6f5c4b0d",
   "metadata": {},
   "source": [
    "#### P-values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78aba9dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from statsmodels.stats.contingency_tables import mcnemar\n",
    "\n",
    "# Predicate columns compared between each action and zero-shot (tested as 0/1 values).\n",
    "PREDICATES = ['?A1=A2', '?A1=A3+A4', '?A1>A3', '?A1>A4', '?A3∅A4']\n",
    "# Output p-value column names; paired positionally with PREDICATES via zip().\n",
    "P_COLS     = ['p(A1=A2)', 'p(A1=A3+A4)', 'p(A1>A3)', 'p(A1>A4)', 'p(A3∅A4)']\n",
    "\n",
    "def _mcnemar_p(z, a):\n",
    "    \"\"\"\n",
    "    Exact McNemar test on two paired 0/1 vectors of equal length.\n",
    "\n",
    "    Returns a tuple (p_value, n10, n01) where n10 counts pairs with z == 1\n",
    "    and a == 0, and n01 counts pairs with z == 0 and a == 1. When there are\n",
    "    no discordant pairs the p-value is 1.0 and the test is not run.\n",
    "    \"\"\"\n",
    "    only_z = (z == 1) & (a == 0)\n",
    "    only_a = (z == 0) & (a == 1)\n",
    "    n10 = only_z.sum()\n",
    "    n01 = only_a.sum()\n",
    "\n",
    "    if (n10 + n01) != 0:\n",
    "        # Only the off-diagonal (discordant) cells are filled in\n",
    "        table = [[0, n01], [n10, 0]]\n",
    "        outcome = mcnemar(table, exact=True)\n",
    "        return float(outcome.pvalue), n10, n01\n",
    "\n",
    "    # no discordant pairs -> identical performance\n",
    "    return 1.0, n10, n01\n",
    "\n",
    "def _compare_group(g, dataset_label):\n",
    "    \"\"\"\n",
    "    Run action-vs-zero-shot McNemar tests for all predicates within group g.\n",
    "\n",
    "    Every action present in g (zero-shot included) is paired row-by-row with\n",
    "    the zero-shot rows, and one result row is produced per action holding one\n",
    "    p-value per predicate.\n",
    "\n",
    "    Rows are paired on ('dataset', 'Q_ID') when g has a 'dataset' column, so\n",
    "    that a pooled group spanning several datasets with overlapping Q_ID\n",
    "    values does not pair a question with same-numbered questions from other\n",
    "    datasets (which would multiply the pairs and deflate the p-values).\n",
    "    Without a 'dataset' column the pairing falls back to 'Q_ID' alone.\n",
    "\n",
    "    Args:\n",
    "        g (pd.DataFrame): Rows with 'action', 'llm', 'Q_ID' and the PREDICATES columns.\n",
    "        dataset_label (str): Value written to the 'dataset' column of the result.\n",
    "\n",
    "    Returns:\n",
    "        pd.DataFrame: One row per action with 'dataset', 'llm', 'action' and the\n",
    "                      P_COLS columns; empty when g contains no 'zero-shot' rows.\n",
    "    \"\"\"\n",
    "    out_rows = []\n",
    "    if 'zero-shot' not in set(g['action']):\n",
    "        return pd.DataFrame(out_rows)\n",
    "\n",
    "    # A question is identified by its dataset plus Q_ID, not by Q_ID alone\n",
    "    pair_keys = (['dataset'] if 'dataset' in g.columns else []) + ['Q_ID']\n",
    "    cols = pair_keys + PREDICATES\n",
    "\n",
    "    base = g.loc[g['action'] == 'zero-shot', cols]\n",
    "\n",
    "    for action, g_act in g.groupby('action'):\n",
    "        merged = base.merge(g_act[cols], on=pair_keys, suffixes=('_zero', '_act'))\n",
    "        if merged.empty:\n",
    "            continue\n",
    "\n",
    "        row = {\n",
    "            'dataset': dataset_label,\n",
    "            'llm': g['llm'].iloc[0],\n",
    "            'action': action\n",
    "        }\n",
    "\n",
    "        for pred, pcol in zip(PREDICATES, P_COLS):\n",
    "            # _mcnemar_p also returns the discordant counts (n10, n01);\n",
    "            # only the p-value is reported here.\n",
    "            pval, _n10, _n01 = _mcnemar_p(merged[f'{pred}_zero'], merged[f'{pred}_act'])\n",
    "            row[pcol] = pval\n",
    "\n",
    "        out_rows.append(row)\n",
    "\n",
    "    return pd.DataFrame(out_rows)\n",
    "\n",
    "def compute_pvals(df: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    For each (dataset, llm, action) compute McNemar p(action vs zero-shot).\n",
    "\n",
    "    Zero-shot itself is included as an action (compared with itself).\n",
    "    An 'overall' dataset, pooled across datasets per llm, is added as well.\n",
    "    Groups for which _compare_group yields no rows (e.g. no zero-shot\n",
    "    baseline) are left out.\n",
    "\n",
    "    Args:\n",
    "        df (pd.DataFrame): Rows with 'dataset' and 'llm' columns plus whatever\n",
    "                           _compare_group reads from each group.\n",
    "\n",
    "    Returns:\n",
    "        pd.DataFrame: Wide frame with 'dataset', 'llm', 'action' and the P_COLS\n",
    "                      p-value columns (unrounded), sorted by dataset, llm and\n",
    "                      action. Empty, but with those columns, when nothing\n",
    "                      could be compared.\n",
    "    \"\"\"\n",
    "    # per-dataset\n",
    "    frames = [\n",
    "        _compare_group(g, dataset)\n",
    "        for (dataset, _llm), g in df.groupby(['dataset', 'llm'])\n",
    "    ]\n",
    "\n",
    "    # overall (pool datasets) per llm\n",
    "    frames += [_compare_group(g, 'overall') for _llm, g in df.groupby('llm')]\n",
    "\n",
    "    # Column-less empty frames would make the sort below fail on missing keys\n",
    "    frames = [frame for frame in frames if not frame.empty]\n",
    "    if not frames:\n",
    "        return pd.DataFrame(columns=['dataset', 'llm', 'action'] + P_COLS)\n",
    "\n",
    "    res = pd.concat(frames, ignore_index=True)\n",
    "    return res.sort_values(['dataset', 'llm', 'action'], ignore_index=True)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "id": "392db99c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>llm</th>\n",
       "      <th>action</th>\n",
       "      <th>p(A1=A2)</th>\n",
       "      <th>p(A1=A3+A4)</th>\n",
       "      <th>p(A1&gt;A3)</th>\n",
       "      <th>p(A1&gt;A4)</th>\n",
       "      <th>p(A3∅A4)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>overall</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>classification</td>\n",
       "      <td>4.528857e-72</td>\n",
       "      <td>9.877908e-127</td>\n",
       "      <td>6.879105e-136</td>\n",
       "      <td>1.150670e-95</td>\n",
       "      <td>0.230875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>overall</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>fixing</td>\n",
       "      <td>2.453163e-72</td>\n",
       "      <td>1.439585e-114</td>\n",
       "      <td>1.863491e-85</td>\n",
       "      <td>7.762493e-109</td>\n",
       "      <td>0.000601</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>overall</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>3.176076e-13</td>\n",
       "      <td>1.191867e-10</td>\n",
       "      <td>4.601966e-08</td>\n",
       "      <td>8.012738e-07</td>\n",
       "      <td>0.000264</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>overall</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>classification</td>\n",
       "      <td>7.878384e-08</td>\n",
       "      <td>1.187939e-14</td>\n",
       "      <td>9.094947e-13</td>\n",
       "      <td>2.841261e-09</td>\n",
       "      <td>0.087159</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>fixing</td>\n",
       "      <td>9.430375e-07</td>\n",
       "      <td>1.541878e-12</td>\n",
       "      <td>1.076842e-09</td>\n",
       "      <td>1.455192e-10</td>\n",
       "      <td>0.486850</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>1.037158e-04</td>\n",
       "      <td>9.887175e-02</td>\n",
       "      <td>1.995909e-01</td>\n",
       "      <td>6.075914e-01</td>\n",
       "      <td>0.014633</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>classification</td>\n",
       "      <td>1.358992e-09</td>\n",
       "      <td>2.256225e-16</td>\n",
       "      <td>1.776357e-15</td>\n",
       "      <td>4.355627e-12</td>\n",
       "      <td>0.850554</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>fixing</td>\n",
       "      <td>3.481659e-13</td>\n",
       "      <td>1.641048e-15</td>\n",
       "      <td>1.000444e-11</td>\n",
       "      <td>6.821210e-13</td>\n",
       "      <td>0.021271</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>7.915897e-03</td>\n",
       "      <td>2.563208e-03</td>\n",
       "      <td>4.095959e-02</td>\n",
       "      <td>1.189205e-01</td>\n",
       "      <td>0.006611</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>spinach</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>classification</td>\n",
       "      <td>4.290179e-13</td>\n",
       "      <td>1.412841e-19</td>\n",
       "      <td>3.469447e-18</td>\n",
       "      <td>4.618528e-14</td>\n",
       "      <td>0.663624</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>fixing</td>\n",
       "      <td>4.618528e-14</td>\n",
       "      <td>5.361785e-21</td>\n",
       "      <td>1.187939e-14</td>\n",
       "      <td>4.440892e-16</td>\n",
       "      <td>0.078354</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>2.409291e-02</td>\n",
       "      <td>4.561533e-03</td>\n",
       "      <td>1.266034e-02</td>\n",
       "      <td>6.600448e-03</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      dataset    llm          action      p(A1=A2)    p(A1=A3+A4)  \\\n",
       "0     overall  gpt-5  classification  4.528857e-72  9.877908e-127   \n",
       "1     overall  gpt-5          fixing  2.453163e-72  1.439585e-114   \n",
       "2     overall  gpt-5        wikidata  3.176076e-13   1.191867e-10   \n",
       "3     overall  gpt-5       zero-shot  1.000000e+00   1.000000e+00   \n",
       "4      qawiki  gpt-5  classification  7.878384e-08   1.187939e-14   \n",
       "5      qawiki  gpt-5          fixing  9.430375e-07   1.541878e-12   \n",
       "6      qawiki  gpt-5        wikidata  1.037158e-04   9.887175e-02   \n",
       "7      qawiki  gpt-5       zero-shot  1.000000e+00   1.000000e+00   \n",
       "8     spinach  gpt-5  classification  1.358992e-09   2.256225e-16   \n",
       "9     spinach  gpt-5          fixing  3.481659e-13   1.641048e-15   \n",
       "10    spinach  gpt-5        wikidata  7.915897e-03   2.563208e-03   \n",
       "11    spinach  gpt-5       zero-shot  1.000000e+00   1.000000e+00   \n",
       "12  synthetic  gpt-5  classification  4.290179e-13   1.412841e-19   \n",
       "13  synthetic  gpt-5          fixing  4.618528e-14   5.361785e-21   \n",
       "14  synthetic  gpt-5        wikidata  2.409291e-02   4.561533e-03   \n",
       "15  synthetic  gpt-5       zero-shot  1.000000e+00   1.000000e+00   \n",
       "\n",
       "         p(A1>A3)       p(A1>A4)  p(A3∅A4)  \n",
       "0   6.879105e-136   1.150670e-95  0.230875  \n",
       "1    1.863491e-85  7.762493e-109  0.000601  \n",
       "2    4.601966e-08   8.012738e-07  0.000264  \n",
       "3    1.000000e+00   1.000000e+00  1.000000  \n",
       "4    9.094947e-13   2.841261e-09  0.087159  \n",
       "5    1.076842e-09   1.455192e-10  0.486850  \n",
       "6    1.995909e-01   6.075914e-01  0.014633  \n",
       "7    1.000000e+00   1.000000e+00  1.000000  \n",
       "8    1.776357e-15   4.355627e-12  0.850554  \n",
       "9    1.000444e-11   6.821210e-13  0.021271  \n",
       "10   4.095959e-02   1.189205e-01  0.006611  \n",
       "11   1.000000e+00   1.000000e+00  1.000000  \n",
       "12   3.469447e-18   4.618528e-14  0.663624  \n",
       "13   1.187939e-14   4.440892e-16  0.078354  \n",
       "14   1.266034e-02   6.600448e-03  1.000000  \n",
       "15   1.000000e+00   1.000000e+00  1.000000  "
      ]
     },
     "execution_count": 222,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_pval = compute_pvals(df_analysis)\n",
    "df_pval\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "id": "f1be0d2e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset</th>\n",
       "      <th>action</th>\n",
       "      <th>llm</th>\n",
       "      <th>?A1=A2</th>\n",
       "      <th>?A1=A3+A4</th>\n",
       "      <th>?A1&gt;A3</th>\n",
       "      <th>?A1&gt;A4</th>\n",
       "      <th>?A3∅A4</th>\n",
       "      <th>J(A1-A2)</th>\n",
       "      <th>J(A1-A34)</th>\n",
       "      <th>...</th>\n",
       "      <th>?A3∅A4(-)</th>\n",
       "      <th>?A1=A3+A4(+)</th>\n",
       "      <th>?A1=A3+A4(-)</th>\n",
       "      <th>J(1-34)+</th>\n",
       "      <th>J(1-34)-</th>\n",
       "      <th>p(A1=A2)</th>\n",
       "      <th>p(A1=A3+A4)</th>\n",
       "      <th>p(A1&gt;A3)</th>\n",
       "      <th>p(A1&gt;A4)</th>\n",
       "      <th>p(A3∅A4)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9133</td>\n",
       "      <td>0.9267</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.52</td>\n",
       "      <td>0.9636</td>\n",
       "      <td>0.9644</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0.93007</td>\n",
       "      <td>0.857143</td>\n",
       "      <td>0.963338</td>\n",
       "      <td>0.9864</td>\n",
       "      <td>7.878384e-08</td>\n",
       "      <td>1.187939e-14</td>\n",
       "      <td>9.094947e-13</td>\n",
       "      <td>2.841261e-09</td>\n",
       "      <td>0.087159</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8933</td>\n",
       "      <td>0.8933</td>\n",
       "      <td>0.9533</td>\n",
       "      <td>0.9867</td>\n",
       "      <td>0.62</td>\n",
       "      <td>0.9418</td>\n",
       "      <td>0.9191</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>9.430375e-07</td>\n",
       "      <td>1.541878e-12</td>\n",
       "      <td>1.076842e-09</td>\n",
       "      <td>1.455192e-10</td>\n",
       "      <td>0.486850</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.82</td>\n",
       "      <td>0.64</td>\n",
       "      <td>0.7867</td>\n",
       "      <td>0.7667</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.8907</td>\n",
       "      <td>0.7825</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1.037158e-04</td>\n",
       "      <td>9.887175e-02</td>\n",
       "      <td>1.995909e-01</td>\n",
       "      <td>6.075914e-01</td>\n",
       "      <td>0.014633</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>qawiki</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.68</td>\n",
       "      <td>0.5667</td>\n",
       "      <td>0.7267</td>\n",
       "      <td>0.74</td>\n",
       "      <td>0.5867</td>\n",
       "      <td>0.7996</td>\n",
       "      <td>0.7421</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.56338</td>\n",
       "      <td>0.625</td>\n",
       "      <td>0.735675</td>\n",
       "      <td>0.855812</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>spinach</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9333</td>\n",
       "      <td>0.86</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.9625</td>\n",
       "      <td>0.9308</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0.892086</td>\n",
       "      <td>0.454545</td>\n",
       "      <td>0.942163</td>\n",
       "      <td>0.787336</td>\n",
       "      <td>1.358992e-09</td>\n",
       "      <td>2.256225e-16</td>\n",
       "      <td>1.776357e-15</td>\n",
       "      <td>4.355627e-12</td>\n",
       "      <td>0.850554</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>spinach</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.94</td>\n",
       "      <td>0.84</td>\n",
       "      <td>0.94</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.78</td>\n",
       "      <td>0.9592</td>\n",
       "      <td>0.8977</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>3.481659e-13</td>\n",
       "      <td>1.641048e-15</td>\n",
       "      <td>1.000444e-11</td>\n",
       "      <td>6.821210e-13</td>\n",
       "      <td>0.021271</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>spinach</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.5867</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>0.62</td>\n",
       "      <td>0.8193</td>\n",
       "      <td>0.7381</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>7.915897e-03</td>\n",
       "      <td>2.563208e-03</td>\n",
       "      <td>4.095959e-02</td>\n",
       "      <td>1.189205e-01</td>\n",
       "      <td>0.006611</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>spinach</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.6333</td>\n",
       "      <td>0.46</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.6733</td>\n",
       "      <td>0.7133</td>\n",
       "      <td>0.7885</td>\n",
       "      <td>0.6965</td>\n",
       "      <td>...</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.452555</td>\n",
       "      <td>0.538462</td>\n",
       "      <td>0.702392</td>\n",
       "      <td>0.634269</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.8933</td>\n",
       "      <td>0.8867</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.8867</td>\n",
       "      <td>0.9599</td>\n",
       "      <td>0.9285</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0.903448</td>\n",
       "      <td>0.4</td>\n",
       "      <td>0.939812</td>\n",
       "      <td>0.599</td>\n",
       "      <td>4.290179e-13</td>\n",
       "      <td>1.412841e-19</td>\n",
       "      <td>3.469447e-18</td>\n",
       "      <td>4.618528e-14</td>\n",
       "      <td>0.663624</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.88</td>\n",
       "      <td>0.9</td>\n",
       "      <td>0.9667</td>\n",
       "      <td>0.9933</td>\n",
       "      <td>0.9267</td>\n",
       "      <td>0.9697</td>\n",
       "      <td>0.9482</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>4.618528e-14</td>\n",
       "      <td>5.361785e-21</td>\n",
       "      <td>1.187939e-14</td>\n",
       "      <td>4.440892e-16</td>\n",
       "      <td>0.078354</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.54</td>\n",
       "      <td>0.7333</td>\n",
       "      <td>0.78</td>\n",
       "      <td>0.8667</td>\n",
       "      <td>0.8036</td>\n",
       "      <td>0.7549</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>2.409291e-02</td>\n",
       "      <td>4.561533e-03</td>\n",
       "      <td>1.266034e-02</td>\n",
       "      <td>6.600448e-03</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>synthetic</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.5533</td>\n",
       "      <td>0.3933</td>\n",
       "      <td>0.6067</td>\n",
       "      <td>0.6467</td>\n",
       "      <td>0.8667</td>\n",
       "      <td>0.7869</td>\n",
       "      <td>0.6867</td>\n",
       "      <td>...</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.39726</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.692465</td>\n",
       "      <td>0.474675</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>overall</td>\n",
       "      <td>classification</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9133</td>\n",
       "      <td>0.8911</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.9733</td>\n",
       "      <td>0.7022</td>\n",
       "      <td>0.962</td>\n",
       "      <td>0.9412</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0.908665</td>\n",
       "      <td>0.565217</td>\n",
       "      <td>0.948456</td>\n",
       "      <td>0.806978</td>\n",
       "      <td>4.528857e-72</td>\n",
       "      <td>9.877908e-127</td>\n",
       "      <td>6.879105e-136</td>\n",
       "      <td>1.150670e-95</td>\n",
       "      <td>0.230875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>overall</td>\n",
       "      <td>fixing</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.9044</td>\n",
       "      <td>0.8778</td>\n",
       "      <td>0.9533</td>\n",
       "      <td>0.9844</td>\n",
       "      <td>0.7756</td>\n",
       "      <td>0.9569</td>\n",
       "      <td>0.9217</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>2.453163e-72</td>\n",
       "      <td>1.439585e-114</td>\n",
       "      <td>1.863491e-85</td>\n",
       "      <td>7.762493e-109</td>\n",
       "      <td>0.000601</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>overall</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.7467</td>\n",
       "      <td>0.5889</td>\n",
       "      <td>0.7578</td>\n",
       "      <td>0.7667</td>\n",
       "      <td>0.6622</td>\n",
       "      <td>0.8379</td>\n",
       "      <td>0.7585</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>3.176076e-13</td>\n",
       "      <td>1.191867e-10</td>\n",
       "      <td>4.601966e-08</td>\n",
       "      <td>8.012738e-07</td>\n",
       "      <td>0.000264</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>overall</td>\n",
       "      <td>zero-shot</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>0.6222</td>\n",
       "      <td>0.4733</td>\n",
       "      <td>0.6667</td>\n",
       "      <td>0.6867</td>\n",
       "      <td>0.7222</td>\n",
       "      <td>0.7917</td>\n",
       "      <td>0.7084</td>\n",
       "      <td>...</td>\n",
       "      <td>0.1875</td>\n",
       "      <td>0.470588</td>\n",
       "      <td>0.52</td>\n",
       "      <td>0.710102</td>\n",
       "      <td>0.679628</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>16 rows × 37 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      dataset          action    llm  ?A1=A2  ?A1=A3+A4  ?A1>A3  ?A1>A4  \\\n",
       "0      qawiki  classification  gpt-5  0.9133     0.9267     1.0  0.9733   \n",
       "1      qawiki          fixing  gpt-5  0.8933     0.8933  0.9533  0.9867   \n",
       "2      qawiki        wikidata  gpt-5    0.82       0.64  0.7867  0.7667   \n",
       "3      qawiki       zero-shot  gpt-5    0.68     0.5667  0.7267    0.74   \n",
       "4     spinach  classification  gpt-5  0.9333       0.86     1.0  0.9733   \n",
       "5     spinach          fixing  gpt-5    0.94       0.84    0.94  0.9733   \n",
       "6     spinach        wikidata  gpt-5  0.7533     0.5867  0.7533  0.7533   \n",
       "7     spinach       zero-shot  gpt-5  0.6333       0.46  0.6667  0.6733   \n",
       "8   synthetic  classification  gpt-5  0.8933     0.8867     1.0  0.9733   \n",
       "9   synthetic          fixing  gpt-5    0.88        0.9  0.9667  0.9933   \n",
       "10  synthetic        wikidata  gpt-5  0.6667       0.54  0.7333    0.78   \n",
       "11  synthetic       zero-shot  gpt-5  0.5533     0.3933  0.6067  0.6467   \n",
       "12    overall  classification  gpt-5  0.9133     0.8911     1.0  0.9733   \n",
       "13    overall          fixing  gpt-5  0.9044     0.8778  0.9533  0.9844   \n",
       "14    overall        wikidata  gpt-5  0.7467     0.5889  0.7578  0.7667   \n",
       "15    overall       zero-shot  gpt-5  0.6222     0.4733  0.6667  0.6867   \n",
       "\n",
       "    ?A3∅A4  J(A1-A2)  J(A1-A34)  ...  ?A3∅A4(-)  ?A1=A3+A4(+)  ?A1=A3+A4(-)  \\\n",
       "0     0.52    0.9636     0.9644  ...       <NA>       0.93007      0.857143   \n",
       "1     0.62    0.9418     0.9191  ...       <NA>          <NA>          <NA>   \n",
       "2      0.5    0.8907     0.7825  ...       <NA>          <NA>          <NA>   \n",
       "3   0.5867    0.7996     0.7421  ...        0.0       0.56338         0.625   \n",
       "4      0.7    0.9625     0.9308  ...       <NA>      0.892086      0.454545   \n",
       "5     0.78    0.9592     0.8977  ...       <NA>          <NA>          <NA>   \n",
       "6     0.62    0.8193     0.7381  ...       <NA>          <NA>          <NA>   \n",
       "7   0.7133    0.7885     0.6965  ...       0.25      0.452555      0.538462   \n",
       "8   0.8867    0.9599     0.9285  ...       <NA>      0.903448           0.4   \n",
       "9   0.9267    0.9697     0.9482  ...       <NA>          <NA>          <NA>   \n",
       "10  0.8667    0.8036     0.7549  ...       <NA>          <NA>          <NA>   \n",
       "11  0.8667    0.7869     0.6867  ...       0.25       0.39726          0.25   \n",
       "12  0.7022     0.962     0.9412  ...       <NA>      0.908665      0.565217   \n",
       "13  0.7756    0.9569     0.9217  ...       <NA>          <NA>          <NA>   \n",
       "14  0.6622    0.8379     0.7585  ...       <NA>          <NA>          <NA>   \n",
       "15  0.7222    0.7917     0.7084  ...     0.1875      0.470588          0.52   \n",
       "\n",
       "    J(1-34)+  J(1-34)-      p(A1=A2)    p(A1=A3+A4)       p(A1>A3)  \\\n",
       "0   0.963338    0.9864  7.878384e-08   1.187939e-14   9.094947e-13   \n",
       "1       <NA>      <NA>  9.430375e-07   1.541878e-12   1.076842e-09   \n",
       "2       <NA>      <NA>  1.037158e-04   9.887175e-02   1.995909e-01   \n",
       "3   0.735675  0.855812  1.000000e+00   1.000000e+00   1.000000e+00   \n",
       "4   0.942163  0.787336  1.358992e-09   2.256225e-16   1.776357e-15   \n",
       "5       <NA>      <NA>  3.481659e-13   1.641048e-15   1.000444e-11   \n",
       "6       <NA>      <NA>  7.915897e-03   2.563208e-03   4.095959e-02   \n",
       "7   0.702392  0.634269  1.000000e+00   1.000000e+00   1.000000e+00   \n",
       "8   0.939812     0.599  4.290179e-13   1.412841e-19   3.469447e-18   \n",
       "9       <NA>      <NA>  4.618528e-14   5.361785e-21   1.187939e-14   \n",
       "10      <NA>      <NA>  2.409291e-02   4.561533e-03   1.266034e-02   \n",
       "11  0.692465  0.474675  1.000000e+00   1.000000e+00   1.000000e+00   \n",
       "12  0.948456  0.806978  4.528857e-72  9.877908e-127  6.879105e-136   \n",
       "13      <NA>      <NA>  2.453163e-72  1.439585e-114   1.863491e-85   \n",
       "14      <NA>      <NA>  3.176076e-13   1.191867e-10   4.601966e-08   \n",
       "15  0.710102  0.679628  1.000000e+00   1.000000e+00   1.000000e+00   \n",
       "\n",
       "         p(A1>A4)  p(A3∅A4)  \n",
       "0    2.841261e-09  0.087159  \n",
       "1    1.455192e-10  0.486850  \n",
       "2    6.075914e-01  0.014633  \n",
       "3    1.000000e+00  1.000000  \n",
       "4    4.355627e-12  0.850554  \n",
       "5    6.821210e-13  0.021271  \n",
       "6    1.189205e-01  0.006611  \n",
       "7    1.000000e+00  1.000000  \n",
       "8    4.618528e-14  0.663624  \n",
       "9    4.440892e-16  0.078354  \n",
       "10   6.600448e-03  1.000000  \n",
       "11   1.000000e+00  1.000000  \n",
       "12   1.150670e-95  0.230875  \n",
       "13  7.762493e-109  0.000601  \n",
       "14   8.012738e-07  0.000264  \n",
       "15   1.000000e+00  1.000000  \n",
       "\n",
       "[16 rows x 37 columns]"
      ]
     },
     "execution_count": 224,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_summery = df_summary.merge(df_pval, on=[\"dataset\",\"llm\",\"action\"], how=\"left\")\n",
    "df_summery"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "id": "ca70299a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['dataset', 'action', 'llm', '?A1=A2', '?A1=A3+A4', '?A1>A3', '?A1>A4',\n",
       "       '?A3∅A4', 'J(A1-A2)', 'J(A1-A34)', 'J(A1-A1*)', 'J(A1-A1**)',\n",
       "       'J(A1*-A1**)', 'idk_A1', 'idk_A2', 'idk_A3', 'idk_A4', 'J_A1_ave',\n",
       "       '?A1=A2(+)', '?A1=A2(-)', 'J(1-2)+', 'J(1-2)-', '?A1>A3(+)',\n",
       "       '?A1>A3(-)', '?A1>A4(+)', '?A1>A4(-)', '?A3∅A4(+)', '?A3∅A4(-)',\n",
       "       '?A1=A3+A4(+)', '?A1=A3+A4(-)', 'J(1-34)+', 'J(1-34)-', 'p(A1=A2)',\n",
       "       'p(A1=A3+A4)', 'p(A1>A3)', 'p(A1>A4)', 'p(A3∅A4)'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 225,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_summery.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f986f114",
   "metadata": {},
   "source": [
    "# TODO:\n",
    "\n",
    "1. relation-classification analysis\n",
    "\n",
    "    In the zero-shot, there are relation classification to see \n",
    " - if the LLMs able to understand the relation between answers correctly. \n",
    " - if incorrect, normally which ones misclassified to which ones? Probably confusion matrix here. \n",
    "    \n",
    "2. internal inconsistency\n",
    "\n",
    "    We observe that even we ask the exactly same questions multiple times, there are always with difference/inconsistency. \n",
    " - how much these internal inconsistency they are? \n",
    " - how to exclude them into the inconsistency between questions, say properly evaluate the inconsistency caused by questions.\n",
    " - the impact factor of these internal inconsistency to answers inconsistency? \n",
    "\n",
    "3. *correct relation identification lead consistency answers? \n",
    "\n",
    "    We have the relation classification in classification-and-question actions\n",
    "- the inconsistency if LLMs identify relation correctly, as well as when they identify incorrectly;\n",
    "- the impact factor of this relation identification to inconsistency? \n",
    "\n",
    "4. can we build the impact factors model to attribute the cause of inconsistency? \n",
    " We can conclude some factors such as internal inconsistency, relation-identification capability (semantic understanding to text), ...\n",
    "\n",
    "5. how can we mitigate the inconsistency? \n",
    "\n",
    "    we tried several actions, and let's do further detailed analysis to actions and consequencs. \n",
    "- wikidata: consistency improved, the trade-off is more \"idk\" answers.\n",
    "\n",
    "6. *an overall analysis include all datasets together. \n",
    "\n",
    "\n",
    "7. explain of how we get the data, and the properties of each datasets. \n",
    "\n",
    "8. *empty ratio need be include \"idk\", rename is no answers or idk. \n",
    "\n",
    "9. *compute p-values.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1de4d42",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
