{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "89317569",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "def summarize_json_scores(folder):\n",
    "    \"\"\"\n",
    "    Summarize the numeric scores stored in every .json file of a folder.\n",
    "\n",
    "    Each file is expected to hold one JSON object whose values are numeric\n",
    "    scores. Files are visited in sorted filename order, and files whose\n",
    "    object is empty are skipped.\n",
    "\n",
    "    Args:\n",
    "        folder: Directory to scan (sub-directories are not searched).\n",
    "\n",
    "    Returns:\n",
    "        A pandas DataFrame with one row per non-empty file and the columns\n",
    "        Filename, Average Score (rounded to 4 decimals), Count of 1s, Size.\n",
    "        The columns are present even when no file qualifies.\n",
    "    \"\"\"\n",
    "    columns = [\"Filename\", \"Average Score\", \"Count of 1s\", \"Size\"]\n",
    "    rows = []\n",
    "\n",
    "    for filename in sorted(os.listdir(folder)):\n",
    "        if not filename.endswith(\".json\"):\n",
    "            continue\n",
    "\n",
    "        filepath = os.path.join(folder, filename)\n",
    "\n",
    "        # Read as UTF-8 explicitly instead of relying on the locale default.\n",
    "        with open(filepath, \"r\", encoding=\"utf-8\") as f:\n",
    "            data = json.load(f)\n",
    "\n",
    "        values = list(data.values())\n",
    "        if not values:\n",
    "            continue\n",
    "\n",
    "        rows.append({\n",
    "            \"Filename\": filename,\n",
    "            \"Average Score\": round(sum(values) / len(values), 4),\n",
    "            \"Count of 1s\": sum(1 for v in values if v == 1),\n",
    "            \"Size\": len(values),\n",
    "        })\n",
    "\n",
    "    # Passing the column list keeps the schema stable for an empty result.\n",
    "    return pd.DataFrame(rows, columns=columns)\n",
    "\n",
    "# Example usage\n",
    "# df = summarize_json_scores(\"../data/evaluation_results/\")\n",
    "# print(df)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "181ad737",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                   Filename  Average Score  Count of 1s  Size\n",
      "0  Containment_gpt-4.1-mini-2025-04-14.json         0.9800          147   150\n",
      "1  Containment_gpt-4.1-nano-2025-04-14.json         0.4000           60   150\n",
      "2                  Containment_gpt-4.1.json         0.9933          149   150\n",
      "3       Equivalence_gpt-4.1-2025-04-14.json         0.9667          145   150\n",
      "4  Equivalence_gpt-4.1-mini-2025-04-14.json         0.9600          144   150\n",
      "5  Equivalence_gpt-4.1-nano-2025-04-14.json         0.9467          142   150\n",
      "6             Minus_gpt-4.1-2025-04-14.json         0.4533           68   150\n",
      "7        Minus_gpt-4.1-mini-2025-04-14.json         0.4800           72   150\n",
      "8        Minus_gpt-4.1-nano-2025-04-14.json         0.0133            2   150\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "# Datasets whose relation-classification score files are summarized below.\n",
    "datasets = ['spinach']\n",
    "\n",
    "for dataset in datasets:\n",
    "    # Folder of zero-shot relation-classification score files for this dataset.\n",
    "    folder = f'../data/answers/zero-shot/{dataset}/relation-classification/'\n",
    "    # summarize_json_scores comes from an earlier cell, which must be run first.\n",
    "    df = summarize_json_scores(folder)\n",
    "    print(df)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1864ee3d",
   "metadata": {},
   "source": [
    "### Evaluation Pipeline "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "13fd45b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def jaccard_similarity(set1, set2):\n",
    "    \"\"\"\n",
    "    Return the Jaccard similarity of two sets.\n",
    "\n",
    "    The score is the size of the intersection divided by the size of the\n",
    "    union. When the union is empty (both inputs empty) the score is 0.0.\n",
    "    \"\"\"\n",
    "    shared = set1.intersection(set2)\n",
    "    combined = set1.union(set2)\n",
    "    return len(shared) / len(combined) if combined else 0.0\n",
    "\n",
    "# For the subset-superset check\n",
    "def is_subset(set1, set2):\n",
    "    \"\"\"Return True when every element of set1 also appears in set2.\"\"\"\n",
    "    # set1 is a subset exactly when nothing is left after removing set2's elements.\n",
    "    leftover = set1.difference(set2)\n",
    "    return not leftover"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "c65423f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "def compute_similarity(\n",
    "    model: str,\n",
    "    task: str,\n",
    "    action: str,\n",
    "    base_dir: str = '../data/answers',\n",
    "    output_dir: str = '.',\n",
    "    star: bool = False\n",
    "):\n",
    "    \"\"\"\n",
    "    Score one model's answer sets for a task and save per-question metrics as TSV.\n",
    "\n",
    "    Each answer file is a JSON object mapping a question id to a collection of\n",
    "    answers. What is compared depends on `task`:\n",
    "\n",
    "    * 'equal'   - Q1 vs. Q2: Jaccard similarity and exact set equality.\n",
    "    * 'sup-sub' - Q1 (equal) vs. the union of Q3 (sup-sub) and Q4 (minus);\n",
    "      BinaryEqual is 1 when Q3 is a subset of Q1, and the last column is 1\n",
    "      when Q3 and Q4 share no answer.\n",
    "    * 'minus'   - Q4 (minus) vs. the set difference Q1 - Q3.\n",
    "\n",
    "    IsEmptySet is 1 when both compared sets are empty (for 'sup-sub' this looks\n",
    "    at Q1 and Q3 only).\n",
    "\n",
    "    Args:\n",
    "        model: Model name embedded in the answer filenames.\n",
    "        task: One of 'equal', 'sup-sub' or 'minus'.\n",
    "        action: Text placed directly before the model name in the answer\n",
    "            filenames and after it in the output filename (may be empty).\n",
    "        base_dir: Directory containing the equal/, sup-sub/ and minus/ folders.\n",
    "        output_dir: Directory that receives the TSV; created when missing.\n",
    "        star: When true, input and output filenames are prefixed with '*'.\n",
    "\n",
    "    Returns:\n",
    "        None. On success the file <prefix><task>_<model>_<action>_wikidata.tsv\n",
    "        is written to output_dir. If an answer file cannot be read or parsed, a\n",
    "        message is printed and nothing is written.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If task is not one of the supported values.\n",
    "    \"\"\"\n",
    "    prefix = '*' if star else ''\n",
    "\n",
    "    def answers_path(subdir, question):\n",
    "        # e.g. <base_dir>/equal/Q1_equal_answers_<action><model>_wikidata.json\n",
    "        return os.path.join(\n",
    "            base_dir, subdir,\n",
    "            f'{prefix}{question}_{subdir}_answers_{action}{model}_wikidata.json'\n",
    "        )\n",
    "\n",
    "    # Build file paths\n",
    "    if task == 'equal':\n",
    "        paths = [answers_path('equal', 'Q1'), answers_path('equal', 'Q2')]\n",
    "    elif task in ('minus', 'sup-sub'):\n",
    "        # Both tasks read the same three files; only the comparison differs.\n",
    "        paths = [\n",
    "            answers_path('equal', 'Q1'),\n",
    "            answers_path('sup-sub', 'Q3'),\n",
    "            answers_path('minus', 'Q4'),\n",
    "        ]\n",
    "    else:\n",
    "        raise ValueError(f\"Unknown task {task!r}; expected 'equal', 'sup-sub' or 'minus'\")\n",
    "\n",
    "    # Load data. Only I/O and parse failures are treated as a missing input;\n",
    "    # anything else is a programming error and should surface.\n",
    "    loaded = []\n",
    "    try:\n",
    "        for path in paths:\n",
    "            with open(path, 'r', encoding='utf-8') as f:\n",
    "                loaded.append(json.load(f))\n",
    "    except (OSError, ValueError) as exc:\n",
    "        print(f\"Error loading files: {', '.join(paths)} ({exc})\")\n",
    "        return None\n",
    "    ql1_answers, ql2_answers = loaded[0], loaded[1]\n",
    "    ql3_answers = loaded[2] if len(loaded) == 3 else None\n",
    "\n",
    "    # Compute metrics per question\n",
    "    similarity_scores = {}\n",
    "    if task == 'minus':\n",
    "        # Compare each Q4 answer with the difference Q1 - Q3 for the same question.\n",
    "        for qid, ans3 in ql3_answers.items():\n",
    "            set3 = set(ans3)\n",
    "            expected = set(ql1_answers.get(qid, [])) - set(ql2_answers.get(qid, []))\n",
    "            sim = jaccard_similarity(set3, expected)\n",
    "            is_empty = int(not set3 and not expected)\n",
    "            binary_eq = int(expected == set3)\n",
    "            similarity_scores[qid] = (sim, is_empty, binary_eq, 0)\n",
    "    else:\n",
    "        for qid, ans1 in ql1_answers.items():\n",
    "            set1 = set(ans1)\n",
    "            set2 = set(ql2_answers.get(qid, []))\n",
    "            is_empty = int(not set1 and not set2)\n",
    "            if task == 'equal':\n",
    "                sim = jaccard_similarity(set1, set2)\n",
    "                binary_eq = int(set1 == set2)\n",
    "                disjoint = 0\n",
    "            else:\n",
    "                set_c = set(ql3_answers.get(qid, []))\n",
    "                sim = jaccard_similarity(set1, set2.union(set_c))\n",
    "                binary_eq = int(is_subset(set2, set1))\n",
    "                disjoint = int(set2.isdisjoint(set_c))\n",
    "            similarity_scores[qid] = (sim, is_empty, binary_eq, disjoint)\n",
    "\n",
    "    # Create DataFrame\n",
    "    sim_df = pd.DataFrame.from_dict(\n",
    "        similarity_scores,\n",
    "        orient='index',\n",
    "        columns=['JaccardSimilarity', 'IsEmptySet', 'BinaryEqual','B_Intersected_C_EmptyCount']\n",
    "    )\n",
    "\n",
    "    # Ensure index is a column\n",
    "    sim_df = sim_df.reset_index().rename(columns={'index': 'QuestionID'})\n",
    "\n",
    "    # Save to TSV, creating the target directory on first use.\n",
    "    if output_dir:\n",
    "        os.makedirs(output_dir, exist_ok=True)\n",
    "    output_filename = f'{prefix}{task}_{model}_{action}_wikidata.tsv'\n",
    "    output_path = os.path.join(output_dir, output_filename)\n",
    "    sim_df.to_csv(output_path, sep='\\t', index=False)\n",
    "    print(f\"Saved similarity results to {output_path}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "6a280277",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved similarity results to ../data/evaluation_results/spinach/equal_gpt-4.1-nano-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/equal_gpt-4.1-nano-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/sup-sub_gpt-4.1-nano-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/sup-sub_gpt-4.1-nano-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/minus_gpt-4.1-nano-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/minus_gpt-4.1-nano-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/equal_gpt-4.1-mini-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/equal_gpt-4.1-mini-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/sup-sub_gpt-4.1-mini-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/sup-sub_gpt-4.1-mini-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/minus_gpt-4.1-mini-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/minus_gpt-4.1-mini-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/equal_gpt-4.1-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/equal_gpt-4.1-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/sup-sub_gpt-4.1-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/sup-sub_gpt-4.1-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/minus_gpt-4.1-2025-04-14__wikidata.tsv\n",
      "Saved similarity results to ../data/evaluation_results/spinach/minus_gpt-4.1-2025-04-14__wikidata.tsv\n"
     ]
    }
   ],
   "source": [
    "# Example usage: score every (dataset, model, task, action) combination.\n",
    "model = ['gpt-4.1-nano-2025-04-14',\"gpt-4.1-mini-2025-04-14\",\"gpt-4.1-2025-04-14\"]\n",
    "task = ['equal', 'sup-sub', \"minus\"]\n",
    "action = [\"\"]\n",
    "# Only operations[0] is evaluated below; change the index to score another run.\n",
    "operations = ['zero-shot',\"follow_up_fixing\",\"rel_classification_and_questions\"]\n",
    "datasets = ['spinach']\n",
    "for dataset in datasets:\n",
    "    for m in model:\n",
    "        for t in task:\n",
    "            for a in action:\n",
    "                # star defaults to False; pass star=True to score the\n",
    "                # '*'-prefixed answer files instead.\n",
    "                compute_similarity(\n",
    "                    model=m,\n",
    "                    task=t,\n",
    "                    action=a,\n",
    "                    base_dir=f'../data/answers/{operations[0]}/{dataset}',\n",
    "                    output_dir=f'../data/evaluation_results/{dataset}',\n",
    "                )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "4337d49e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Task</th>\n",
       "      <th>Model</th>\n",
       "      <th>Action</th>\n",
       "      <th>Average_All</th>\n",
       "      <th>Ratio_empty</th>\n",
       "      <th>Average_NoEmpty</th>\n",
       "      <th>Binary_count</th>\n",
       "      <th>B_Intersected_C_EmptyCount</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>equal</td>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td></td>\n",
       "      <td>0.7438</td>\n",
       "      <td>0.0400</td>\n",
       "      <td>0.7748</td>\n",
       "      <td>0.4733</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>equal</td>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>0.7980</td>\n",
       "      <td>0.0267</td>\n",
       "      <td>0.8199</td>\n",
       "      <td>0.5000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>equal</td>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td>classAndAnswer</td>\n",
       "      <td>0.9249</td>\n",
       "      <td>0.0515</td>\n",
       "      <td>0.9751</td>\n",
       "      <td>0.9485</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>equal</td>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.9275</td>\n",
       "      <td>0.0467</td>\n",
       "      <td>0.9730</td>\n",
       "      <td>0.9067</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>equal</td>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td></td>\n",
       "      <td>0.7116</td>\n",
       "      <td>0.1000</td>\n",
       "      <td>0.7907</td>\n",
       "      <td>0.5067</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>equal</td>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>0.7297</td>\n",
       "      <td>0.0333</td>\n",
       "      <td>0.7549</td>\n",
       "      <td>0.4400</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>equal</td>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td>classAndAnswer</td>\n",
       "      <td>0.8755</td>\n",
       "      <td>0.1067</td>\n",
       "      <td>0.9800</td>\n",
       "      <td>0.9400</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>equal</td>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.9138</td>\n",
       "      <td>0.0333</td>\n",
       "      <td>0.9453</td>\n",
       "      <td>0.8533</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>equal</td>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td></td>\n",
       "      <td>0.6025</td>\n",
       "      <td>0.1600</td>\n",
       "      <td>0.7172</td>\n",
       "      <td>0.3467</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>equal</td>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>0.6103</td>\n",
       "      <td>0.1333</td>\n",
       "      <td>0.7042</td>\n",
       "      <td>0.3333</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>equal</td>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td>classAndAnswer</td>\n",
       "      <td>0.0155</td>\n",
       "      <td>0.9800</td>\n",
       "      <td>0.7755</td>\n",
       "      <td>0.8333</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>equal</td>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.6881</td>\n",
       "      <td>0.2733</td>\n",
       "      <td>0.9469</td>\n",
       "      <td>0.7600</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>minus</td>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td></td>\n",
       "      <td>0.3465</td>\n",
       "      <td>0.3600</td>\n",
       "      <td>0.5414</td>\n",
       "      <td>0.1533</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>minus</td>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>0.3664</td>\n",
       "      <td>0.3600</td>\n",
       "      <td>0.5726</td>\n",
       "      <td>0.1667</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>minus</td>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td>classAndAnswer</td>\n",
       "      <td>0.3317</td>\n",
       "      <td>0.5533</td>\n",
       "      <td>0.7426</td>\n",
       "      <td>0.3933</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>minus</td>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.6476</td>\n",
       "      <td>0.1867</td>\n",
       "      <td>0.7963</td>\n",
       "      <td>0.5200</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>minus</td>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td></td>\n",
       "      <td>0.2616</td>\n",
       "      <td>0.4267</td>\n",
       "      <td>0.4563</td>\n",
       "      <td>0.0933</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>minus</td>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>0.2763</td>\n",
       "      <td>0.3933</td>\n",
       "      <td>0.4555</td>\n",
       "      <td>0.0667</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>minus</td>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td>classAndAnswer</td>\n",
       "      <td>0.3798</td>\n",
       "      <td>0.4533</td>\n",
       "      <td>0.6947</td>\n",
       "      <td>0.3400</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>minus</td>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.5571</td>\n",
       "      <td>0.2667</td>\n",
       "      <td>0.7597</td>\n",
       "      <td>0.4667</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>minus</td>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td></td>\n",
       "      <td>0.1749</td>\n",
       "      <td>0.5933</td>\n",
       "      <td>0.4300</td>\n",
       "      <td>0.0933</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>minus</td>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>0.1955</td>\n",
       "      <td>0.5333</td>\n",
       "      <td>0.4190</td>\n",
       "      <td>0.0800</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>minus</td>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td>classAndAnswer</td>\n",
       "      <td>0.0003</td>\n",
       "      <td>0.9933</td>\n",
       "      <td>0.0392</td>\n",
       "      <td>0.8600</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>minus</td>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.1702</td>\n",
       "      <td>0.6000</td>\n",
       "      <td>0.4255</td>\n",
       "      <td>0.1533</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>sup-sub</td>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td></td>\n",
       "      <td>0.5839</td>\n",
       "      <td>0.0600</td>\n",
       "      <td>0.6211</td>\n",
       "      <td>0.5467</td>\n",
       "      <td>99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>sup-sub</td>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>0.6200</td>\n",
       "      <td>0.0533</td>\n",
       "      <td>0.6550</td>\n",
       "      <td>0.5667</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>sup-sub</td>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td>classAndAnswer</td>\n",
       "      <td>0.7016</td>\n",
       "      <td>0.0825</td>\n",
       "      <td>0.7646</td>\n",
       "      <td>0.6701</td>\n",
       "      <td>85</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>sup-sub</td>\n",
       "      <td>gpt-4.1-2025-04-14</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.8123</td>\n",
       "      <td>0.0333</td>\n",
       "      <td>0.8403</td>\n",
       "      <td>0.8000</td>\n",
       "      <td>141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>sup-sub</td>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td></td>\n",
       "      <td>0.5521</td>\n",
       "      <td>0.0733</td>\n",
       "      <td>0.5958</td>\n",
       "      <td>0.5067</td>\n",
       "      <td>78</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>sup-sub</td>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>0.5670</td>\n",
       "      <td>0.0400</td>\n",
       "      <td>0.5906</td>\n",
       "      <td>0.4867</td>\n",
       "      <td>73</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>sup-sub</td>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td>classAndAnswer</td>\n",
       "      <td>0.5671</td>\n",
       "      <td>0.1733</td>\n",
       "      <td>0.6860</td>\n",
       "      <td>0.6400</td>\n",
       "      <td>133</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>sup-sub</td>\n",
       "      <td>gpt-4.1-mini-2025-04-14</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.7235</td>\n",
       "      <td>0.0733</td>\n",
       "      <td>0.7807</td>\n",
       "      <td>0.6533</td>\n",
       "      <td>127</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>sup-sub</td>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td></td>\n",
       "      <td>0.4385</td>\n",
       "      <td>0.1667</td>\n",
       "      <td>0.5263</td>\n",
       "      <td>0.6067</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>sup-sub</td>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td>wikidata</td>\n",
       "      <td>0.4769</td>\n",
       "      <td>0.1133</td>\n",
       "      <td>0.5379</td>\n",
       "      <td>0.5733</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>sup-sub</td>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td>classAndAnswer</td>\n",
       "      <td>0.0065</td>\n",
       "      <td>0.9800</td>\n",
       "      <td>0.3248</td>\n",
       "      <td>0.7533</td>\n",
       "      <td>148</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>sup-sub</td>\n",
       "      <td>gpt-4.1-nano-2025-04-14</td>\n",
       "      <td>fixing</td>\n",
       "      <td>0.3216</td>\n",
       "      <td>0.4200</td>\n",
       "      <td>0.5546</td>\n",
       "      <td>0.4600</td>\n",
       "      <td>85</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Task                    Model          Action  Average_All  \\\n",
       "0     equal       gpt-4.1-2025-04-14                       0.7438   \n",
       "1     equal       gpt-4.1-2025-04-14        wikidata       0.7980   \n",
       "2     equal       gpt-4.1-2025-04-14  classAndAnswer       0.9249   \n",
       "3     equal       gpt-4.1-2025-04-14          fixing       0.9275   \n",
       "4     equal  gpt-4.1-mini-2025-04-14                       0.7116   \n",
       "5     equal  gpt-4.1-mini-2025-04-14        wikidata       0.7297   \n",
       "6     equal  gpt-4.1-mini-2025-04-14  classAndAnswer       0.8755   \n",
       "7     equal  gpt-4.1-mini-2025-04-14          fixing       0.9138   \n",
       "8     equal  gpt-4.1-nano-2025-04-14                       0.6025   \n",
       "9     equal  gpt-4.1-nano-2025-04-14        wikidata       0.6103   \n",
       "10    equal  gpt-4.1-nano-2025-04-14  classAndAnswer       0.0155   \n",
       "11    equal  gpt-4.1-nano-2025-04-14          fixing       0.6881   \n",
       "12    minus       gpt-4.1-2025-04-14                       0.3465   \n",
       "13    minus       gpt-4.1-2025-04-14        wikidata       0.3664   \n",
       "14    minus       gpt-4.1-2025-04-14  classAndAnswer       0.3317   \n",
       "15    minus       gpt-4.1-2025-04-14          fixing       0.6476   \n",
       "16    minus  gpt-4.1-mini-2025-04-14                       0.2616   \n",
       "17    minus  gpt-4.1-mini-2025-04-14        wikidata       0.2763   \n",
       "18    minus  gpt-4.1-mini-2025-04-14  classAndAnswer       0.3798   \n",
       "19    minus  gpt-4.1-mini-2025-04-14          fixing       0.5571   \n",
       "20    minus  gpt-4.1-nano-2025-04-14                       0.1749   \n",
       "21    minus  gpt-4.1-nano-2025-04-14        wikidata       0.1955   \n",
       "22    minus  gpt-4.1-nano-2025-04-14  classAndAnswer       0.0003   \n",
       "23    minus  gpt-4.1-nano-2025-04-14          fixing       0.1702   \n",
       "24  sup-sub       gpt-4.1-2025-04-14                       0.5839   \n",
       "25  sup-sub       gpt-4.1-2025-04-14        wikidata       0.6200   \n",
       "26  sup-sub       gpt-4.1-2025-04-14  classAndAnswer       0.7016   \n",
       "27  sup-sub       gpt-4.1-2025-04-14          fixing       0.8123   \n",
       "28  sup-sub  gpt-4.1-mini-2025-04-14                       0.5521   \n",
       "29  sup-sub  gpt-4.1-mini-2025-04-14        wikidata       0.5670   \n",
       "30  sup-sub  gpt-4.1-mini-2025-04-14  classAndAnswer       0.5671   \n",
       "31  sup-sub  gpt-4.1-mini-2025-04-14          fixing       0.7235   \n",
       "32  sup-sub  gpt-4.1-nano-2025-04-14                       0.4385   \n",
       "33  sup-sub  gpt-4.1-nano-2025-04-14        wikidata       0.4769   \n",
       "34  sup-sub  gpt-4.1-nano-2025-04-14  classAndAnswer       0.0065   \n",
       "35  sup-sub  gpt-4.1-nano-2025-04-14          fixing       0.3216   \n",
       "\n",
       "    Ratio_empty  Average_NoEmpty  Binary_count  B_Intersected_C_EmptyCount  \n",
       "0        0.0400           0.7748        0.4733                           0  \n",
       "1        0.0267           0.8199        0.5000                           0  \n",
       "2        0.0515           0.9751        0.9485                           0  \n",
       "3        0.0467           0.9730        0.9067                           0  \n",
       "4        0.1000           0.7907        0.5067                           0  \n",
       "5        0.0333           0.7549        0.4400                           0  \n",
       "6        0.1067           0.9800        0.9400                           0  \n",
       "7        0.0333           0.9453        0.8533                           0  \n",
       "8        0.1600           0.7172        0.3467                           0  \n",
       "9        0.1333           0.7042        0.3333                           0  \n",
       "10       0.9800           0.7755        0.8333                           0  \n",
       "11       0.2733           0.9469        0.7600                           0  \n",
       "12       0.3600           0.5414        0.1533                           0  \n",
       "13       0.3600           0.5726        0.1667                           0  \n",
       "14       0.5533           0.7426        0.3933                           0  \n",
       "15       0.1867           0.7963        0.5200                           0  \n",
       "16       0.4267           0.4563        0.0933                           0  \n",
       "17       0.3933           0.4555        0.0667                           0  \n",
       "18       0.4533           0.6947        0.3400                           0  \n",
       "19       0.2667           0.7597        0.4667                           0  \n",
       "20       0.5933           0.4300        0.0933                           0  \n",
       "21       0.5333           0.4190        0.0800                           0  \n",
       "22       0.9933           0.0392        0.8600                           0  \n",
       "23       0.6000           0.4255        0.1533                           0  \n",
       "24       0.0600           0.6211        0.5467                          99  \n",
       "25       0.0533           0.6550        0.5667                          97  \n",
       "26       0.0825           0.7646        0.6701                          85  \n",
       "27       0.0333           0.8403        0.8000                         141  \n",
       "28       0.0733           0.5958        0.5067                          78  \n",
       "29       0.0400           0.5906        0.4867                          73  \n",
       "30       0.1733           0.6860        0.6400                         133  \n",
       "31       0.0733           0.7807        0.6533                         127  \n",
       "32       0.1667           0.5263        0.6067                          97  \n",
       "33       0.1133           0.5379        0.5733                          82  \n",
       "34       0.9800           0.3248        0.7533                         148  \n",
       "35       0.4200           0.5546        0.4600                          85  "
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "import re\n",
    "import pandas as pd\n",
    "\n",
    "def summarize_evaluation_results(folder):\n",
    "    \"\"\"\n",
    "    Summarize per-question TSV evaluation files in a folder.\n",
    "    Expects filenames in the format: task-model-action.tsv\n",
    "    and each TSV contains columns: QuestionID, JaccardSimilarity, IsEmptySet, BinaryEqual\n",
    "    \"\"\"\n",
    "    rows = []\n",
    "\n",
    "    for filename in sorted(os.listdir(folder)):\n",
    "        if not filename.endswith(\".tsv\"):\n",
    "            continue\n",
    "\n",
    "        # Extract metadata from filename\n",
    "        base = filename[:-4]  # remove .tsv\n",
    "        parts = base.split('_')\n",
    "        task = parts[0]\n",
    "        action = parts[-1]\n",
    "        model = parts[1]\n",
    "\n",
    "        # Read TSV\n",
    "        df = pd.read_csv(os.path.join(folder, filename), sep='\\t')\n",
    "\n",
    "        # Ensure JaccardSimilarity column exists\n",
    "        if 'JaccardSimilarity' not in df.columns:\n",
    "            raise ValueError(f\"Missing 'JaccardSimilarity' in {filename}\")\n",
    "\n",
    "        # Compute overall average\n",
    "        avg_all = df['JaccardSimilarity'].mean()\n",
    "\n",
    "        # Identify empty (zero) Jaccard entries\n",
    "        is_empty = df['JaccardSimilarity'] == 0\n",
    "        ratio_empty = is_empty.mean()\n",
    "\n",
    "        # Average excluding empty entries\n",
    "        non_empty = df.loc[~is_empty, 'JaccardSimilarity']\n",
    "        avg_non_empty = non_empty.mean() if not non_empty.empty else float('nan')\n",
    "\n",
    "        # Binary count: proportion where BinaryEqual == 1\n",
    "        if 'BinaryEqual' in df.columns:\n",
    "            binary_count = (df['BinaryEqual'] == 1).mean()\n",
    "        else:\n",
    "            binary_count = float('nan')\n",
    "        \n",
    "        if 'B_Intersected_C_EmptyCount' in df.columns:\n",
    "            # Calculate the number of questions where B_Intersected_C_EmptyCount is 0\n",
    "            b_intersected_c_empty_count = (df['B_Intersected_C_EmptyCount'] == 1).sum()\n",
    "        else:\n",
    "            b_intersected_c_empty_count = float('nan')\n",
    "\n",
    "        rows.append({\n",
    "            \"Task\": task,\n",
    "            \"Model\": model,\n",
    "            \"Action\": action,\n",
    "            \"Average_All\": round(avg_all, 4),\n",
    "            \"Ratio_empty\": round(ratio_empty, 4),\n",
    "            \"Average_NoEmpty\": round(avg_non_empty, 4),\n",
    "            \"Binary_count\": round(binary_count, 4),\n",
    "            \"B_Intersected_C_EmptyCount\": b_intersected_c_empty_count,\n",
    "        })\n",
    "\n",
    "    return pd.DataFrame(rows)\n",
    "\n",
    "# Example usage:\n",
    "dataset = 'spinach'\n",
    "df = summarize_evaluation_results(f\"../data/evaluation_results/{dataset}/\")\n",
    "df.to_csv(f'../data/evaluation_results/summary_{dataset}.tsv', sep='\\t', index=False)\n",
    "df\n",
    "\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
