{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import re\n",
    "import matplotlib.pyplot as plt\n",
    "from collections import defaultdict\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_json_data(file_path):\n",
    "    \"\"\"Loads data from a JSON file.\"\"\"\n",
    "    with open(file_path, 'r') as file:\n",
    "        return json.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_final_answer(response_text):\n",
    "    \"\"\"\n",
    "    Extracts the numerical part or special keywords of the final answer from the response text.\n",
    "    \"\"\"\n",
    "    if response_text is None:\n",
    "        return 'N/A'\n",
    "    special_keywords = ['Unknown', 'None']\n",
    "    pattern = r\"Final Numerical Answer.*?:.*?([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?|\\bUnknown\\b|\\bNone\\b)\"\n",
    "    matches = list(re.finditer(pattern, response_text, re.DOTALL))\n",
    "    match = None if not matches else matches[-1]\n",
    "    if match:\n",
    "        if match.group(1) in special_keywords:\n",
    "            return match.group(1)  # return special keyword\n",
    "        return float(match.group(1))  # convert numeric answers to float\n",
    "    return 'N/A'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_final_answer_from_response(item):\n",
    "    \"\"\"\n",
    "    Extracts the final answer from item[\"response\"] and rounds numeric answers to 6 decimals.\n",
    "    The special outcomes 'Unknown' / 'None' and the 'N/A' marker are returned unchanged.\n",
    "    \"\"\"\n",
    "    answer = extract_final_answer(item[\"response\"])\n",
    "    # anything that is not one of the string markers is a float and gets rounded\n",
    "    if answer in ('Unknown', 'None', 'N/A'):\n",
    "        return answer\n",
    "    return round(answer, 6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prep_data_mapping(data):\n",
    "    \"\"\"\n",
    "    Converts data to a dataframe.\n",
    "    \"\"\"\n",
    "    data_list = [\n",
    "        {\n",
    "            \"id\": item[\"id\"],\n",
    "            \"template_id\": item[\"template_id\"],\n",
    "            \"problem\": item[\"problem\"],\n",
    "            \"full_solution\": item[\"full_solution\"],\n",
    "            \"expected\": item[\"answer\"],\n",
    "            \"category\": item[\"category\"],\n",
    "            \"level\": item[\"level\"],\n",
    "            \"source\": item[\"source\"],\n",
    "            \"netlist\": item[\"netlist\"]\n",
    "        }\n",
    "        for item in data\n",
    "    ]\n",
    "\n",
    "    # convert to dataframe\n",
    "    df = pd.DataFrame(data_list)\n",
    "    return df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prep_response_mapping(responses):\n",
    "    \"\"\"\n",
    "    Builds a dataframe with one row per response: the problem id (as \"id\"),\n",
    "    the raw response text, and the final answer extracted from it.\n",
    "    \"\"\"\n",
    "    rows = []\n",
    "    for item in responses:\n",
    "        rows.append({\n",
    "            \"id\": item[\"problem_id\"],\n",
    "            \"response\": item[\"response\"],\n",
    "            \"final_answer\": get_final_answer_from_response(item),\n",
    "        })\n",
    "\n",
    "    return pd.DataFrame(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def merge_dataframes(data, responses):\n",
    "    \"\"\"\n",
    "    Joins each response with the problem it answers and returns the combined dataframe.\n",
    "    \"\"\"\n",
    "    problems_df = prep_data_mapping(data)\n",
    "    answers_df = prep_response_mapping(responses)\n",
    "\n",
    "    # left join on id: every response row is kept, even without a matching problem\n",
    "    return answers_df.merge(problems_df, on=\"id\", how=\"left\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the problem-set JSON; read with load_json_data(DATA_PATH) below and inside the load_* helpers.\n",
    "DATA_PATH = \"../data/new/data.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parsed contents of DATA_PATH, as returned by json.load.\n",
    "data = load_json_data(DATA_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Problem set as a dataframe.\n",
    "# NOTE(review): no later cell in this notebook reads data_df (the load_* helpers re-read DATA_PATH) - confirm it is still needed.\n",
    "data_df = prep_data_mapping(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Response Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_correct(row, outcomes=['Unknown', 'None']):\n",
    "    \"\"\"\n",
    "    Checks whether an answer is correct. Spcial outcomes considered incorrect.\n",
    "    Numerical answers considered correct if within 0.001.\n",
    "    \"\"\"\n",
    "    if row['final_answer'] in outcomes or row['final_answer'] == 'N/A':\n",
    "        return False\n",
    "    try:\n",
    "        return abs(round(float(row['final_answer']), 6) - float(row['expected'])) < 0.001\n",
    "    except ValueError:\n",
    "        return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_statistics(df):\n",
    "    \"\"\"\n",
    "    Grades every row with is_correct and aggregates the results.\n",
    "\n",
    "    Side effect: the per-row verdict is written to df['is_correct'].\n",
    "\n",
    "    Returns a 5-tuple:\n",
    "        global_correct   - number of correct rows\n",
    "        global_totals    - number of rows\n",
    "        template_correct - {template_id: number of correct rows}\n",
    "        template_totals  - {template_id: number of rows}\n",
    "        special_counts   - {'Unknown' | 'None' | 'N/A': {template_id: number of rows with that answer}}\n",
    "    \"\"\"\n",
    "    df['is_correct'] = df.apply(is_correct, axis=1)\n",
    "\n",
    "    by_template = df.groupby('template_id')\n",
    "    template_totals = by_template.size().to_dict()\n",
    "    template_correct = by_template['is_correct'].sum().to_dict()\n",
    "\n",
    "    # per-template counts of the non-numeric answers\n",
    "    special_counts = {}\n",
    "    for outcome in ('Unknown', 'None', 'N/A'):\n",
    "        rows_with_outcome = df[df['final_answer'] == outcome]\n",
    "        special_counts[outcome] = rows_with_outcome.groupby('template_id').size().to_dict()\n",
    "\n",
    "    return df['is_correct'].sum(), len(df), template_correct, template_totals, special_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_statistics_human(df):\n",
    "    outcomes=['Unknown', 'None']\n",
    "    # df['is_correct_human'] = df.apply(lambda row: is_correct(row), axis=1)\n",
    "\n",
    "    global_correct = df['is_correct_human'].sum()\n",
    "    global_totals = len(df)\n",
    "\n",
    "    template_correct = df.groupby('template_id')['is_correct_human'].sum().to_dict()\n",
    "    template_totals = df.groupby('template_id').size().to_dict()\n",
    "\n",
    "    special_counts = {outcome: df[df['final_answer'] == outcome].groupby('template_id').size().to_dict() for outcome in outcomes}\n",
    "    special_counts['N/A'] = df[df['final_answer'] == 'N/A'].groupby('template_id').size().to_dict()\n",
    "\n",
    "    return global_correct, global_totals, template_correct, template_totals, special_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_statistics(global_correct, global_totals, template_correct, template_totals):\n",
    "    try:\n",
    "        global_accuracy = global_correct / global_totals\n",
    "    except:\n",
    "        global_accuracy = 0\n",
    "    print(f\"Global Accuracy: {global_accuracy:.2%} ({global_correct} out of {global_totals})\")\n",
    "\n",
    "    for k in range(5, 0, -1):\n",
    "        template_num_correct = sum(1 for t_id, count in template_correct.items() if count >= k)\n",
    "        template_accuracy = template_num_correct / len(template_totals) if len(template_totals) != 0 else 0\n",
    "        print(f\"Template {k}/5: {template_accuracy:.2%} ({template_num_correct} out of {len(template_totals)})\")\n",
    "\n",
    "    # print(\"Template Correct Counts:\")\n",
    "    # for template_id, correct in template_correct.items():\n",
    "    #     total = template_totals[template_id]\n",
    "    #     print(f\"Template ID {template_id}: {correct}/{total}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_template_statistics(template_correct, template_totals, special_counts):\n",
    "    \"\"\"\n",
    "    Draws a grouped bar chart of response outcomes per template and shows it.\n",
    "\n",
    "    Each template gets one bar for correct responses, one for general incorrect\n",
    "    responses (total minus correct minus all special outcomes), and one bar per\n",
    "    special outcome.\n",
    "\n",
    "    Args:\n",
    "        template_correct: {template_id: number of correct responses}.\n",
    "        template_totals: {template_id: number of responses}; its keys define the x axis.\n",
    "        special_counts: {outcome: {template_id: number of responses with that outcome}}.\n",
    "    \"\"\"\n",
    "    labels = list(template_totals.keys())\n",
    "    correct_counts = [template_correct.get(t_id, 0) for t_id in labels]\n",
    "\n",
    "    # one list of per-template counts for every special outcome\n",
    "    special_outcome_counts = {\n",
    "        outcome: [per_template.get(t_id, 0) for t_id in labels]\n",
    "        for outcome, per_template in special_counts.items()\n",
    "    }\n",
    "\n",
    "    # incorrect counts excluding special outcomes\n",
    "    incorrect_counts = [\n",
    "        template_totals[t_id] - correct - sum(counts[pos] for counts in special_outcome_counts.values())\n",
    "        for pos, (t_id, correct) in enumerate(zip(labels, correct_counts))\n",
    "    ]\n",
    "\n",
    "    series = [('Correct', correct_counts), ('Incorrect (General)', incorrect_counts)]\n",
    "    series += [(f'Incorrect ({outcome})', counts) for outcome, counts in special_outcome_counts.items()]\n",
    "\n",
    "    bar_width = 0.15\n",
    "    index = np.arange(len(labels))\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(20, 6))\n",
    "    for slot, (label, counts) in enumerate(series):\n",
    "        ax.bar(index + slot * bar_width, counts, bar_width, label=label)\n",
    "\n",
    "    ax.set_xlabel('Template ID')\n",
    "    ax.set_ylabel('Number of Responses')\n",
    "    ax.set_title('Response Accuracy per Template ID')\n",
    "    # Bars are centred on index + slot * bar_width, so the middle of a group of\n",
    "    # n bars lies (n - 1) / 2 bar widths to the right of the first bar.\n",
    "    ax.set_xticks(index + bar_width * (len(series) - 1) / 2)\n",
    "    ax.set_xticklabels(labels, rotation=45)\n",
    "    fig.subplots_adjust(bottom=0.2)  # adjust bottom\n",
    "    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=3)  # legend below the axes\n",
    "    fig.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Automatic Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_data_and_print_statistics(EXPERIMENT_NUM):\n",
    "    \"\"\"\n",
    "    Loads the responses of one experiment, grades them automatically, and prints\n",
    "    the statistics for all questions (with a per-template plot), then separately\n",
    "    for questions with and without a netlist.\n",
    "    \"\"\"\n",
    "    PATH = f\"../run/outputs/resp-experiment-{EXPERIMENT_NUM}.json\"\n",
    "    print(f\"ANALYZING EXPERIMENT {EXPERIMENT_NUM}\\n----------------------\")\n",
    "    responses = load_json_data(PATH)\n",
    "    problems = load_json_data(DATA_PATH)\n",
    "    resps = merge_dataframes(problems, responses)\n",
    "\n",
    "    # stats = (global_correct, global_totals, template_correct, template_totals, special_counts)\n",
    "    stats = compute_statistics(resps)\n",
    "    print_statistics(*stats[:4])\n",
    "    plot_template_statistics(*stats[2:])\n",
    "\n",
    "    # a question has a netlist when the field is neither missing nor an empty string\n",
    "    has_netlist = resps['netlist'].notna() & resps['netlist'].ne('')\n",
    "    # NOTE(review): 'witout' is a typo in the printed header; kept so the output is unchanged\n",
    "    subsets = (\n",
    "        (\"\\nOnly questions with netlists\\n------\", resps[has_netlist].copy()),\n",
    "        (\"\\nOnly questions witout netlists\\n------\", resps[~has_netlist].copy()),\n",
    "    )\n",
    "    for header, subset in subsets:\n",
    "        print(header)\n",
    "        print_statistics(*compute_statistics(subset)[:4])\n",
    "\n",
    "    print(\"\\n\\n\\n\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_statistics_human(resps, plot=False):\n",
    "    \"\"\"\n",
    "    Prints the statistics based on the human verdicts for all rows of resps\n",
    "    (optionally with the per-template plot), then separately for questions with\n",
    "    and without a netlist.\n",
    "    \"\"\"\n",
    "    # stats = (global_correct, global_totals, template_correct, template_totals, special_counts)\n",
    "    stats = compute_statistics_human(resps)\n",
    "    print_statistics(*stats[:4])\n",
    "    if plot:\n",
    "        plot_template_statistics(*stats[2:])\n",
    "\n",
    "    # a question has a netlist when the field is neither missing nor an empty string\n",
    "    has_netlist = resps['netlist'].notna() & resps['netlist'].ne('')\n",
    "    # NOTE(review): 'witout' is a typo in the printed header; kept so the output is unchanged\n",
    "    subsets = (\n",
    "        (\"\\nOnly questions with netlists\\n------\", resps[has_netlist].copy()),\n",
    "        (\"\\nOnly questions witout netlists\\n------\", resps[~has_netlist].copy()),\n",
    "    )\n",
    "    for header, subset in subsets:\n",
    "        print(header)\n",
    "        print_statistics(*compute_statistics_human(subset)[:4])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Human vs Auto"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_automatic(EXPERIMENT_NUM):\n",
    "    \"\"\"\n",
    "    Loads the responses of one experiment, joins them with the problem set, and\n",
    "    adds the automatic verdict as the 'is_correct' column.\n",
    "    \"\"\"\n",
    "    PATH = f\"../run/outputs/resp-experiment-{EXPERIMENT_NUM}.json\"\n",
    "    responses = load_json_data(PATH)\n",
    "    problems = load_json_data(DATA_PATH)\n",
    "\n",
    "    resps = merge_dataframes(problems, responses)\n",
    "    resps['is_correct'] = resps.apply(is_correct, axis=1)\n",
    "    return resps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_human(EXPERIMENT_NUM):\n",
    "    PATH = f\"human_analysis_files/experiment-{EXPERIMENT_NUM}.csv\"\n",
    "    df = pd.read_csv(PATH)\n",
    "    df.replace({'Yes': True, 'No': False}, inplace=True)\n",
    "\n",
    "    error_columns = [\"math_err\", \"formatting_err\", \"reasoning_err\", \"topology_err\", \"direction_err\"]\n",
    "    df['is_correct_human'] = df.apply(lambda row:\n",
    "                                      False if row[error_columns].any()\n",
    "                                      else row['is_correct_human'], axis=1)\n",
    "\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_and_merge(EXPERIMENT_NUM):\n",
    "    \"\"\"\n",
    "    Returns the automatic results and the human annotations of one experiment,\n",
    "    joined on id (only ids present in both are kept).\n",
    "    \"\"\"\n",
    "    automatic_df = load_automatic(EXPERIMENT_NUM)\n",
    "    human_df = load_human(EXPERIMENT_NUM)\n",
    "    return automatic_df.merge(human_df, on=\"id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def error_stats(df):\n",
    "    total_entries = len(df)\n",
    "    false_positives = len(df[(df['is_correct'] == True) & (df['is_correct_human'] == False)])\n",
    "    false_negatives = len(df[(df['is_correct'] == False) & (df['is_correct_human'] == True)])\n",
    "    math_errors = len(df[df['math_err'] == True])\n",
    "    formatting_errors = len(df[df['formatting_err'] == True])\n",
    "    reasoning_errors = len(df[df['reasoning_err'] == True])\n",
    "    topology_errors = len(df[df['topology_err'] == True])\n",
    "    direction_errors = len(df[df['direction_err'] == True])\n",
    "\n",
    "    error_columns = [\"formatting_err\", \"reasoning_err\", \"topology_err\", \"direction_err\"]\n",
    "    math_only = len(df[(df['math_err'] == True) & (df[error_columns].sum(axis=1) == 0)])\n",
    "\n",
    "    print(f\"Total: {total_entries}\")\n",
    "    print(f\"Auto global accuracy: {len(df[(df['is_correct'] == True)])/total_entries * 100:.2f}\")\n",
    "    print(f\"Human global accuracy: {len(df[(df['is_correct_human'] == True)])/total_entries * 100:.2f}\")\n",
    "    print(f\"False Positives: {false_positives/total_entries * 100:.2f}% ({false_positives}/{total_entries})\")\n",
    "    print(f\"False Negatives: {false_negatives/total_entries * 100:.2f}% ({false_negatives}/{total_entries})\")\n",
    "    print(f\"Math Errors: {math_errors/total_entries * 100:.2f}% ({math_errors}/{total_entries})\")\n",
    "    print(f\"Math ONLY Errors: {math_only/total_entries * 100:.2f}% ({math_only}/{total_entries})\")\n",
    "    print(f\"Formatting Errors: {formatting_errors/total_entries * 100:.2f}% ({formatting_errors}/{total_entries})\")\n",
    "    print(f\"Reasoning Errors: {reasoning_errors/total_entries * 100:.2f}% ({reasoning_errors}/{total_entries})\")\n",
    "    print(f\"Topology Errors: {topology_errors/total_entries * 100:.2f}% ({topology_errors}/{total_entries})\")\n",
    "    print(f\"Direction Errors: {direction_errors/total_entries * 100:.2f}% ({direction_errors}/{total_entries})\\n\\n\\n\")\n",
    "\n",
    "    false_negatives = df[(df['is_correct'] == False) & (df['is_correct_human'] == True)]\n",
    "    if not false_negatives.empty:\n",
    "        print(\"False Negative Rows:\")\n",
    "        print(false_negatives)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_accuracy_per_category(df):\n",
    "    category_groups = df.groupby('category')\n",
    "\n",
    "    for category, group in category_groups:\n",
    "        total = len(group)\n",
    "        correct = group['is_correct_human'].sum()\n",
    "\n",
    "        accuracy = (correct / total) * 100\n",
    "\n",
    "        print(f\"Category: {category}\\n  - Total: {total}\\n  - Correct: {correct}\\n  - Accuracy: {accuracy:.2f}%\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "# Silences every warning category from this point on in the kernel session.\n",
    "# NOTE(review): this also hides deprecation and runtime warnings raised by the cells below - confirm that is intended.\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Experiment 6 statistics\n",
      "------\n",
      "Total: 395\n",
      "Auto global accuracy: 42.53\n",
      "Human global accuracy: 36.46\n",
      "False Positives: 6.08% (24/395)\n",
      "False Negatives: 0.00% (0/395)\n",
      "Math Errors: 7.09% (28/395)\n",
      "Math ONLY Errors: 4.56% (18/395)\n",
      "Formatting Errors: 1.27% (5/395)\n",
      "Reasoning Errors: 58.48% (231/395)\n",
      "Topology Errors: 36.20% (143/395)\n",
      "Direction Errors: 4.05% (16/395)\n",
      "\n",
      "\n",
      "\n",
      "Global Accuracy: 36.46% (144 out of 395)\n",
      "Template 5/5: 17.72% (14 out of 79)\n",
      "Template 4/5: 27.85% (22 out of 79)\n",
      "Template 3/5: 35.44% (28 out of 79)\n",
      "Template 2/5: 43.04% (34 out of 79)\n",
      "Template 1/5: 58.23% (46 out of 79)\n",
      "\n",
      "Only questions with netlists\n",
      "------\n",
      "Global Accuracy: 36.46% (144 out of 395)\n",
      "Template 5/5: 17.72% (14 out of 79)\n",
      "Template 4/5: 27.85% (22 out of 79)\n",
      "Template 3/5: 35.44% (28 out of 79)\n",
      "Template 2/5: 43.04% (34 out of 79)\n",
      "Template 1/5: 58.23% (46 out of 79)\n",
      "\n",
      "Only questions witout netlists\n",
      "------\n",
      "Global Accuracy: nan% (0 out of 0)\n",
      "Template 5/5: 0.00% (0 out of 0)\n",
      "Template 4/5: 0.00% (0 out of 0)\n",
      "Template 3/5: 0.00% (0 out of 0)\n",
      "Template 2/5: 0.00% (0 out of 0)\n",
      "Template 1/5: 0.00% (0 out of 0)\n",
      "Category: analog\n",
      "  - Total: 85\n",
      "  - Correct: 26\n",
      "  - Accuracy: 30.59%\n",
      "\n",
      "Category: basic\n",
      "  - Total: 160\n",
      "  - Correct: 79\n",
      "  - Accuracy: 49.38%\n",
      "\n",
      "Category: power\n",
      "  - Total: 90\n",
      "  - Correct: 27\n",
      "  - Accuracy: 30.00%\n",
      "\n",
      "Category: rf\n",
      "  - Total: 60\n",
      "  - Correct: 12\n",
      "  - Accuracy: 20.00%\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "++++++++++++++++++++++++++++++++++++++++++++++++++\n",
      "\n",
      "\n",
      "Experiment 7 statistics\n",
      "------\n",
      "Total: 115\n",
      "Auto global accuracy: 67.83\n",
      "Human global accuracy: 63.48\n",
      "False Positives: 4.35% (5/115)\n",
      "False Negatives: 0.00% (0/115)\n",
      "Math Errors: 1.74% (2/115)\n",
      "Math ONLY Errors: 1.74% (2/115)\n",
      "Formatting Errors: 0.00% (0/115)\n",
      "Reasoning Errors: 34.78% (40/115)\n",
      "Topology Errors: 16.52% (19/115)\n",
      "Direction Errors: 4.35% (5/115)\n",
      "\n",
      "\n",
      "\n",
      "Global Accuracy: 63.48% (73 out of 115)\n",
      "Template 5/5: 52.17% (12 out of 23)\n",
      "Template 4/5: 65.22% (15 out of 23)\n",
      "Template 3/5: 65.22% (15 out of 23)\n",
      "Template 2/5: 65.22% (15 out of 23)\n",
      "Template 1/5: 69.57% (16 out of 23)\n",
      "\n",
      "Only questions with netlists\n",
      "------\n",
      "Global Accuracy: nan% (0 out of 0)\n",
      "Template 5/5: 0.00% (0 out of 0)\n",
      "Template 4/5: 0.00% (0 out of 0)\n",
      "Template 3/5: 0.00% (0 out of 0)\n",
      "Template 2/5: 0.00% (0 out of 0)\n",
      "Template 1/5: 0.00% (0 out of 0)\n",
      "\n",
      "Only questions witout netlists\n",
      "------\n",
      "Global Accuracy: 63.48% (73 out of 115)\n",
      "Template 5/5: 52.17% (12 out of 23)\n",
      "Template 4/5: 65.22% (15 out of 23)\n",
      "Template 3/5: 65.22% (15 out of 23)\n",
      "Template 2/5: 65.22% (15 out of 23)\n",
      "Template 1/5: 69.57% (16 out of 23)\n",
      "Category: analog\n",
      "  - Total: 15\n",
      "  - Correct: 5\n",
      "  - Accuracy: 33.33%\n",
      "\n",
      "Category: basic\n",
      "  - Total: 30\n",
      "  - Correct: 24\n",
      "  - Accuracy: 80.00%\n",
      "\n",
      "Category: power\n",
      "  - Total: 5\n",
      "  - Correct: 5\n",
      "  - Accuracy: 100.00%\n",
      "\n",
      "Category: rf\n",
      "  - Total: 65\n",
      "  - Correct: 39\n",
      "  - Accuracy: 60.00%\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "++++++++++++++++++++++++++++++++++++++++++++++++++\n",
      "\n",
      "\n",
      "Experiment 8 statistics\n",
      "------\n",
      "Total: 395\n",
      "Auto global accuracy: 37.22\n",
      "Human global accuracy: 31.90\n",
      "False Positives: 5.32% (21/395)\n",
      "False Negatives: 0.00% (0/395)\n",
      "Math Errors: 8.35% (33/395)\n",
      "Math ONLY Errors: 6.08% (24/395)\n",
      "Formatting Errors: 0.51% (2/395)\n",
      "Reasoning Errors: 61.77% (244/395)\n",
      "Topology Errors: 39.24% (155/395)\n",
      "Direction Errors: 3.54% (14/395)\n",
      "\n",
      "\n",
      "\n",
      "Global Accuracy: 31.90% (126 out of 395)\n",
      "Template 5/5: 18.99% (15 out of 79)\n",
      "Template 4/5: 27.85% (22 out of 79)\n",
      "Template 3/5: 31.65% (25 out of 79)\n",
      "Template 2/5: 34.18% (27 out of 79)\n",
      "Template 1/5: 46.84% (37 out of 79)\n",
      "\n",
      "Only questions with netlists\n",
      "------\n",
      "Global Accuracy: 31.90% (126 out of 395)\n",
      "Template 5/5: 18.99% (15 out of 79)\n",
      "Template 4/5: 27.85% (22 out of 79)\n",
      "Template 3/5: 31.65% (25 out of 79)\n",
      "Template 2/5: 34.18% (27 out of 79)\n",
      "Template 1/5: 46.84% (37 out of 79)\n",
      "\n",
      "Only questions witout netlists\n",
      "------\n",
      "Global Accuracy: nan% (0 out of 0)\n",
      "Template 5/5: 0.00% (0 out of 0)\n",
      "Template 4/5: 0.00% (0 out of 0)\n",
      "Template 3/5: 0.00% (0 out of 0)\n",
      "Template 2/5: 0.00% (0 out of 0)\n",
      "Template 1/5: 0.00% (0 out of 0)\n",
      "Category: analog\n",
      "  - Total: 85\n",
      "  - Correct: 24\n",
      "  - Accuracy: 28.24%\n",
      "\n",
      "Category: basic\n",
      "  - Total: 160\n",
      "  - Correct: 72\n",
      "  - Accuracy: 45.00%\n",
      "\n",
      "Category: power\n",
      "  - Total: 90\n",
      "  - Correct: 24\n",
      "  - Accuracy: 26.67%\n",
      "\n",
      "Category: rf\n",
      "  - Total: 60\n",
      "  - Correct: 6\n",
      "  - Accuracy: 10.00%\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "++++++++++++++++++++++++++++++++++++++++++++++++++\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# For each listed experiment: merge the automatic results with the human annotations,\n",
    "# then print the error statistics, the human-graded accuracy, and the per-category accuracy.\n",
    "for exp_num in (6, 7, 8):\n",
    "    exp = load_and_merge(exp_num)\n",
    "    print(f\"Experiment {exp_num} statistics\\n------\")\n",
    "    error_stats(exp)\n",
    "    print_statistics_human(exp)  # plot defaults to False, so no chart is drawn here\n",
    "    calculate_accuracy_per_category(exp)\n",
    "    # visual separator between experiments\n",
    "    print(\"\\n\\n\\n\\n\")\n",
    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++++++\\n\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
