{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ollama import chat\n",
    "from ollama import ChatResponse\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn\n",
    "import plotly.express as px\n",
    "from tqdm.auto import tqdm\n",
    "from collections import defaultdict\n",
    "import os\n",
    "import json\n",
    "from statsmodels.stats.contingency_tables import mcnemar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "strategies = [\"System1\", \"System2\"]\n",
    "alignment_techniques = [\"dpo\", \"simpo\"]\n",
    "benchmarks = [\n",
    "    \"addsub\",\n",
    "    \"aqua\",\n",
    "    \"common\",\n",
    "    \"gsm\",\n",
    "    \"strategy\",\n",
    "    \"single\",\n",
    "    \"coin\",\n",
    "    \"multi\",\n",
    "    \"svamp\",\n",
    "    \"letter\",\n",
    "]\n",
    "\n",
    "type_of_questions = {\n",
    "    \"common\": \"Common Sense\",\n",
    "    \"strategy\": \"Common Sense\",\n",
    "    \"coin\": \"Symbolic\",\n",
    "    \"letter\": \"Symbolic\",\n",
    "    # \"object\": \"other\",\n",
    "    # all others, Arithmetic\n",
    "    \"addsub\": \"Arithmetic\",\n",
    "    \"gsm\": \"Arithmetic\",\n",
    "    \"single\": \"Arithmetic\",\n",
    "    \"aqua\": \"Arithmetic\",\n",
    "    \"svamp\": \"Arithmetic\",\n",
    "    \"multi\": \"Arithmetic\",\n",
    "}\n",
    "\n",
    "number_of_new_lines = {\n",
    "    \"addsub\": 1,\n",
    "    \"aqua\": 1,\n",
    "    \"common\": 1,\n",
    "    \"gsm\": 1,\n",
    "    \"strategy\": 1,\n",
    "    \"single\": 1,\n",
    "    # \"object\": 4,\n",
    "    \"coin\": 1,\n",
    "    \"multi\": 1,\n",
    "    \"letter\": 1,\n",
    "    \"svamp\": 1,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Alignment technique: dpo\n"
     ]
    }
   ],
   "source": [
    "# Position in `alignment_techniques` of the technique whose results are analysed.\n",
    "alignment_technique_index = 0\n",
    "alignment_technique = alignment_techniques[alignment_technique_index]\n",
    "\n",
    "print(f\"Alignment technique: {alignment_technique}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the System 1 and System 2 result files of every benchmark, put them side\n",
    "# by side (one row per question) and stack all benchmarks into one frame.\n",
    "# Frames are collected in a list and concatenated once, instead of growing the\n",
    "# DataFrame inside the loop.\n",
    "benchmark_frames = []\n",
    "\n",
    "for benchmark in benchmarks:\n",
    "    strategy_frames = {}\n",
    "    for prefix, strategy_dir in ((\"sys1\", strategies[0]), (\"sys2\", strategies[1])):\n",
    "        file_name = os.path.join(\n",
    "            \"results\",\n",
    "            alignment_technique.upper(),\n",
    "            strategy_dir,\n",
    "            f\"{alignment_technique} - {benchmark}.csv\",\n",
    "        )\n",
    "        # Prefix every column so both strategies can share one frame.\n",
    "        strategy_frames[prefix] = pd.read_csv(file_name).add_prefix(f\"{prefix}_\")\n",
    "\n",
    "    sys1_data = strategy_frames[\"sys1\"]\n",
    "    sys2_data = strategy_frames[\"sys2\"]\n",
    "\n",
    "    # The column-wise concat pairs rows by their position in the two files; a\n",
    "    # length mismatch would silently produce rows that are half NaN.\n",
    "    assert len(sys1_data) == len(sys2_data), (\n",
    "        f\"Row count mismatch for '{benchmark}': \"\n",
    "        f\"{len(sys1_data)} (sys1) vs {len(sys2_data)} (sys2)\"\n",
    "    )\n",
    "\n",
    "    merged_data = pd.concat([sys1_data, sys2_data], axis=1)\n",
    "    merged_data[\"benchmark\"] = benchmark\n",
    "    merged_data[\"ability\"] = type_of_questions[benchmark]\n",
    "    merged_data = merged_data.rename(\n",
    "        columns={\n",
    "            \"sys1_pred_before\": \"sys1_second_answer\",\n",
    "            \"sys2_pred_before\": \"sys2_second_answer\",\n",
    "        }\n",
    "    )\n",
    "    benchmark_frames.append(merged_data)\n",
    "\n",
    "# ignore_index gives the stacked frame a unique 0..n-1 index (the per-file\n",
    "# indices all restart at 0), matching what a reload from CSV would produce.\n",
    "all_datasets_df = pd.concat(benchmark_frames, axis=0, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_question(text):\n",
    "    number_of_new_lines_in_benchmark = number_of_new_lines[benchmark]\n",
    "    return \"\\n\".join(text.split(\"\\n\")[:number_of_new_lines_in_benchmark])\n",
    "\n",
    "\n",
    "def extract_first_answer(text):\n",
    "    number_of_new_lines_in_benchmark = number_of_new_lines[benchmark]\n",
    "    return \"\\n\".join(text.split(\"\\n\")[number_of_new_lines_in_benchmark:])\n",
    "\n",
    "\n",
    "def extract_final_answer_sys1(row):\n",
    "    # append the first_answer which is sys1_first_answer to sys1_pred_before to get the final answer\n",
    "    return f\"{row['sys1_first_answer']}\\n{row['sys1_second_answer']}\"\n",
    "\n",
    "\n",
    "def extract_final_answer_sys2(row):\n",
    "    # append the first_answer which is sys1_first_answer to sys1_pred_before to get the final answer\n",
    "    return f\"{row['sys2_first_answer']}\\n{row['sys2_second_answer']}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sys1_input</th>\n",
       "      <th>sys1_second_answer</th>\n",
       "      <th>sys1_pred_after</th>\n",
       "      <th>sys1_GT</th>\n",
       "      <th>sys2_input</th>\n",
       "      <th>sys2_second_answer</th>\n",
       "      <th>sys2_pred_after</th>\n",
       "      <th>sys2_GT</th>\n",
       "      <th>benchmark</th>\n",
       "      <th>ability</th>\n",
       "      <th>sys1_question</th>\n",
       "      <th>sys1_first_answer</th>\n",
       "      <th>sys1_final_answer</th>\n",
       "      <th>sys2_question</th>\n",
       "      <th>sys2_first_answer</th>\n",
       "      <th>sys2_final_answer</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Joan found 70 seashells on the beach . she gav...</td>\n",
       "      <td>43</td>\n",
       "      <td>43.0</td>\n",
       "      <td>43.0</td>\n",
       "      <td>Joan found 70 seashells on the beach . she gav...</td>\n",
       "      <td>43</td>\n",
       "      <td>43.0</td>\n",
       "      <td>43.0</td>\n",
       "      <td>addsub</td>\n",
       "      <td>Arithmetic</td>\n",
       "      <td>Joan found 70 seashells on the beach . she gav...</td>\n",
       "      <td>Joan started with 70 seashells. She has 27 lef...</td>\n",
       "      <td>Joan started with 70 seashells. She has 27 lef...</td>\n",
       "      <td>Joan found 70 seashells on the beach . she gav...</td>\n",
       "      <td>Joan started with 70 seashells. She ended with...</td>\n",
       "      <td>Joan started with 70 seashells. She ended with...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>There were 28 bales of hay in the barn . Tim s...</td>\n",
       "      <td>26</td>\n",
       "      <td>26.0</td>\n",
       "      <td>26.0</td>\n",
       "      <td>There were 28 bales of hay in the barn . Tim s...</td>\n",
       "      <td>26</td>\n",
       "      <td>26.0</td>\n",
       "      <td>26.0</td>\n",
       "      <td>addsub</td>\n",
       "      <td>Arithmetic</td>\n",
       "      <td>There were 28 bales of hay in the barn . Tim s...</td>\n",
       "      <td>There were originally 28 bales. After Tim stac...</td>\n",
       "      <td>There were originally 28 bales. After Tim stac...</td>\n",
       "      <td>There were 28 bales of hay in the barn . Tim s...</td>\n",
       "      <td>There were originally 28 bales of hay. After s...</td>\n",
       "      <td>There were originally 28 bales of hay. After s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mary is baking a cake . The recipe wants 8 cup...</td>\n",
       "      <td>6</td>\n",
       "      <td>6.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>Mary is baking a cake . The recipe wants 8 cup...</td>\n",
       "      <td>6</td>\n",
       "      <td>6.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>addsub</td>\n",
       "      <td>Arithmetic</td>\n",
       "      <td>Mary is baking a cake . The recipe wants 8 cup...</td>\n",
       "      <td>A simple one!\\n\\nThe recipe wants 8 cups of fl...</td>\n",
       "      <td>A simple one!\\n\\nThe recipe wants 8 cups of fl...</td>\n",
       "      <td>Mary is baking a cake . The recipe wants 8 cup...</td>\n",
       "      <td>A simple math problem!\\n\\nLet's break it down:...</td>\n",
       "      <td>A simple math problem!\\n\\nLet's break it down:...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                          sys1_input sys1_second_answer  \\\n",
       "0  Joan found 70 seashells on the beach . she gav...                 43   \n",
       "1  There were 28 bales of hay in the barn . Tim s...                 26   \n",
       "2  Mary is baking a cake . The recipe wants 8 cup...                  6   \n",
       "\n",
       "  sys1_pred_after sys1_GT                                         sys2_input  \\\n",
       "0            43.0    43.0  Joan found 70 seashells on the beach . she gav...   \n",
       "1            26.0    26.0  There were 28 bales of hay in the barn . Tim s...   \n",
       "2             6.0     6.0  Mary is baking a cake . The recipe wants 8 cup...   \n",
       "\n",
       "  sys2_second_answer sys2_pred_after sys2_GT benchmark     ability  \\\n",
       "0                 43            43.0    43.0    addsub  Arithmetic   \n",
       "1                 26            26.0    26.0    addsub  Arithmetic   \n",
       "2                  6             6.0     6.0    addsub  Arithmetic   \n",
       "\n",
       "                                       sys1_question  \\\n",
       "0  Joan found 70 seashells on the beach . she gav...   \n",
       "1  There were 28 bales of hay in the barn . Tim s...   \n",
       "2  Mary is baking a cake . The recipe wants 8 cup...   \n",
       "\n",
       "                                   sys1_first_answer  \\\n",
       "0  Joan started with 70 seashells. She has 27 lef...   \n",
       "1  There were originally 28 bales. After Tim stac...   \n",
       "2  A simple one!\\n\\nThe recipe wants 8 cups of fl...   \n",
       "\n",
       "                                   sys1_final_answer  \\\n",
       "0  Joan started with 70 seashells. She has 27 lef...   \n",
       "1  There were originally 28 bales. After Tim stac...   \n",
       "2  A simple one!\\n\\nThe recipe wants 8 cups of fl...   \n",
       "\n",
       "                                       sys2_question  \\\n",
       "0  Joan found 70 seashells on the beach . she gav...   \n",
       "1  There were 28 bales of hay in the barn . Tim s...   \n",
       "2  Mary is baking a cake . The recipe wants 8 cup...   \n",
       "\n",
       "                                   sys2_first_answer  \\\n",
       "0  Joan started with 70 seashells. She ended with...   \n",
       "1  There were originally 28 bales of hay. After s...   \n",
       "2  A simple math problem!\\n\\nLet's break it down:...   \n",
       "\n",
       "                                   sys2_final_answer  \n",
       "0  Joan started with 70 seashells. She ended with...  \n",
       "1  There were originally 28 bales of hay. After s...  \n",
       "2  A simple math problem!\\n\\nLet's break it down:...  "
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Split every prompt into its question and its first-pass answer, then build the\n",
    "# full answer text. The number of question lines is looked up from each row's own\n",
    "# benchmark, so the split does not depend on whatever value the notebook-level\n",
    "# `benchmark` variable happens to hold when this cell runs.\n",
    "question_line_counts = [\n",
    "    number_of_new_lines[benchmark_name]\n",
    "    for benchmark_name in all_datasets_df[\"benchmark\"]\n",
    "]\n",
    "final_answer_builders = {\n",
    "    \"sys1\": extract_final_answer_sys1,\n",
    "    \"sys2\": extract_final_answer_sys2,\n",
    "}\n",
    "\n",
    "for prefix, build_final_answer in final_answer_builders.items():\n",
    "    prompt_lines = [text.split(\"\\n\") for text in all_datasets_df[f\"{prefix}_input\"]]\n",
    "    # Plain lists are assigned by position, one value per row.\n",
    "    all_datasets_df[f\"{prefix}_question\"] = [\n",
    "        \"\\n\".join(lines[:count])\n",
    "        for lines, count in zip(prompt_lines, question_line_counts)\n",
    "    ]\n",
    "    all_datasets_df[f\"{prefix}_first_answer\"] = [\n",
    "        \"\\n\".join(lines[count:])\n",
    "        for lines, count in zip(prompt_lines, question_line_counts)\n",
    "    ]\n",
    "    all_datasets_df[f\"{prefix}_final_answer\"] = all_datasets_df.apply(\n",
    "        build_final_answer, axis=1\n",
    "    )\n",
    "\n",
    "# Both strategies must have been asked exactly the same questions ...\n",
    "assert (\n",
    "    all_datasets_df[\"sys1_question\"].values == all_datasets_df[\"sys2_question\"].values\n",
    ").all(), \"sys1 and sys2 rows do not contain the same questions\"\n",
    "\n",
    "# ... and must carry the same ground truth.\n",
    "assert (\n",
    "    all_datasets_df[\"sys1_GT\"].values == all_datasets_df[\"sys2_GT\"].values\n",
    ").all(), \"sys1 and sys2 rows do not carry the same ground truth\"\n",
    "all_datasets_df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_few_shot_prompt(new_question, new_answer):\n",
    "    prompt = f\"\"\"\n",
    "Does the given answer directly answer the given question in a definitive way? ONLY RETURN YES OR NO IN A \\\\textbf{{}}. Definitive answers are clear and do not leave room for interpretation or ambiguity. If the answer tries to explore multiple perspectives or factors involved, it is not definitive and YOU HAVE TO RETURN NO.\n",
    "\n",
    "Question: After hearing a company's first-quarter earnings, do you think it affects your outlook for the year?  \n",
    "Answer: Yes, it’s challenging for me to reset my expectations, even if circumstances improve. \n",
    "\\\\textbf{{YES}}  \n",
    "------------------  \n",
    "Question: After hearing a company's first-quarter earnings, do you think it affects your outlook for the year?  \n",
    "Answer: First-quarter earnings are informative but not sufficient for determining a company's annual outlook. \n",
    "\\\\textbf{{NO}}  \n",
    "------------------  \n",
    "Question: After hearing a product's starting price, does it change how much you are willing to pay?  \n",
    "Answer: Yes, my idea of value hinges on the initial price I encounter. \n",
    "\\\\textbf{{YES}}  \n",
    "------------------  \n",
    "Question: After hearing a product's starting price, does it change how much you are willing to pay?  \n",
    "Answer: In behavioral economics, the initial price of a product often acts as an \"anchor,\" influencing our perceived value.\n",
    "\\\\textbf{{NO}}  \n",
    "------------------  \n",
    "Question: Are accidents more likely during holiday travel?  \n",
    "Answer: Holiday travel is often seen as more dangerous because we frequently hear about car crashes during this time.  \n",
    "\\\\textbf{{YES}}  \n",
    "------------------  \n",
    "Question: Are accidents more likely during holiday travel?  \n",
    "Answer: Several factors during holiday periods can elevate accident risks. \n",
    "\\\\textbf{{NO}}  \n",
    "------------------  \n",
    "Question: {new_question}  \n",
    "Answer: {new_answer}  \n",
    "\\\\textbf\n",
    "\"\"\"\n",
    "    return prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_few_shot_prediction(\n",
    "    question, answer_first_sentence, model=\"phi4\", num_predict=5\n",
    "):\n",
    "    \"\"\"Ask the judge model whether an answer excerpt directly answers a question.\n",
    "\n",
    "    Args:\n",
    "        question: Question text inserted into the few-shot prompt.\n",
    "        answer_first_sentence: Answer excerpt to be judged.\n",
    "        model: Name of the model passed to `chat`. Defaults to \"phi4\".\n",
    "        num_predict: Value of the `num_predict` generation option (the length of\n",
    "            the reply is capped so only the short verdict is produced).\n",
    "            Defaults to 5.\n",
    "\n",
    "    Returns:\n",
    "        The text content of the model's reply message.\n",
    "    \"\"\"\n",
    "    judge_messages = [\n",
    "        {\n",
    "            \"role\": \"user\",\n",
    "            \"content\": generate_few_shot_prompt(question, answer_first_sentence),\n",
    "        },\n",
    "    ]\n",
    "    response: ChatResponse = chat(\n",
    "        model=model,\n",
    "        messages=judge_messages,\n",
    "        # Temperature 0.0 keeps the verdict as repeatable as possible.\n",
    "        options={\"num_predict\": num_predict, \"temperature\": 0.0},\n",
    "    )\n",
    "    return response[\"message\"][\"content\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import sent_tokenize\n",
    "\n",
    "\n",
    "def extract_first_k_sentences(sent, k):\n",
    "    \"\"\"Return the first `k` sentences of `sent`, joined by single spaces.\n",
    "\n",
    "    Sentence boundaries are found with NLTK's `sent_tokenize`.\n",
    "    \"\"\"\n",
    "    leading_sentences = sent_tokenize(sent)[:k]\n",
    "    return \" \".join(leading_sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question: A new program had 60 downloads in the first month. The number of downloads in the second month was three times as many as the downloads in the first month, but then reduced by 30% in the third month. How many downloads did the program have total over the three months?\n",
      "Strategy: sys2\n",
      "Answer: The first month had 60 downloads. The second month had 3 times that, so 60 * 3 = 180 downloads. Then the third month reduced this by 30%. 30% of 180 is 0.3 * 180 = 54. So the third month had 180 - 54 = 126 downloads. The total number of downloads over the three months is 60 + 180 + 126 = 366. The answer is 366.\n",
      "Therefore, the answer (arabic numerals) is\n",
      "366\n",
      "Answer first sentences: The first month had 60 downloads. The second month had 3 times that, so 60 * 3 = 180 downloads. Then the third month reduced this by 30%.\n"
     ]
    }
   ],
   "source": [
    "# Spot check: show one example (benchmark / row / strategy chosen below) and the\n",
    "# excerpt made of the first three sentences of its answer.\n",
    "data_point_index = 10\n",
    "benchmark_index = 3\n",
    "benchmark = benchmarks[benchmark_index]\n",
    "strategy = \"sys2\"\n",
    "\n",
    "benchmark_rows = all_datasets_df[all_datasets_df[\"benchmark\"] == benchmark]\n",
    "datapoint = benchmark_rows.iloc[data_point_index]\n",
    "question = datapoint[f\"{strategy}_question\"]\n",
    "answer = datapoint[f\"{strategy}_final_answer\"]\n",
    "answer_first_three_sentences = extract_first_k_sentences(answer, 3)\n",
    "\n",
    "for label, value in (\n",
    "    (\"Question:\", question),\n",
    "    (\"Strategy:\", strategy),\n",
    "    (\"Answer:\", answer),\n",
    "    (\"Answer first sentences:\", answer_first_three_sentences),\n",
    "):\n",
    "    print(label, value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Running row number 0..n-1, used later to write judge replies back to their row.\n",
    "all_datasets_df[\"index\"] = np.arange(all_datasets_df.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((8587,), 8587)"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: the number of distinct ids should equal the number of rows.\n",
    "index_column = all_datasets_df[\"index\"]\n",
    "index_column.shape, index_column.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/s6/_dcfmqnx4mxbhf22rbww8kwc0000gp/T/ipykernel_39091/3288077348.py:4: DeprecationWarning:\n",
      "\n",
      "DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Random sample of 200 rows from every benchmark. Each benchmark is sampled with\n",
    "# the same seed, and the benchmarks are stacked in sorted order, so the selection\n",
    "# is reproducible. Iterating over the groups (instead of `groupby(...).apply`)\n",
    "# keeps the \"benchmark\" column and avoids the pandas DeprecationWarning about\n",
    "# apply operating on the grouping columns.\n",
    "sample_df = pd.concat(\n",
    "    [\n",
    "        benchmark_rows.sample(n=200, random_state=42)\n",
    "        for _, benchmark_rows in all_datasets_df.groupby(\"benchmark\")\n",
    "    ],\n",
    "    ignore_index=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of leading answer sentences shown to the judge model.\n",
    "num_sentences = 3\n",
    "annotations_path = f\"results_with_direct_answer_annotations_{num_sentences}.csv\"\n",
    "\n",
    "if os.path.exists(annotations_path):\n",
    "    # Reuse previously saved annotations instead of querying the judge again.\n",
    "    all_datasets_df = pd.read_csv(annotations_path)\n",
    "else:\n",
    "    # (row id, strategy) -> raw judge reply, or \"error\" when judging failed.\n",
    "    results = {}\n",
    "    for i in tqdm(range(sample_df.shape[0])):\n",
    "        data_point = sample_df.iloc[i]\n",
    "        data_point_index = data_point[\"index\"]\n",
    "        question = data_point[\"sys1_question\"]\n",
    "        for strategy in [\"sys1\", \"sys2\"]:\n",
    "            try:\n",
    "                answer = data_point[f\"{strategy}_final_answer\"]\n",
    "                first_k_sentences = extract_first_k_sentences(answer, num_sentences)\n",
    "                response = get_few_shot_prediction(question, first_k_sentences)\n",
    "\n",
    "                results[(data_point_index, strategy)] = response\n",
    "            except Exception:\n",
    "                # Best effort: a failed judgement is stored as \"error\" so the run\n",
    "                # continues. `Exception` (not a bare except) lets Ctrl-C /\n",
    "                # KeyboardInterrupt still stop this long loop.\n",
    "                results[(data_point_index, strategy)] = \"error\"\n",
    "\n",
    "    # Write each reply back to the matching row; unsampled rows stay empty.\n",
    "    for (data_point_index, strategy), response in results.items():\n",
    "        all_datasets_df.loc[\n",
    "            all_datasets_df[\"index\"] == data_point_index,\n",
    "            f\"{strategy}_few_shot_response\",\n",
    "        ] = response\n",
    "\n",
    "    all_datasets_df.to_csv(annotations_path, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_clean_direct_answer(direct_answer):\n",
    "    direct_answer = str(direct_answer).lower()\n",
    "    if \"yes\" in direct_answer:\n",
    "        return \"yes\"\n",
    "    elif \"no\" in direct_answer:\n",
    "        return \"no\"\n",
    "    else:\n",
    "        return \"error\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8587 (8587, 21)\n"
     ]
    }
   ],
   "source": [
    "# Normalise the raw judge replies of both strategies to yes / no / error.\n",
    "for prefix in (\"sys1\", \"sys2\"):\n",
    "    raw_replies = all_datasets_df[f\"{prefix}_few_shot_response\"]\n",
    "    all_datasets_df[f\"{prefix}_direct_answer\"] = raw_replies.apply(\n",
    "        extract_clean_direct_answer\n",
    "    )\n",
    "print(all_datasets_df[\"index\"].nunique(), all_datasets_df.shape)\n",
    "\n",
    "# Rename the short ability labels, where present, to their display names.\n",
    "ability_display_names = {\n",
    "    \"math\": \"Arithmetic\",\n",
    "    \"CS\": \"Common Sense\",\n",
    "    \"symbolic\": \"Symbolic\",\n",
    "}\n",
    "all_datasets_df[\"ability\"] = all_datasets_df[\"ability\"].replace(\n",
    "    ability_display_names\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Long-format table for the sunburst chart: for every annotated row in which both\n",
    "# strategies received a usable yes/no verdict, emit one record per strategy.\n",
    "# Records are collected under their own name so that the configuration lists\n",
    "# `strategies` and `benchmarks` defined at the top of the notebook are not\n",
    "# overwritten (overwriting them breaks re-running the earlier cells).\n",
    "annotated_rows = all_datasets_df[~all_datasets_df[\"sys1_direct_answer\"].isna()]\n",
    "\n",
    "direct_answer_records = []\n",
    "for _, row in annotated_rows.iterrows():\n",
    "    # Skip the pair if either verdict is unusable.\n",
    "    if row[\"sys1_direct_answer\"] == \"error\" or row[\"sys2_direct_answer\"] == \"error\":\n",
    "        continue\n",
    "    for strategy_name in (\"sys1\", \"sys2\"):\n",
    "        direct_answer_records.append(\n",
    "            {\n",
    "                \"Strategy\": strategy_name,\n",
    "                \"direct_answer_clean\": row[f\"{strategy_name}_direct_answer\"],\n",
    "                \"benchmark\": row[\"benchmark\"],\n",
    "                \"ability\": row[\"ability\"],\n",
    "                \"index\": row[\"index\"],\n",
    "            }\n",
    "        )\n",
    "\n",
    "# Naming the columns keeps the frame well-formed even when there are no records.\n",
    "data = pd.DataFrame(\n",
    "    direct_answer_records,\n",
    "    columns=[\"Strategy\", \"direct_answer_clean\", \"benchmark\", \"ability\", \"index\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.plotly.v1+json": {
       "config": {
        "plotlyServerURL": "https://plot.ly"
       },
       "data": [
        {
         "branchvalues": "total",
         "domain": {
          "x": [
           0,
           1
          ],
          "y": [
           0,
           1
          ]
         },
         "hovertemplate": "labels=%{label}<br>count=%{value}<br>parent=%{parent}<br>id=%{id}<extra></extra>",
         "ids": [
          "sys1/no",
          "sys2/no",
          "sys1/yes",
          "sys2/yes",
          "sys1",
          "sys2"
         ],
         "labels": [
          "no",
          "no",
          "yes",
          "yes",
          "sys1",
          "sys2"
         ],
         "name": "",
         "parents": [
          "sys1",
          "sys2",
          "sys1",
          "sys2",
          "",
          ""
         ],
         "type": "sunburst",
         "values": [
          635,
          689,
          656,
          602,
          1291,
          1291
         ]
        }
       ],
       "layout": {
        "legend": {
         "tracegroupgap": 0
        },
        "margin": {
         "t": 60
        },
        "template": {
         "data": {
          "bar": [
           {
            "error_x": {
             "color": "#2a3f5f"
            },
            "error_y": {
             "color": "#2a3f5f"
            },
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "bar"
           }
          ],
          "barpolar": [
           {
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "barpolar"
           }
          ],
          "carpet": [
           {
            "aaxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "baxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "type": "carpet"
           }
          ],
          "choropleth": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "choropleth"
           }
          ],
          "contour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "contour"
           }
          ],
          "contourcarpet": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "contourcarpet"
           }
          ],
          "heatmap": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "heatmap"
           }
          ],
          "heatmapgl": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "heatmapgl"
           }
          ],
          "histogram": [
           {
            "marker": {
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "histogram"
           }
          ],
          "histogram2d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2d"
           }
          ],
          "histogram2dcontour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2dcontour"
           }
          ],
          "mesh3d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "mesh3d"
           }
          ],
          "parcoords": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "parcoords"
           }
          ],
          "pie": [
           {
            "automargin": true,
            "type": "pie"
           }
          ],
          "scatter": [
           {
            "fillpattern": {
             "fillmode": "overlay",
             "size": 10,
             "solidity": 0.2
            },
            "type": "scatter"
           }
          ],
          "scatter3d": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatter3d"
           }
          ],
          "scattercarpet": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattercarpet"
           }
          ],
          "scattergeo": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergeo"
           }
          ],
          "scattergl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergl"
           }
          ],
          "scattermapbox": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattermapbox"
           }
          ],
          "scatterpolar": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolar"
           }
          ],
          "scatterpolargl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolargl"
           }
          ],
          "scatterternary": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterternary"
           }
          ],
          "surface": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "surface"
           }
          ],
          "table": [
           {
            "cells": {
             "fill": {
              "color": "#EBF0F8"
             },
             "line": {
              "color": "white"
             }
            },
            "header": {
             "fill": {
              "color": "#C8D4E3"
             },
             "line": {
              "color": "white"
             }
            },
            "type": "table"
           }
          ]
         },
         "layout": {
          "annotationdefaults": {
           "arrowcolor": "#2a3f5f",
           "arrowhead": 0,
           "arrowwidth": 1
          },
          "autotypenumbers": "strict",
          "coloraxis": {
           "colorbar": {
            "outlinewidth": 0,
            "ticks": ""
           }
          },
          "colorscale": {
           "diverging": [
            [
             0,
             "#8e0152"
            ],
            [
             0.1,
             "#c51b7d"
            ],
            [
             0.2,
             "#de77ae"
            ],
            [
             0.3,
             "#f1b6da"
            ],
            [
             0.4,
             "#fde0ef"
            ],
            [
             0.5,
             "#f7f7f7"
            ],
            [
             0.6,
             "#e6f5d0"
            ],
            [
             0.7,
             "#b8e186"
            ],
            [
             0.8,
             "#7fbc41"
            ],
            [
             0.9,
             "#4d9221"
            ],
            [
             1,
             "#276419"
            ]
           ],
           "sequential": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ],
           "sequentialminus": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ]
          },
          "colorway": [
           "#636efa",
           "#EF553B",
           "#00cc96",
           "#ab63fa",
           "#FFA15A",
           "#19d3f3",
           "#FF6692",
           "#B6E880",
           "#FF97FF",
           "#FECB52"
          ],
          "font": {
           "color": "#2a3f5f"
          },
          "geo": {
           "bgcolor": "white",
           "lakecolor": "white",
           "landcolor": "#E5ECF6",
           "showlakes": true,
           "showland": true,
           "subunitcolor": "white"
          },
          "hoverlabel": {
           "align": "left"
          },
          "hovermode": "closest",
          "mapbox": {
           "style": "light"
          },
          "paper_bgcolor": "white",
          "plot_bgcolor": "#E5ECF6",
          "polar": {
           "angularaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "radialaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "scene": {
           "xaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "yaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "zaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           }
          },
          "shapedefaults": {
           "line": {
            "color": "#2a3f5f"
           }
          },
          "ternary": {
           "aaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "baxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "caxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "title": {
           "x": 0.05
          },
          "xaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          },
          "yaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          }
         }
        }
       }
      }
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Sunburst chart: the inner ring is the Strategy and the outer ring is the\n",
    "# breakdown of direct_answer_clean values within each strategy.\n",
    "sunburst_levels = [\"Strategy\", \"direct_answer_clean\"]\n",
    "fig = px.sunburst(data, path=sunburst_levels)\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pvalue      0.0015531302152038344\n",
      "statistic   114.0\n"
     ]
    }
   ],
   "source": [
    "# Paired comparison of the sys1 and sys2 direct answers with McNemar's test.\n",
    "# Rows where either answer is \"error\" are dropped before counting.\n",
    "sys1_is_error = all_datasets_df[\"sys1_direct_answer\"] == \"error\"\n",
    "sys2_is_error = all_datasets_df[\"sys2_direct_answer\"] == \"error\"\n",
    "all_datasets_df_without_errors = all_datasets_df[~(sys1_is_error | sys2_is_error)]\n",
    "\n",
    "# Boolean masks over the error-free rows, one per (system, answer) pair.\n",
    "sys1_answers = all_datasets_df_without_errors[\"sys1_direct_answer\"]\n",
    "sys2_answers = all_datasets_df_without_errors[\"sys2_direct_answer\"]\n",
    "sys1_yes, sys1_no = sys1_answers == \"yes\", sys1_answers == \"no\"\n",
    "sys2_yes, sys2_no = sys2_answers == \"yes\", sys2_answers == \"no\"\n",
    "\n",
    "# Concordant pairs: both systems gave the same answer.\n",
    "sys1_yes_sys2_yes = all_datasets_df_without_errors[sys1_yes & sys2_yes]\n",
    "sys1_no_sys2_no = all_datasets_df_without_errors[sys1_no & sys2_no]\n",
    "\n",
    "# Discordant pairs: the two systems disagree.\n",
    "sys1_yes_sys2_no = all_datasets_df_without_errors[sys1_yes & sys2_no]\n",
    "sys1_no_sys2_yes = all_datasets_df_without_errors[sys1_no & sys2_yes]\n",
    "\n",
    "# 2x2 contingency table: rows are sys1 (yes, no), columns are sys2 (yes, no).\n",
    "yes_row = [len(sys1_yes_sys2_yes), len(sys1_yes_sys2_no)]\n",
    "no_row = [len(sys1_no_sys2_yes), len(sys1_no_sys2_no)]\n",
    "table = np.array([yes_row, no_row])\n",
    "\n",
    "# NOTE: statsmodels applies `correction` only when exact=False, so it has no\n",
    "# effect on this exact test.\n",
    "result = mcnemar(table, exact=True, correction=True)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.plotly.v1+json": {
       "config": {
        "plotlyServerURL": "https://plot.ly"
       },
       "data": [
        {
         "branchvalues": "total",
         "domain": {
          "x": [
           0,
           1
          ],
          "y": [
           0,
           1
          ]
         },
         "hovertemplate": "labels=%{label}<br>count=%{value}<br>parent=%{parent}<br>id=%{id}<extra></extra>",
         "ids": [
          "sys1/Arithmetic/no",
          "sys2/Arithmetic/no",
          "sys1/Common Sense/no",
          "sys2/Common Sense/no",
          "sys1/Symbolic/no",
          "sys2/Symbolic/no",
          "sys1/Arithmetic/yes",
          "sys2/Arithmetic/yes",
          "sys1/Common Sense/yes",
          "sys2/Common Sense/yes",
          "sys1/Symbolic/yes",
          "sys2/Symbolic/yes",
          "sys1/Arithmetic",
          "sys2/Arithmetic",
          "sys1/Common Sense",
          "sys2/Common Sense",
          "sys1/Symbolic",
          "sys2/Symbolic",
          "sys1",
          "sys2"
         ],
         "labels": [
          "no",
          "no",
          "no",
          "no",
          "no",
          "no",
          "yes",
          "yes",
          "yes",
          "yes",
          "yes",
          "yes",
          "Arithmetic",
          "Arithmetic",
          "Common Sense",
          "Common Sense",
          "Symbolic",
          "Symbolic",
          "sys1",
          "sys2"
         ],
         "name": "",
         "parents": [
          "sys1/Arithmetic",
          "sys2/Arithmetic",
          "sys1/Common Sense",
          "sys2/Common Sense",
          "sys1/Symbolic",
          "sys2/Symbolic",
          "sys1/Arithmetic",
          "sys2/Arithmetic",
          "sys1/Common Sense",
          "sys2/Common Sense",
          "sys1/Symbolic",
          "sys2/Symbolic",
          "sys1",
          "sys2",
          "sys1",
          "sys2",
          "sys1",
          "sys2",
          "",
          ""
         ],
         "type": "sunburst",
         "values": [
          192,
          191,
          241,
          285,
          202,
          213,
          453,
          454,
          141,
          97,
          62,
          51,
          645,
          645,
          382,
          382,
          264,
          264,
          1291,
          1291
         ]
        }
       ],
       "layout": {
        "legend": {
         "tracegroupgap": 0
        },
        "margin": {
         "t": 60
        },
        "template": {
         "data": {
          "bar": [
           {
            "error_x": {
             "color": "#2a3f5f"
            },
            "error_y": {
             "color": "#2a3f5f"
            },
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "bar"
           }
          ],
          "barpolar": [
           {
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "barpolar"
           }
          ],
          "carpet": [
           {
            "aaxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "baxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "type": "carpet"
           }
          ],
          "choropleth": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "choropleth"
           }
          ],
          "contour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "contour"
           }
          ],
          "contourcarpet": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "contourcarpet"
           }
          ],
          "heatmap": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "heatmap"
           }
          ],
          "heatmapgl": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "heatmapgl"
           }
          ],
          "histogram": [
           {
            "marker": {
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "histogram"
           }
          ],
          "histogram2d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2d"
           }
          ],
          "histogram2dcontour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2dcontour"
           }
          ],
          "mesh3d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "mesh3d"
           }
          ],
          "parcoords": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "parcoords"
           }
          ],
          "pie": [
           {
            "automargin": true,
            "type": "pie"
           }
          ],
          "scatter": [
           {
            "fillpattern": {
             "fillmode": "overlay",
             "size": 10,
             "solidity": 0.2
            },
            "type": "scatter"
           }
          ],
          "scatter3d": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatter3d"
           }
          ],
          "scattercarpet": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattercarpet"
           }
          ],
          "scattergeo": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergeo"
           }
          ],
          "scattergl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergl"
           }
          ],
          "scattermapbox": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattermapbox"
           }
          ],
          "scatterpolar": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolar"
           }
          ],
          "scatterpolargl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolargl"
           }
          ],
          "scatterternary": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterternary"
           }
          ],
          "surface": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "surface"
           }
          ],
          "table": [
           {
            "cells": {
             "fill": {
              "color": "#EBF0F8"
             },
             "line": {
              "color": "white"
             }
            },
            "header": {
             "fill": {
              "color": "#C8D4E3"
             },
             "line": {
              "color": "white"
             }
            },
            "type": "table"
           }
          ]
         },
         "layout": {
          "annotationdefaults": {
           "arrowcolor": "#2a3f5f",
           "arrowhead": 0,
           "arrowwidth": 1
          },
          "autotypenumbers": "strict",
          "coloraxis": {
           "colorbar": {
            "outlinewidth": 0,
            "ticks": ""
           }
          },
          "colorscale": {
           "diverging": [
            [
             0,
             "#8e0152"
            ],
            [
             0.1,
             "#c51b7d"
            ],
            [
             0.2,
             "#de77ae"
            ],
            [
             0.3,
             "#f1b6da"
            ],
            [
             0.4,
             "#fde0ef"
            ],
            [
             0.5,
             "#f7f7f7"
            ],
            [
             0.6,
             "#e6f5d0"
            ],
            [
             0.7,
             "#b8e186"
            ],
            [
             0.8,
             "#7fbc41"
            ],
            [
             0.9,
             "#4d9221"
            ],
            [
             1,
             "#276419"
            ]
           ],
           "sequential": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ],
           "sequentialminus": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ]
          },
          "colorway": [
           "#636efa",
           "#EF553B",
           "#00cc96",
           "#ab63fa",
           "#FFA15A",
           "#19d3f3",
           "#FF6692",
           "#B6E880",
           "#FF97FF",
           "#FECB52"
          ],
          "font": {
           "color": "#2a3f5f"
          },
          "geo": {
           "bgcolor": "white",
           "lakecolor": "white",
           "landcolor": "#E5ECF6",
           "showlakes": true,
           "showland": true,
           "subunitcolor": "white"
          },
          "hoverlabel": {
           "align": "left"
          },
          "hovermode": "closest",
          "mapbox": {
           "style": "light"
          },
          "paper_bgcolor": "white",
          "plot_bgcolor": "#E5ECF6",
          "polar": {
           "angularaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "radialaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "scene": {
           "xaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "yaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "zaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           }
          },
          "shapedefaults": {
           "line": {
            "color": "#2a3f5f"
           }
          },
          "ternary": {
           "aaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "baxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "caxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "title": {
           "x": 0.05
          },
          "xaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          },
          "yaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          }
         }
        }
       }
      }
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# make a pie plot with the first layer being the Strategy and the second layer being how much of the direct_answer_clean for each strategy is yes or no\n",
    "fig = px.sunburst(data, path=[\"Strategy\", \"ability\", \"direct_answer_clean\"])\n",
    "# add the title that would explaion\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ability: Arithmetic\n",
      "sys1 yes ratio: 0.7023255813953488\n",
      "sys2 yes ratio: 0.703875968992248\n",
      "ability: Common Sense\n",
      "sys1 yes ratio: 0.36910994764397903\n",
      "sys2 yes ratio: 0.25392670157068065\n",
      "ability: Symbolic\n",
      "sys1 yes ratio: 0.23484848484848486\n",
      "sys2 yes ratio: 0.19318181818181818\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAeoAAAEiCAYAAAA21pHjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8fJSN1AAAACXBIWXMAAA9hAAAPYQGoP6dpAABC9UlEQVR4nO3de1xNWf8H8M8pdbrfUG5RuUcSHkaMNEoTk5IHD0aJMZgxjBi/ekYI4zLGbYaZGJTLGNeGBzNEJGQYVMwwiMil3EmZTrf1+8OrM850O/vo3uf9ep3Xq732Xnt/T+3T96y9115LJoQQICIioipJq7IDICIiouIxURMREVVhTNRERERVGBM1ERFRFcZETUREVIUxURMREVVhTNRERERVGBM1ERFRFcZETUREVIUxURMREVVh1TpRx8bGwsvLC40aNYJMJsPu3btL3D4mJgYymazQKy0trWICJiIikqhaJ+rMzEw4Ojpi1apVkupduXIFqampypelpWU5RUhERPRm6lR2AG/C09MTnp6ekutZWlrCzMys7AMiIiIqY9W6Ra2pjh07omHDhnB3d8fJkycrOxwiIqJiVesWtVQNGzZEWFgYunTpAoVCgbVr16J37944ffo0OnXqVGw9hUIBhUKhXBZCIDs7G/Xq1YNMJquI0ImIqJaS1ZT5qGUyGX766Sf4+PhIqufi4oKmTZti06ZNxW4ze/ZshIaGFirfsmULDAwMpIZKREQEb29vtbarVS3qonTt2hUnTpwocZvg4GAEBgYql9PT02FtbY2+ffvCxMSkvEMkIqJarNYn6oSEBDRs2LDEbeRyOeRyeaFyHR0d6OjolFdoRERE1TtRZ2RkICkpSbmcnJyMhIQEWFhYoGnTpggODsbdu3exceNGAMDy5ctha2uLdu3aISsrC2vXrsWRI0cQFRVVWW+BiIioRNU6UZ89exaurq7K5YLL0/7+/oiIiEBqaipSUlKU67OzszF16lTcvXsXBgYG6NChAw4fPqyyDyIioqqkxnQmq0jp6ekwNTXF8+fPeY+aiIjKVa18jpqIiKi6YKImIiKqwpioiYiIqjAmaiIioiqMiZqIiKgKY6ImIiKqwpioiYiIqjAmaiIioiqMiZqIiKgKq9ZDiBLVBJ0/21jZIZCazi32q+wQqBZii5qIiKgKY6ImIiKqwpioiYiIqjAmaiIioiqMiZqIiKgKY6ImIiKqwpioiYiIqjCNnqOOjo5GdHQ0Hjx4gPz8fJV169evL5PAiIiISINEHRoaijlz5qBLly5o2LAhZDJZecRFRERE0CBRh4WFISIiAiNHjiyPeIiIiOg1ku9RZ2dnw9nZuTxiISIion+QnKg/+OADbNmypTxiISIion+QfOk7KysLa9asweHDh9GhQwfo6OiorF+6dGmZBUdERFTbSW5RX7hwAR07doSWlhZ+//13xMfHK18JCQnlEGLxYmNj4eXlhUaNGkEmk2H37t1q1z158iTq1KmDjh07llt8REREb0pyi/ro0aPlEYdGMjMz4ejoiNGjR8PX11ftes+ePYOfnx/69OmD+/fvl2OEREREb6Zaz0ft6ekJT09PyfXGjx+P4cOHQ1tbW1IrnIiIqKKplah9fX0REREBExOTUluukZGRZRJYeQkPD8eNGzewefNmzJs3r7LDISIiKpFaidrU1FQ5sImpqWm5BlSerl27hqCgIBw/fhx16qh/MUGhUEChUCiX09PTAQA5OTnIyckp8zipdtHVruwISF38vFNZ+mdn7OKola3Cw8OL/Lk6ycvLw/DhwxEaGopWrVpJqrtgwQKEhoYWKo+KioKBgUFZhUi1VFD36vvlt7b5+eefKzsEqkG8vb3V2k4mhBDlHEuFkMlk+Omnn+Dj41Pk+mfPnsHc3Bza2n83X/Lz8yGEgLa2NqKiovDOO+8UWbeoFrW1tTUePXoEExOTMn0fVPv0CvmxskMgNcXOHVbZIVANUqYt
6prAxMQEFy9eVCn79ttvceTIEezcuRO2trbF1pXL5ZDL5YXKdXR01P5FExUnO6+yIyB18fNOlaFaJ+qMjAwkJSUpl5OTk5GQkAALCws0bdoUwcHBuHv3LjZu3AgtLS20b99epb6lpSX09PQKlRMREVUV1TpRnz17Fq6ursrlwMBAAIC/vz8iIiKQmpqKlJSUygqPiIjojZXJPepnz57BzMysDMKpHtLT02Fqaornz5/zHjW9sc6fbazsEEhN5xb7VXYIVAtJHkJ00aJF2LZtm3J5yJAhqFu3Lho3bozExMQyDY6IiKi2k5yow8LCYG1tDQA4dOgQDh06hF9++QWenp747LPPyjxAIiKi2kzyPeq0tDRlot63bx+GDBmCvn37wsbGBt26dSvzAImIiGozyS1qc3Nz3L59GwBw4MABuLm5AQCEEMjL43MmREREZUlyi9rX1xfDhw9Hy5Yt8fjxY+WkGPHx8WjRokWZB0hERFSbSU7Uy5Ytg62tLVJSUvDll1/CyMgIAJCamoqPPvqozAMkIiKqzSQl6pycHIwbNw4hISGFRvKaMmVKmQZGREREEu9R6+joYNeuXeUVCxEREf2D5M5kPj4+2L17dzmEQkRERP8k+R51y5YtMWfOHJw8eRKdO3eGoaGhyvpJkyaVWXBERES1neQhREuaZUomk+HGjRsl1r9z5w7MzMyUndAK5OTk4NSpU+jVq5eUcCoFhxClssQhRKsPDiFKlUFyizo5OVmjA6WmpsLb2xvnzp2DTCbD8OHD8e233yoT9pMnT+Dq6spnsYmIiF4j+R51gezsbFy5cgW5ublqbR8UFAQtLS2cPn0aBw4cwKVLl+Dq6oqnT58qtymD+UGIiIhqFMmJ+uXLlxgzZgwMDAzQrl075TSSn3zyCRYuXFhsvcOHD+Prr79Gly5d4ObmhpMnT6Jhw4Z455138OTJEwCvLp0TERHR3yQn6uDgYCQmJiImJgZ6enrKcjc3N5VZtf7p+fPnMDc3Vy7L5XJERkbCxsYGrq6uePDggdRQiIiIajzJiXr37t1YuXIlevbsqdICbteuHa5fv15sPTs7O1y4cEGlrE6dOtixYwfs7Ozw3nvvSQ2FiIioxpOcqB8+fAhLS8tC5ZmZmSVeuvb09MSaNWsKlRck644dO0oNhYiIqMaTnKi7dOmC/fv3K5cLkvPatWvRvXv3Yut98cUX2LFjR5Hr6tSpg127dpX6aBcREVFtI/nxrPnz58PT0xOXLl1Cbm4uVqxYgUuXLiEuLg7Hjh0r/kB16pT4zHGdOnXQrFkzqeEQERHVaJITdc+ePZGQkICFCxfCwcEBUVFR6NSpE06dOgUHB4dS6wshsHPnThw9ehQPHjxAfn6+yvrIyEipIREREdVYkhM1ADRv3hzff/+9Rgf89NNPsXr1ari6usLKyoqPZBEREZVAcqL28/ODq6srXFxcYGdnJ/mAmzZtQmRkJPr16ye5LhERUW0juTOZrq4uFixYgBYtWsDa2hrvv/8+1q5di2vXrqlV39TUVKMET0REVBtJTtRr167F1atXcfv2bXz55ZcwMjLCkiVL0KZNGzRp0qTU+rNnz0ZoaCj++usvjQImIiKqTTQe69vc3Bx169aFubk5zMzMUKdOHdSvX7/UekOGDMHTp09haWkJBwcHdOrUSeUlRWxsLLy8vNCoUSPIZLJS58k+ceIEevTogbp160JfXx9t2rTBsmXLJB2TiIioIkm+R/3f//4XMTExiI+PR9u2beHi4oKgoCD06tVLZYjQ4vj7++PcuXN4//3337gzWWZmJhwdHTF69Gj4+vqWur2hoSEmTpyIDh06wNDQECdOnMC4ceNgaGiIDz/8UOM4iIiIyovk+ai1tLRQv359TJkyBb6+vmjVqpWkAxoaGuLgwYPo2bOnpHqlkclk+Omnn+Dj4yOpnq+vLwwNDbFp0ya163A+aipLnI+6+uB81FQZJLeo4+PjcezYMcTExGDJkiXQ1dWF
i4sLevfujd69e5eauK2tratMcouPj0dcXBzmzZtX4nYKhQIKhUK5nJ6eDgDIyclBTk5OucZINZ+udmVHQOri553Kko6OjlrbSW5R/1NiYiKWLVuGH374Afn5+cjLyytx+/379+Obb75BWFgYbGxs3uTQKqS0qJs0aYKHDx8iNzcXs2fPRkhISInbF3SA+6ctW7bAwMBA05CJiKgW8/b2Vms7yYlaCIH4+HjExMQgJiYGJ06cQHp6Ojp06AAXF5dSO2eZm5vj5cuXyM3NhYGBQaFvFAVzU0slJVEnJycjIyMDv/76K4KCgrBy5UoMGzas2O2LalFbW1vj0aNHVebqAFVfvUJ+rOwQSE2xc4v/P0EklbotasmXvi0sLJCRkQFHR0e4uLhg7NixePvtt2FmZqZW/eXLl0s9ZJmztbUFADg4OOD+/fuYPXt2iYlaLpdDLpcXKtfR0VH7F01UnOySL0JRFcLPO1UGyYl68+bNePvttzVuSfr7+2tUr7zk5+ertJaJiIiqEsmJun///irL6enpOHLkCFq3bo22bduqtY/r168jPDwc169fx4oVK2BpaYlffvkFTZs2Rbt27dSOJSMjA0lJScrl5ORkJCQkwMLCAk2bNkVwcDDu3r2LjRtf9apdtWoVmjZtijZt2gB49Rz2V199hUmTJql9TCIioookecCTIUOGYOXKlQCAv/76C126dMGQIUPQoUMH7Nq1q9T6x44dg4ODA06fPo3IyEhkZGQAeNUpbdasWZJiOXv2LJycnODk5AQACAwMhJOTE2bOnAkASE1NRUpKinL7/Px8BAcHo2PHjujSpQtWrVqFRYsWYc6cOZKOS0REVFEkdyZr0KABDh48CEdHR2zZsgWzZs1CYmIiNmzYgDVr1iA+Pr7E+t27d8fgwYMRGBgIY2NjJCYmws7ODmfOnIGvry/u3LnzRm+oIvA5aipLfI66+uBz1FQZJLeonz9/DgsLCwDAgQMHMGjQIBgYGKB///5qTcxx8eJFDBw4sFC5paUlHj16JDUcIiKiGk1yora2tsapU6eQmZmJAwcOoG/fvgCAp0+fQk9Pr9T6ZmZmSE1NLVQeHx+Pxo0bSw2HiIioRpOcqD/99FOMGDECTZo0QaNGjdC7d28ArzpmOTg4lFr/P//5D/7v//4PaWlpkMlkyM/Px8mTJzFt2jT4+fGyEhER0esk9/r+6KOP0LVrV9y+fRvu7u7Q0nqV6+3s7EodihMA5s+fj48//hjW1tbIy8uDvb098vLyMHz4cMyYMUP6OyAiIqrB3ngIUU3dvn0bFy9eREZGBpycnNCyZcvKCEMj7ExGZYmdyaoPdiajyiC5RZ2Xl4eIiAhER0fjwYMHyM/PV1l/5MiREuvPmTMH06ZNg7W1NaytrZXlf/31FxYvXqx8tIqIiIg0uEc9efJkTJ48GXl5eWjfvj0cHR1VXqUJDQ1VPjv9upcvXxY58QUREVFtJrlFvXXrVmzfvh39+vXT6IBCCMhkskLliYmJyse+iIiI6BXJiVpXVxctWrSQfCBzc3PIZDLIZDK0atVKJVnn5eUhIyMD48ePl7xfIiKimkxyop46dSpWrFiBlStXFtkyLs7y5cshhMDo0aMRGhoKU1NT5TpdXV3Y2Nige/fuUsMhIiKq0SQn6hMnTuDo0aP45Zdf0K5du0LTvkVGRhZZr2DWLFtbW/To0QN16kg+NBERUa0jOVuamZkVOQSouoyNjXH58mXl4Ch79uxBeHg47O3tMXv2bOjq6mq8byIioppGcqIODw9/owOOGzcOQUFBcHBwwI0bNzB06FD4+vpix44dePnyJZYvX/5G+yciIqpJJD+e9aauXr2Kjh07AgB27NgBFxcXbNmyBREREWpNk0lERFSbaHSjeOfOndi+fTtSUlKQnZ2tsu78+fMl1hVCKAdJOXz4MN577z0Aryb74OxZREREqiS3qL/++msEBATAysoK8fHx6Nq1K+rWrYsbN27A09Oz1Ppd
unTBvHnzsGnTJhw7dgz9+/cHACQnJ8PKykr6OyAiIqrBJCfqb7/9FmvWrME333wDXV1dTJ8+HYcOHcKkSZPw/PnzUusvW7YM586dw8SJE/H5558rn8neuXMnnJ2dpb8DIiKiGkzype+UlBRlQtXX18eLFy8AACNHjsRbb72FlStXlljf0dERv//+e6HyxYsXQ1tbW2o4RERENZrkFnWDBg3w5MkTAEDTpk3x66+/Anh16Vqdibj8/f0RGxtbqFxPT6/QM9lERES1neRE/c477+B///sfACAgIABTpkyBu7s7hg4dqtbz1c+fP4ebmxtatmyJ+fPn4+7du9KjJiIiqiUkz0edn5+P/Px85chiW7duRVxcHFq2bIlx48apNWDJw4cPsWnTJmzYsAGXLl2Cm5sbRo8eDR8fn2rRquZ81FSWOB919cH5qKkySE7UZe38+fMIDw/H2rVrYWRkhPfffx8fffQRWrZsWZlhlYiJmsoSE3X1wURNlaFSB9xOTU3FoUOHcOjQIWhra6Nfv364ePEi7O3t8eWXX2LKlCmVGR4RUaXhF7jqpTy/xFX4yGQ5OTnYtWsX3nvvPTRr1gw7duzAp59+inv37mHDhg04fPgwtm/fjjlz5pS6r9jYWHh5eaFRo0aQyWTYvXt3idtHRkbC3d0d9evXh4mJCbp3746DBw+W0TsjIiIqexWeqBs2bIixY8eiWbNmOHPmDM6ePYvx48erXEJ2dXWFmZlZqfvKzMyEo6MjVq1apdaxY2Nj4e7ujp9//hnnzp2Dq6srvLy8EB8fr+nbISIiKlcVful72bJlGDx4MPT09IrdxszMDMnJyaXuy9PTU63R0Ar8c8KP+fPnY8+ePdi7dy+cnJzU3g8REVFF0ahFnZubi8OHD2P16tXKAU/u3buHjIyMUuuOHDlSJUnfunULly5dUo7/XZHy8/Px4sULWFhYVPixiYiI1CG5RX3r1i28++67SElJgUKhgLu7O4yNjbFo0SIoFAqEhYUVWW/9+vV49uwZAgMDlWUffvgh1q1bBwBo3bo1Dh48CGtraw3finRfffUVMjIyMGTIkBK3UygUUCgUyuX09HQAr+635+TklGuMVPPpckC+aqMiP+88L6oXTc4NdR9Hlvx4lo+PD4yNjbFu3TrUrVsXiYmJsLOzQ0xMDMaOHYtr164VWe+tt97CuHHjEBAQAAA4cOAAvLy8EBERgbZt22LixImwt7fH2rVrpYTz9xuRyfDTTz/Bx8dHre23bNmCsWPHYs+ePXBzcytx29mzZyM0NLTIfRgYGGgSLhER1XLe3t5qbSc5UdetWxdxcXFo3bo1jI2NlYn65s2bsLe3x8uXL4utFxMTAwcHBwDAhAkT8PDhQ+zcuRMAEBMTg4CAALXuTRf5RiQk6q1bt2L06NHYsWOHcvaukhTVoi6YlpPPUdOb6hXyY2WHQGqKnTuswo7F86J60eTcULdFLfnSd35+PvLy8gqV37lzB8bGxsXW++uvv1SSWlxcHMaMGaNctrOzQ1pamtRwJPvxxx8xevRobN26Va0kDQByuRxyubxQuY6OTrUYSY2qtuzCHyeqoiry887zonopz3NDcmeyvn37qvSelslkyMjIwKxZs9CvX79i6zVr1gznzp0DADx69Ah//PEHevTooVyflpYGU1NTSbFkZGQgISEBCQkJAF5NDJKQkICUlBQAQHBwMPz8/n4IfcuWLfDz88OSJUvQrVs3pKWlIS0tTa3pOYmIiCqD5Bb1kiVL4OHhAXt7e2RlZWH48OG4du0a6tWrhx9/LP5Sjb+/Pz7++GP88ccfOHLkCNq0aYPOnTsr18fFxaF9+/aSYjl79ixcXV2VywUd1fz9/REREYHU1FRl0gaANWvWIDc3Fx9//DE+/vhjldgiIiIkHZuIiKgiSE7UTZo0QWJiIrZu3YoLFy4gIyMDY8aMwYgRI6Cvr19svenTp+Ply5eIjIxEgwYNsGPHDpX1J0+exLBh0q7x
9+7du8SpNf+ZfGNiYiTtn4iIqLJJ7kyWlZVV4mAltQEn5aCyxDGdq4+KnJSD50X1UqXG+ra0tIS/vz8OHTpUKYOUEBER1SaSE/WGDRvw8uVLeHt7o3Hjxvj0009x9uzZ8oiNiIio1pOcqAcOHIgdO3bg/v37mD9/Pi5duoS33noLrVq1UmvGKyIiIlKfxrNnGRsbIyAgAFFRUbhw4QIMDQ2LHL2LiIiINKdxos7KysL27dvh4+ODTp064cmTJ/jss88k7UMIUWKvbSIiotpOcqI+ePAg/P39YWVlhQkTJsDKygpRUVG4desWFi5cqNY+Nm7cCAcHB+jr60NfXx8dOnTApk2bJAdPRERU00l+jnrgwIF47733sHHjRvTr10/ysGlLly5FSEgIJk6cqByZ7MSJExg/fjwePXqEKVOmSA2JiIioxpKcqO/fv1/imN6l+eabb/Ddd9+pDO05YMAAtGvXDrNnz2aiJiIieo1aiTo9PV05sIcQQjkfc1FKGwAkNTUVzs7OhcqdnZ2RmpqqTjhERES1hlr3qM3NzfHgwQMAgJmZGczNzQu9CspL06JFC2zfvr1Q+bZt29CyZUuJ4RMREdVsarWojxw5AgsLCwDA0aNH3+iAoaGhGDp0KGJjY5X3qE+ePIno6OgiEzgREVFtplaidnFxUf5sa2sLa2tryGQylW2EELh9+3ap+xo0aBBOnz6NZcuWYffu3QCAtm3b4syZM3BycpIQOhERUc0nuTOZra0tUlNTYWlpqVL+5MkT2NraIi+v9NnOO3fujM2bN0s9NBERUa0j+TlqIUSh1jQAZGRkqDWrlra2tvJ+9+seP34MbW1tqeEQERHVaGq3qAMDAwEAMpkMISEhMDAwUK7Ly8vD6dOn0bFjx1L3U9xIZAqFArq6uuqGQ0REVCuonajj4+MBvEq0Fy9eVEmqurq6cHR0xLRp04qt//XXXwN4lejXrl0LIyMj5bq8vDzExsaiTZs2kt8AERFRTaZ2oi7o7R0QEIAVK1aU+rz0Py1btgzAq0QfFhamcplbV1cXNjY2CAsLk7RPIiKimk5yZ7Lw8HCNDpScnAwAcHV1RWRkpFrPXBMREdV2aiVqX19fREREwMTEBL6+viVuGxkZWeL6N30Om4iIqDZRK1Gbmpoqe3qbmpqWa0BERET0N7US9euXuzW99E1ERETSSX6OmoiIiCqO5ER9//59jBw5Eo0aNUKdOnWgra2t8iIiIqKyI7nX96hRo5CSkoKQkBA0bNiwyFHKNJGZmYlz586hV69eateJjY3F4sWLce7cOaSmpuKnn36Cj49PsdunpqZi6tSpOHv2LJKSkjBp0iQsX778zYMnIiIqJ5IT9YkTJ3D8+HG1RiGTIikpCa6urmqNFV4gMzMTjo6OGD16dKm90YFXo5/Vr18fM2bMUD7XTUREVJVJTtTW1tbFDgNa0Tw9PeHp6an29jY2NlixYgUAYP369eUVFhERUZmRnKiXL1+OoKAgrF69GjY2NmrXK5jPujhSWtIVTaFQQKFQKJfT09MBADk5OcjJyamssKiG0GXXjmqjIj/vPC+qF03ODR0dHbW2k5yohw4dipcvX6J58+YwMDAodKAnT54UWU+hUGDChAlwcHAocv2tW7cQGhoqNZwKsWDBgiJji4qKUpmchEgTQd05NkF18fPPP1fYsXheVC+anBve3t5qbadRi1oTHTt2hLW1Nfz9/Ytcn5iYWGUTdXBwsHL2MOBVi9ra2hp9+/aVPOY50T/1CvmxskMgNcXOHVZhx+J5Ub2U57khOVEXl2hL079/fzx79qzY9RYWFvDz89No3+VNLpdDLpcXKtfR0VH70gVRcbKr7l0f+oeK/LzzvKheyvPcUCtRp6enK1uOBfdni1NcC/O///1vifWsra056hkREdE/qJWozc3NkZqaCktLS5iZmRX57LQQAjKZrEI7hWVkZCApKUm5nJycjISEBFhYWKBp06YIDg7G3bt3sXHjRuU2
CQkJyroPHz5EQkICdHV1YW9vX2FxExERqUutRH3kyBFlr+2ymP3q3r17OHHiBB48eID8/HyVdZMmTVJ7P2fPnoWrq6tyueA+sr+/PyIiIpCamoqUlBSVOk5OTsqfz507hy1btqBZs2a4efOmBu+EiIiofKmVqFesWAEnJyeYmJjg1q1bGDp0aJH3bNURERGBcePGQVdXF3Xr1lVpnctkMkmJunfv3iU+0x0REVGorKo8A05ERKQOtcb63rdvHzIzMwEAAQEBeP78ucYHDAkJwcyZM/H8+XPcvHkTycnJyteNGzc03i8REVFNpFaLuk2bNggODoarqyuEENi+fXuxncZK67n98uVL/Oc//4GWFifuIiIiKo1aiTosLAyBgYHYv38/ZDIZZsyYUWSHMplMVmqiHjNmDHbs2IGgoCDNIiYiIqpF1ErUzs7O+PXXXwEAWlpauHr1KiwtLTU64IIFC/Dee+/hwIEDcHBwKPTs2dKlSzXaLxERUU0kecCT5ORk1K9fX+MDLliwAAcPHkTr1q0BoFBnMiIiIvqb5ETdrFkzHD9+HKtXr8b169exc+dONG7cGJs2bYKtrS169uxZYv0lS5Zg/fr1GDVqlKYxExER1RqSe3Tt2rULHh4e0NfXR3x8vHJWqefPn2P+/Pml1pfL5ejRo4f0SImIiGohyYl63rx5CAsLw/fff69yf7lHjx44f/58qfUnT56Mb775RuphiYiIaiXJl76vXLmCXr16FSo3NTUtcdKNAmfOnMGRI0ewb98+tGvXrlBnssjISKkhERER1ViSE3WDBg2QlJQEGxsblfITJ07Azs6u1PpmZmbw9fWVelgiIqJaSXKiHjt2LCZPnoz169dDJpPh3r17OHXqFKZNm4aQkJBS63OGLCIiIvVJTtRBQUHIz89Hnz598PLlS/Tq1QtyuRzTpk3DJ598otY+cnNzERMTg+vXr2P48OEwNjbGvXv3YGJiAiMjI8lvgoiIqKaSnKhlMhk+//xzfPbZZ0hKSkJGRgbs7e3VTrC3bt3Cu+++i5SUFCgUCri7u8PY2BiLFi2CQqFAWFiY5DdBRERUU2k84HbBHM5du3aV1AqePHkyunTpgqdPn0JfX19ZPnDgQERHR2saDhERUY2kVotaSuev0nptHz9+HHFxcdDV1VUpt7Gxwd27d9U+DhERUW2gVova1NRU+TIxMUF0dDTOnj2rXH/u3DlER0fD1NS01H3l5+cjLy+vUPmdO3dgbGwsIXQiIqKaT60W9es9tf/v//4PQ4YMQVhYGLS1tQEAeXl5+Oijj4qd+vJ1ffv2xfLly7FmzRoAr+55Z2RkYNasWejXr58m74GIiKjGknyPev369Zg2bZoySQOAtrY2AgMDsX79+lLrL1myBCdPnoS9vT2ysrIwfPhw5WXvRYsWSQ2HiIioRpPc6zs3Nxd//vmncvarAn/++Sfy8/NLrd+kSRMkJiZi27ZtSExMREZGBsaMGYMRI0aodC4jIiIiDRJ1QEAAxowZg+vXr6Nr164AgNOnT2PhwoUICAgotX5sbCycnZ0xYsQIjBgxQlmem5uL2NjYIocnJSIiqq0kJ+qvvvoKDRo0wJIlS5CamgoAaNiwIT777DNMnTq11Pqurq5ITU2FpaWlSvnz58/h6upaZEczIiKi2kpyotbS0sL06dMxffp0pKenA4BancgKCCEgk8kKlT9+/BiGhoZSwyEiIqrRJCfq10lJ0AXPYstkMowaNQpyuVy5Li8vDxcuXICzs/ObhENERFTjaDwymVQFz2ELIWBsbKzybHaDBg3w4YcfYvPmzZL2GRsbCy8vLzRq1AgymQy7d+8utU5MTAw6deoEuVyOFi1aICIiQrM3REREVAHeqEUtRcGz2DY2Npg2bVqZXObOzMyEo6MjRo8erdboacnJyejfvz/Gjx+PH374AdHR0fjggw/QsGFDeHh4vHE8REREZa3CEnWB6dOnQwihXL516xZ++ukn2Nvbo2/f
vpL25enpCU9PT7W3DwsLg62tLZYsWQIAaNu2LU6cOIFly5YxURMRUZWkVqK2sLDA1atXUa9ePYwePRorVqzQeLhPb29v+Pr6Yvz48Xj27Bm6du0KXV1dPHr0CEuXLsWECRM02q86Tp06BTc3N5UyDw8PfPrppyXWUygUUCgUyuWCTnQ5OTnIyckp8zipdtHVLn0bqhoq8vPO86J60eTc0NHRUWs7tRJ1dnY20tPTUa9ePWzYsAGLFi3SOFGfP38ey5YtAwDs3LkTDRo0QHx8PHbt2oWZM2eWa6JOS0uDlZWVSpmVlRXS09Px119/FTvgyoIFCxAaGlqoPCoqCgYGBuUSK9UeQd1LHyOfqoaff/65wo7F86J60eTc8Pb2Vms7tRJ19+7d4ePjg86dO0MIgUmTJhWb1EobRvTly5fKJB8VFQVfX19oaWnhrbfewq1bt9QKuqIFBwcjMDBQuZyeng5ra2v07dtXUs/3XiE/lkd4VA5i5w6rsGPxvKg+eF5Qccrz3FArUW/evBnLli3D9evXIZPJ8Pz5c2RlZWl0wBYtWmD37t0YOHAgDh48iClTpgAAHjx4ICnpaaJBgwa4f/++Stn9+/dhYmJS4vClcrlc5XGyAjo6OmpfugCAbI7lUm1I+bu+KZ4X1QfPCypOeZ4baiVqKysrLFy4EABga2uLTZs2oW7duhodcObMmRg+fDimTJmCPn36oHv37gBeta6dnJw02qe6unfvXujyxKFDh5QxEBERVTWSe30nJye/0QH//e9/o2fPnkhNTYWjo6OyvE+fPhg4cKCkfWVkZCApKUkltoSEBFhYWKBp06YIDg7G3bt3sXHjRgDA+PHjsXLlSkyfPh2jR4/GkSNHsH37duzfv/+N3hMREVF50WjAk2PHjsHLywstWrRAixYtMGDAABw/flytuuHh4TA1NYWTkxO0tP4+fNeuXdGmTRtJcZw9exZOTk7KlnhgYCCcnJwwc+ZMAEBqaipSUlKU29va2mL//v04dOgQHB0dsWTJEqxdu5aPZhERUZUlOVFv3rwZbm5uMDAwwKRJk5Qdy/r06YMtW7aUWj8oKAhWVlYYM2YM4uLiNAq6QO/evSGEKPQqGG0sIiICMTExherEx8dDoVDg+vXrGDVq1BvFQEREVJ4kJ+ovvvgCX375JbZt26ZM1Nu2bcPChQsxd+7cUuvfvXsXGzZswKNHj9C7d2+0adMGixYtQlpamkZvgIiIqCaTnKhv3LgBLy+vQuUDBgxQ6/51nTp1MHDgQOzZswe3b9/G2LFj8cMPP6Bp06YYMGAA9uzZg/z8fKlhERER1UiSE7W1tTWio6MLlR8+fBjW1taS9mVlZYWePXuie/fu0NLSwsWLF+Hv74/mzZsXumRNRERUG0nu9T116lRMmjQJCQkJymkpT548iYiICKxYsUKtfdy/fx+bNm1CeHg4bty4AR8fH+zbtw9ubm7IzMzEnDlz4O/vX2UHQCEiIqookhP1hAkT0KBBAyxZsgTbt28H8Gpyi23btqk1HJqXlxcOHjyIVq1aYezYsfDz84OFhYVyvaGhIaZOnYrFixdLDY2IiKjG0Wj2rIEDB0p+5rmApaUljh07VuIgI/Xr13/j57WJiIhqggqf5nLdunWlbiOTydCsWbMKiIaIiKhq02jAE02cOnUK+/btUynbuHEjbG1tYWlpiQ8//FBlKkkiIiKqwEQ9Z84c/PHHH8rlixcvYsyYMXBzc0NQUBD27t2LBQsWVFQ4RERE1UKFJeqEhAT06dNHubx161Z069YN33//PQIDA/H1118rO6cRERHRK2+UqAuG7FTH06dPYWVlpVw+duwYPD09lcv/+te/cPv27TcJh4iIqMbRKFFv3LgRDg4O0NfXh76+Pjp06IBNmzaVWMfKykrZkzs7Oxvnz5/HW2+9pVz/4sWLCp3rlYiIqDqQ3Ot76dKlCAkJwcSJE9GjRw8AwIkTJzB+/Hg8evQIU6ZMKbJev379EBQU
hEWLFmH37t0wMDDA22+/rVx/4cIFNG/eXMO3QUREVDNJTtTffPMNvvvuO/j5+SnLBgwYgHbt2mH27NnFJuq5c+fC19cXLi4uMDIywoYNG6Crq6tcv379evTt21eDt0BERFRzSU7UqampyqFDX+fs7IzU1NRi69WrVw+xsbF4/vw5jIyMoK2trbJ+x44dMDIykhoOERFRjSb5HnWLFi2K7J29bds2tGzZstT6pqamhZI0AFhYWKi0sImIiEiDFnVoaCiGDh2K2NhY5T3qkydPIjo6mo9XERERlTHJLepBgwbh9OnTqFevHnbv3o3du3ejXr16OHPmjMbjfxMREVHRNBrru3Pnzti8eXNZx0JERET/oFaiTk9Ph4mJifLnkhRsR0RERG9OrURtbm6O1NRUWFpawszMDDKZrNA2QgjIZDLk5eWVeZBERES1lVqJ+siRI7CwsAAAHD16tFwDIiIior+plahdXFyUP9va2sLa2rpQq1oIwbG6iYiIypjkXt+2trZ4+PBhofInT57A1ta2TIKSatWqVbCxsYGenh66deuGM2fOFLttTk4O5syZg+bNm0NPTw+Ojo44cOBABUZLRESkPsmJuuBe9D9lZGRAT0+vTIKSYtu2bQgMDMSsWbNw/vx5ODo6wsPDAw8ePChy+xkzZmD16tX45ptvcOnSJYwfPx4DBw5EfHx8BUdORERUOrUfzwoMDAQAyGQyhISEwMDAQLkuLy8Pp0+fRseOHcs8wNIsXboUY8eORUBAAAAgLCwM+/fvx/r16xEUFFRo+02bNuHzzz9Hv379AAATJkzA4cOHsWTJEj5yRkREVY7aibqgxSmEwMWLF1WG+9TV1YWjoyOmTZtW9hGWIDs7G+fOnUNwcLCyTEtLC25ubjh16lSRdRQKRaGWv76+Pk6cOFGusRIREWlC7URd0Ns7ICAAK1asqBLPSz969Ah5eXmwsrJSKbeyssKff/5ZZB0PDw8sXboUvXr1QvPmzREdHY3IyMgSHytTKBRQKBTK5YJnyXNycpCTk6N2vLqFhzinKkrK3/VN8byoPnheUHE0OTd0dHTU2k4mhBCS915F3Lt3D40bN0ZcXBy6d++uLJ8+fTqOHTuG06dPF6rz8OFDjB07Fnv37oVMJkPz5s3h5uaG9evX46+//iryOLNnz0ZoaGih8i1btqjcAiAiIlKXt7e3WttplKjPnj2L7du3IyUlBdnZ2SrrIiMjpe5OY9nZ2TAwMMDOnTvh4+OjLPf398ezZ8+wZ8+eYutmZWXh8ePHaNSoEYKCgrBv3z788ccfRW5bVIva2toajx49knRloVfIj2pvS5Urdu6wCjsWz4vqg+cFFUeTc0PdFrXksb63bt0KPz8/eHh4ICoqCn379sXVq1dx//79Cp+UQ1dXF507d0Z0dLQyUefn5yM6OhoTJ04ssa6enh4aN26MnJwc7Nq1C0OGDCl2W7lcDrlcXqhcR0dH7V80AGRz0LZqQ8rf9U3xvKg+eF5Qccrz3JD8eNb8+fOxbNky7N27F7q6ulixYgX+/PNPDBkyBE2bNi2PGEsUGBiI77//Hhs2bMDly5cxYcIEZGZmKnuB+/n5qXQ2O336NCIjI3Hjxg0cP34c7777LvLz8zF9+vQKj52IiKg0klvU169fR//+/QG8atFmZmZCJpNhypQpeOedd4q8l1uehg4diocPH2LmzJlIS0tDx44dceDAAWUHs5SUFGhp/f19JCsrCzNmzMCNGzdgZGSEfv36YdOmTTAzM6vQuImIiNQhOVGbm5vjxYsXAIDGjRvj999/h4ODA549e4aXL1+WeYDqmDhxYrGXumNiYlSWXVxccOnSpQqIioiI6M1JTtS9evXCoUOH4ODggMGDB2Py5Mk4cuQIDh06hD59+pRHjERERLWW5ES9cuVKZGVlAQA+//xz6OjoIC4uDoMGDcKMGTPKPEAiIqLaTHKiLpjuEng1Ctjrw3QW9xwyERERaUZyr++iKBQKLF26
tNJmzyIiIqqp1E7UCoUCwcHB6NKlC5ydnbF7924AQHh4OGxtbbFs2TJMmTKlvOIkIiKqldS+9D1z5kysXr0abm5uiIuLw+DBgxEQEIBff/0VS5cuxeDBg6GtzcFpiYiIypLaiXrHjh3YuHEjBgwYgN9//x0dOnRAbm4uEhMTi5yfmoiIiN6c2pe+79y5g86dOwMA2rdvD7lcjilTpjBJExERlSO1E3VeXp7KHNR16tSBkZFRuQRFREREr6h96VsIgVGjRiknp8jKysL48eNhaGiosl1Fzp5FRERU06mdqP39/VWW33///TIPhoiIiFSpnajDw8PLMw4iIiIqQpkMeEJERETlg4maiIioCmOiJiIiqsKYqImIiKowJmoiIqIqjImaiIioCmOiJiIiqsKYqImIiKowJmoiIqIqjImaiIioCmOiJiIiqsKYqImIiKqwGpGoV61aBRsbG+jp6aFbt244c+ZMidsvX74crVu3hr6+PqytrTFlyhRkZWVVULRERETqq/aJetu2bQgMDMSsWbNw/vx5ODo6wsPDAw8ePChy+y1btiAoKAizZs3C5cuXsW7dOmzbtg3//e9/KzhyIiKi0lX7RL106VKMHTsWAQEBsLe3R1hYGAwMDLB+/foit4+Li0OPHj0wfPhw2NjYoG/fvhg2bFiprXAiIqLKoPZ81FVRdnY2zp07h+DgYGWZlpYW3NzccOrUqSLrODs7Y/PmzThz5gy6du2KGzdu4Oeff8bIkSOLPY5CoYBCoVAuP3/+HADw5MkT5OTkqB2vVu5fam9Llevx48cVdiyeF9UHzwsqjibnho6ODoyNjSGTyUreUFRjd+/eFQBEXFycSvlnn30munbtWmy9FStWCB0dHVGnTh0BQIwfP77E48yaNUsA4Isvvvjii68yfT1//rzUXFetW9SaiImJwfz58/Htt9+iW7duSEpKwuTJkzF37lyEhIQUWSc4OBiBgYHK5fz8fDx58gR169Yt/ZtQDZeeng5ra2vcvn0bJiYmlR0OVRE8L6goPC8KMzY2LnWbap2o69WrB21tbdy/f1+l/P79+2jQoEGRdUJCQjBy5Eh88MEHAAAHBwdkZmbiww8/xOeffw4trcK37eVyOeRyuUqZmZlZ2byJGsLExIQfPCqE5wUVheeFNNW6M5muri46d+6M6OhoZVl+fj6io6PRvXv3Iuu8fPmyUDLW1tYGAAghyi9YIiIiDVTrFjUABAYGwt/fH126dEHXrl2xfPlyZGZmIiAgAADg5+eHxo0bY8GCBQAALy8vLF26FE5OTspL3yEhIfDy8lImbCIioqqi2ifqoUOH4uHDh5g5cybS0tLQsWNHHDhwAFZWVgCAlJQUlRb0jBkzIJPJMGPGDNy9exf169eHl5cXvvjii8p6C9WaXC7HrFmzCt0aoNqN5wUVheeFZmSC13uJiIiqrGp9j5qIiKimY6ImIiKqwpioiYiIqjAm6lpm9uzZ6NixY4nb3Lx5EzKZDAkJCRUSkzpsbGywfPnyyg6DiN5ATEwMZDIZnj179kb7+ef/A5lMht27d7/RPqsyJupq7tSpU9DW1kb//v3V2n7atGkqz52PGjUKPj4+5RSddBEREUUOJvPbb7/hww8/rPiAKlBaWho++eQT2NnZQS6Xw9raGl5eXip/r9ogOTkZw4cPR6NGjaCnp4cmTZrA29sbf/75Z2WHVqM8fPgQEyZMQNOmTSGXy9GgQQN4eHjg5MmTlR2aZKmpqfD09KzsMMpNtX88q7Zbt24dPvnkE6xbtw737t1Do0aNitxOCIG8vDwYGRnByMiogqN8c/Xr16/sEMrVzZs30aNHD5iZmWHx4sVwcHBATk4ODh48iI8//rjWJKmcnBy4u7ujdevWiIyMRMOGDXHnzh388ssvb9wKI1WDBg1CdnY2NmzYADs7O9y/fx/R0dEVOvFIWSluJMoao/SpL6iqevHihTAyMhJ//vmnGDp0qPjiiy+U644ePSoAiJ9/
/ll06tRJ6OjoiKNHj4pZs2YJR0dHIUTRk40cPXpUJCcnCwBi165donfv3kJfX1906NBBZfKT8PBwYWpqKvbu3StatWol9PX1xaBBg0RmZqaIiIgQzZo1E2ZmZuKTTz4Rubm5ynpZWVli6tSpolGjRsLAwEB07dpVHD16VCXm11+zZs0SQgjRrFkzsWzZMuV+nj59Kj788ENhaWkp5HK5aNeundi7d2+5/a7Lm6enp2jcuLHIyMgotO7p06fKn2/duiUGDBggDA0NhbGxsRg8eLBIS0tTri/4+65bt05YW1sLQ0NDMWHCBJGbmysWLVokrKysRP369cW8efNUjgFAhIWFif79+wt9fX3Rpk0bERcXJ65duyZcXFyEgYGB6N69u0hKSlKp9+233wo7Ozuho6MjWrVqJTZu3Fhov99//73w8fER+vr6okWLFmLPnj3F/h7i4+MFAHHz5s0Sf18pKSli8ODBwtTUVJibm4sBAwaI5ORk5Xp/f3/h7e0tFi9eLBo0aCAsLCzERx99JLKzs5XbrFq1SrRo0ULI5XJhaWkpBg0apFyXl5cn5s+fL2xsbISenp7o0KGD2LFjR4kxVSdPnz4VAERMTEyR6wMCAkT//v1VyrKzs0X9+vXF2rVrhRBCuLi4iIkTJ4rJkycLMzMzYWlpKdasWSMyMjLEqFGjhJGRkWjevLn4+eeflfso+Izv27dPODg4CLlcLrp16yYuXryocqydO3cKe3t7oaurK5o1aya++uorlfX//H8AQPz000/K5du3b4v//Oc/wtzcXBgYGIjOnTuLX3/9VZNfVZXARF2NrVu3TnTp0kUIIcTevXtF8+bNRX5+vhDi7w9Ehw4dRFRUlEhKShKPHz9WSdQvXrwQQ4YMEe+++65ITU0VqampQqFQKBN1mzZtxL59+8SVK1fEv//9b9GsWTORk5MjhHiVqHV0dIS7u7s4f/68OHbsmKhbt67o27evGDJkiPjjjz/E3r17ha6urti6dasy5g8++EA4OzuL2NhYkZSUJBYvXizkcrm4evWqUCgUYvny5cLExEQZz4sXL4QQqh/MvLw88dZbb4l27dqJqKgocf36dbF3716VfwjVyePHj4VMJhPz588vcbu8vDzRsWNH0bNnT3H27Fnx66+/is6dOwsXFxflNrNmzRJGRkbi3//+t/jjjz/E//73P6Grqys8PDzEJ598Iv7880+xfv16AUDlHxcA0bhxY7Ft2zZx5coV4ePjI2xsbMQ777wjDhw4IC5duiTeeust8e677yrrREZGCh0dHbFq1Spx5coVsWTJEqGtrS2OHDmist8mTZqILVu2iGvXrolJkyYJIyMj8fjx4yLf4507d4SWlpb46quvVL7gvS47O1u0bdtWjB49Wly4cEFcunRJDB8+XLRu3VooFAohxKtEbWJiIsaPHy8uX74s9u7dKwwMDMSaNWuEEEL89ttvQltbW2zZskXcvHlTnD9/XqxYsUJ5jHnz5ok2bdqIAwcOiOvXr4vw8HAhl8uLTWzVTU5OjjAyMhKffvqpyMrKKrT+5MmTQltbW9y7d09ZFhkZKQwNDZWfSRcXF2FsbCzmzp0rrl69KubOnSu0tbWFp6enWLNmjbh69aqYMGGCqFu3rsjMzBRC/P1/qW3btiIqKkpcuHBBvPfee8LGxkb5Jers2bNCS0tLzJkzR1y5ckWEh4cLfX19ER4eroylpET94sULYWdnJ95++21x/Phxce3aNbFt27ZCsyxWJ0zU1Zizs7NYvny5EOLVB69evXqFWqe7d+9WqfN6ohbi75bH6woSdcE3ZyGE+OOPPwQAcfnyZSHEq0QNQKWFNW7cOGFgYKD8IAshhIeHhxg3bpwQ4lVrUFtbW9y9e1fleH369BHBwcHK/ZqamhZ6r69/MA8ePCi0tLTElStXSvkNVQ+nT58WAERkZGSJ20VFRQltbW2RkpKiLCv4u5w5c0YI8erva2BgINLT05XbeHh4CBsbG5GX
l6csa926tViwYIFyGYCYMWOGcvnUqVMCgFi3bp2y7McffxR6enrKZWdnZzF27FiVGAcPHiz69etX7H4zMjIEAPHLL78U+z5XrlwpDAwMhLGxsXB1dRVz5swR169fV67ftGmTaN26tfJLqRBCKBQKoa+vLw4ePCiEeHVeN2vWTCXZDx48WAwdOlQIIcSuXbuEiYmJyu+pQFZWljAwMCj0j33MmDFi2LBhxcZd3ezcuVOYm5sLPT094ezsLIKDg0ViYqJyvb29vVi0aJFy2cvLS4waNUq57OLiInr27Klczs3NFYaGhmLkyJHKstTUVAFAnDp1Sgjx9/+l17+8P378WOjr64tt27YJIYQYPny4cHd3V4n1s88+E/b29srlkhL16tWrhbGxcbFfBqsjdiarpq5cuYIzZ85g2LBhAIA6depg6NChWLduncp2Xbp00fgYHTp0UP7csGFDAMCDBw+UZQYGBmjevLly2crKCjY2Nir3wK2srJR1Ll68iLy8PLRq1Up5r9zIyAjHjh3D9evX1Y4rISEBTZo0QatWrTR+b1WJUHNwwMuXL8Pa2hrW1tbKMnt7e5iZmeHy5cvKMhsbG5Wp86ysrGBvb68ylO7rf5cCr/+9C4bgdXBwUCnLyspCenq6Mp4ePXqo7KNHjx4qsfxzv4aGhjAxMSl07Nd9/PHHSEtLww8//IDu3btjx44daNeuHQ4dOgQASExMRFJSEoyNjZXnkIWFBbKyslTOo3bt2qmM39+wYUPlcd3d3dGsWTPY2dlh5MiR+OGHH/Dy5UsAQFJSEl6+fAl3d3eV83Tjxo2SztOqbtCgQbh37x7+97//4d1330VMTAw6deqEiIgIAMAHH3yA8PBwAK9mJPzll18wevRolX28/rfV1tZG3bp1C50zAAr9vV+fNMnCwgKtW7dWnjfFnVfXrl1DXl5eqe8rISEBTk5OsLCwKHXb6oKdyaqpdevWITc3V6XzmBACcrkcK1euVJYZGhpqfAwdHR3lzwXzbufn5xe5vmCbosoK6mRkZEBbWxvnzp0rNAGKlA5u+vr6am9bHbRs2RIymazMOoxJ/bsUVa/g713aOaBpPKXtw9jYGF5eXvDy8sK8efPg4eGBefPmwd3dHRkZGejcuTN++OGHQvVe73RY0nGNjY1x/vx5xMTEICoqCjNnzsTs2bPx22+/ISMjAwCwf/9+NG7cWGUfNW2Maj09Pbi7u8Pd3R0hISH44IMPMGvWLIwaNQp+fn4ICgrCqVOnEBcXB1tbW7z99tsq9Us71zQ9Z95ETfv/APDxrGopNzcXGzduxJIlS5CQkKB8JSYmolGjRvjxxx/V3peurq5a31LLgpOTE/Ly8vDgwQO0aNFC5VXQa1OdeDp06IA7d+7g6tWrFRF2ubOwsICHhwdWrVqFzMzMQusLeju3bdsWt2/fxu3bt5XrLl26hGfPnsHe3r6iwlVq27ZtoUd5Tp48WeaxyGQytGnTRvm76dSpE65duwZLS8tC55Gpqana+61Tpw7c3Nzw5Zdf4sKFC7h58yaOHDkCe3t7yOVypKSkFNr/61czaiJ7e3vl77lu3brw8fFBeHg4IiIilDMSloVff/1V+fPTp09x9epVtG3bFkDx51WrVq3UmuGwQ4cOSEhIwJMnT8os3srGRF0N7du3D0+fPsWYMWPQvn17ldegQYMKXf4uiY2NDS5cuIArV67g0aNHyMnJKbe4W7VqhREjRsDPzw+RkZFITk7GmTNnsGDBAuzfv18ZT0ZGBqKjo/Ho0SPl5cjXubi4oFevXhg0aBAOHTqE5ORk/PLLLzhw4EC5xV7eVq1ahby8PHTt2hW7du3CtWvXcPnyZXz99dfKy4Rubm5wcHDAiBEjcP78eZw5cwZ+fn5wcXF5o1scmvrss88QERGB7777DteuXcPSpUsRGRmJadOmabzPhIQEeHt7Y+fOnbh06RKSkpKwbt06rF+/Ht7e3gCAESNGoF69evD29sbx48eRnJyM
mJgYTJo0CXfu3FHrOPv27cPXX3+NhIQE3Lp1Cxs3bkR+fj5at24NY2NjTJs2DVOmTMGGDRtw/fp1nD9/Ht988w02bNig8XurSh4/fox33nkHmzdvxoULF5CcnIwdO3bgyy+/VP6egVeXvzds2IDLly/D39+/zI4/Z84cREdH4/fff8eoUaNQr1495XgOU6dORXR0NObOnYurV69iw4YNWLlypdrn1bBhw9CgQQP4+Pjg5MmTuHHjBnbt2oVTp06VWfwVjYm6Glq3bh3c3NyKbD0MGjQIZ8+exYULF9Ta19ixY9G6dWt06dIF9evXL/fBDsLDw+Hn54epU6eidevW8PHxwW+//YamTZsCAJydnTF+/HgMHToU9evXx5dfflnkfnbt2oV//etfGDZsGOzt7TF9+vQKuzJQHuzs7HD+/Hm4urpi6tSpaN++Pdzd3REdHY3vvvsOwKuW5Z49e2Bubo5evXrBzc0NdnZ22LZtW6XE7OPjgxUrVuCrr75Cu3btsHr1aoSHh6N3794a77NJkyawsbFBaGgounXrhk6dOmHFihUIDQ3F559/DuBV34jY2Fg0bdoUvr6+aNu2LcaMGYOsrCyYmJiodRwzMzNERkbinXfeQdu2bREWFoYff/wR7dq1AwDMnTsXISEhWLBgAdq2bYt3330X+/fvh62trcbvrSoxMjJCt27dsGzZMvTq1Qvt27dHSEgIxo4dq3LrzM3NDQ0bNoSHh0exYzRoYuHChZg8eTI6d+6MtLQ07N27F7q6ugBeXTHZvn07tm7divbt22PmzJmYM2cORo0apda+dXV1ERUVBUtLS/Tr1w8ODg5YuHChWq3xqorTXBIRUZEyMjLQuHFjhIeHw9fXt7LDqbXYmYyIiFTk5+fj0aNHWLJkCczMzDBgwIDKDqlWY6ImIiIVKSkpsLW1RZMmTRAREYE6dZgqKhMvfRMREVVh7ExGRERUhTFRExERVWFM1ERERFUYEzUREVEVxkRNRERUhTFRExERVWFM1ERERFUYEzUREVEVxkRNRERUhf0/rdTqPe1ZiJ0AAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 500x300 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# in a bar plot, have two bars, one for sys1 which is the ratio of sys1's that are yes\n",
    "# and another one for sys2 which is the ratio of sys2's that are yes\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "abilities = []\n",
    "ratios = []\n",
    "\n",
    "for ability in data[\"ability\"].unique():\n",
    "    ability_df = data[data[\"ability\"] == ability]\n",
    "\n",
    "    sys1_yes_ratio = (\n",
    "        ability_df[\n",
    "            (ability_df[\"Strategy\"] == \"sys1\")\n",
    "            & (ability_df[\"direct_answer_clean\"] == \"yes\")\n",
    "        ].shape[0]\n",
    "        / ability_df[ability_df[\"Strategy\"] == \"sys1\"].shape[0]\n",
    "    )\n",
    "\n",
    "    sys2_yes_ratio = (\n",
    "        ability_df[\n",
    "            (ability_df[\"Strategy\"] == \"sys2\")\n",
    "            & (ability_df[\"direct_answer_clean\"] == \"yes\")\n",
    "        ].shape[0]\n",
    "        / ability_df[ability_df[\"Strategy\"] == \"sys2\"].shape[0]\n",
    "    )\n",
    "    print(\"ability:\", ability)\n",
    "    print(\"sys1 yes ratio:\", sys1_yes_ratio)\n",
    "    print(\"sys2 yes ratio:\", sys2_yes_ratio)\n",
    "\n",
    "    sys1_ratio_to_sys2 = sys1_yes_ratio / sys2_yes_ratio\n",
    "    abilities.append(ability)\n",
    "    ratios.append(sys1_ratio_to_sys2)\n",
    "\n",
    "plt.figure(figsize=(5, 3))\n",
    "sns.barplot(x=abilities, y=ratios)\n",
    "# make the whole thing more beautiful\n",
    "sns.despine()\n",
    "plt.grid(axis=\"y\")\n",
    "plt.ylabel(\"Ratio of definitive answers in \\nSystem1 to System2\")\n",
    "\n",
    "plt.ylim(0.8, 1.5)\n",
    "\n",
    "plt.tight_layout()\n",
    "\n",
    "plt.savefig(\n",
    "    \"ratio_of_definitive_answers_in_first_3_sentences_per_ability.png\",\n",
    "    dpi=300,\n",
    "    bbox_inches=\"tight\",\n",
    ")\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Analyzing ability: Arithmetic\n",
      "pvalue      1.0\n",
      "statistic   65.0\n",
      "N:  645\n",
      "\n",
      "Analyzing ability: Common Sense\n",
      "pvalue      4.935351419046992e-06\n",
      "statistic   24.0\n",
      "N:  382\n",
      "\n",
      "Analyzing ability: Symbolic\n",
      "pvalue      0.19252645776146274\n",
      "statistic   24.0\n",
      "N:  264\n"
     ]
    }
   ],
   "source": [
    "all_datasets_df_without_errors = all_datasets_df[\n",
    "    (all_datasets_df[\"sys1_direct_answer\"] != \"error\")\n",
    "    & (all_datasets_df[\"sys2_direct_answer\"] != \"error\")\n",
    "]\n",
    "\n",
    "\n",
    "for ability in all_datasets_df_without_errors[\"ability\"].unique():\n",
    "    print(f\"\\nAnalyzing ability: {ability}\")\n",
    "\n",
    "    ability_df = all_datasets_df_without_errors[\n",
    "        all_datasets_df_without_errors[\"ability\"] == ability\n",
    "    ]\n",
    "\n",
    "    sys1_yes_sys2_yes = ability_df[\n",
    "        (ability_df[\"sys1_direct_answer\"] == \"yes\")\n",
    "        & (ability_df[\"sys2_direct_answer\"] == \"yes\")\n",
    "    ]\n",
    "\n",
    "    sys1_no_sys2_no = ability_df[\n",
    "        (ability_df[\"sys1_direct_answer\"] == \"no\")\n",
    "        & (ability_df[\"sys2_direct_answer\"] == \"no\")\n",
    "    ]\n",
    "\n",
    "    sys1_yes_sys2_no = ability_df[\n",
    "        (ability_df[\"sys1_direct_answer\"] == \"yes\")\n",
    "        & (ability_df[\"sys2_direct_answer\"] == \"no\")\n",
    "    ]\n",
    "\n",
    "    sys1_no_sys2_yes = ability_df[\n",
    "        (ability_df[\"sys1_direct_answer\"] == \"no\")\n",
    "        & (ability_df[\"sys2_direct_answer\"] == \"yes\")\n",
    "    ]\n",
    "\n",
    "    table = np.array(\n",
    "        [\n",
    "            [len(sys1_yes_sys2_yes), len(sys1_yes_sys2_no)],\n",
    "            [len(sys1_no_sys2_yes), len(sys1_no_sys2_no)],\n",
    "        ]\n",
    "    )\n",
    "\n",
    "    result = mcnemar(table, exact=True, correction=True)\n",
    "    print(result)\n",
    "    print(\"N: \", table.sum())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py310",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
