{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "6ff72016",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 54 runs in runs:\n",
      "  gpt-4o-mini_answerable-full_all-tools_3-shot\n",
      "  Qwen3-14B_answerable-full_data-tools_0-shot\n",
      "  Qwen3-32B_answerable-full_all-tools_1-shot\n",
      "  Llama-3.2-3B-Instruct_answerable-full_data-tools_0-shot\n",
      "  Mistral-Small-3.1-24B_answerable-full_data-tools_0-shot\n",
      "  Qwen3-32B_answerable-partial_all-tools_0-shot\n",
      "  Qwen3-14B_answerable-full_all-tools_0-shot\n",
      "  gpt-4.1-mini_answerable-full_all-tools_1-shot\n",
      "  Llama-3.1-70B-Instruct_answerable-full_all-tools_1-shot\n",
      "  Qwen3-4B_answerable-partial_all-tools_0-shot\n",
      "  Qwen3-32B_answerable-full_data-tools_0-shot\n",
      "  gpt-4o-mini_answerable-full_data-tools_0-shot\n",
      "  Mistral-Small-3.1-24B_answerable-partial_all-tools_0-shot\n",
      "  Qwen3-4B_answerable-full_all-tools_3-shot\n",
      "  Llama-3.1-70B-Instruct_answerable-partial_all-tools_0-shot\n",
      "  Qwen3-14B_answerable-full_all-tools_1-shot\n",
      "  Llama-3.3-70B-Instruct_answerable-full_all-tools_1-shot\n",
      "  Qwen3-32B_answerable-full_all-tools_0-shot\n",
      "  Llama-3.1-8B-Instruct_answerable-full_data-tools_0-shot\n",
      "  Llama-3.1-8B-Instruct_answerable-full_all-tools_3-shot\n",
      "  gpt-4.1-mini_answerable-full_data-tools_0-shot\n",
      "  Qwen3-30B-A3B_answerable-partial_all-tools_0-shot\n",
      "  gpt-4.1-mini_answerable-full_all-tools_0-shot\n",
      "  gpt-4o-mini_answerable-full_all-tools_1-shot\n",
      "  Qwen3-4B_answerable-full_data-tools_0-shot\n",
      "  Qwen3-32B_answerable-full_all-tools_3-shot\n",
      "  Qwen3-4B_answerable-full_all-tools_0-shot\n",
      "  Llama-3.1-8B-Instruct_answerable-full_all-tools_1-shot\n",
      "  Llama-3.1-8B-Instruct_answerable-partial_all-tools_0-shot\n",
      "  Qwen3-14B_answerable-partial_all-tools_0-shot\n",
      "  Qwen3-4B_answerable-full_all-tools_1-shot\n",
      "  Qwen3-30B-A3B_answerable-full_all-tools_1-shot\n",
      "  Llama-3.2-3B-Instruct_answerable-partial_all-tools_0-shot\n",
      "  Llama-3.3-70B-Instruct_answerable-full_all-tools_0-shot\n",
      "  Llama-3.1-70B-Instruct_answerable-full_all-tools_3-shot\n",
      "  Llama-3.1-70B-Instruct_answerable-full_data-tools_0-shot\n",
      "  Llama-3.2-3B-Instruct_answerable-full_all-tools_0-shot\n",
      "  Llama-3.3-70B-Instruct_answerable-full_all-tools_3-shot\n",
      "  Qwen3-14B_answerable-full_all-tools_3-shot\n",
      "  gpt-4.1-mini_answerable-full_all-tools_3-shot\n",
      "  Llama-3.1-8B-Instruct_answerable-full_all-tools_0-shot\n",
      "  Llama-3.3-70B-Instruct_answerable-full_data-tools_0-shot\n",
      "  Mistral-Small-3.1-24B_answerable-full_all-tools_1-shot\n",
      "  Qwen3-30B-A3B_answerable-full_all-tools_0-shot\n",
      "  gpt-4.1-mini_answerable-partial_all-tools_0-shot\n",
      "  gpt-4o-mini_answerable-full_all-tools_0-shot\n",
      "  Llama-3.2-3B-Instruct_answerable-full_all-tools_3-shot\n",
      "  Qwen3-30B-A3B_answerable-full_all-tools_3-shot\n",
      "  Llama-3.1-70B-Instruct_answerable-full_all-tools_0-shot\n",
      "  Qwen3-30B-A3B_answerable-full_data-tools_0-shot\n",
      "  Llama-3.3-70B-Instruct_answerable-partial_all-tools_0-shot\n",
      "  Mistral-Small-3.1-24B_answerable-full_all-tools_3-shot\n",
      "  Mistral-Small-3.1-24B_answerable-full_all-tools_0-shot\n",
      "  gpt-4o-mini_answerable-partial_all-tools_0-shot\n"
     ]
    }
   ],
   "source": [
    "import inspect\n",
    "import os\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "import analysis\n",
    "import pandas as pd\n",
    "\n",
    "# In Jupyter, __file__ is not defined, so use the current working directory\n",
    "sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))\n",
    "\n",
    "import matcher\n",
    "\n",
    "from frankenstein.tools import arithmetic, data_retrieval\n",
    "\n",
    "ARITHMETIC_TOOL_NAMES = [name for name, _ in inspect.getmembers(arithmetic, predicate=inspect.isfunction)]\n",
    "DATA_TOOL_NAMES = [name for name, _ in inspect.getmembers(data_retrieval, predicate=inspect.isfunction)]\n",
    "\n",
    "run_dir = Path('runs')\n",
    "dfs = {f.stem: pd.read_json(f, orient='records', lines=True, precise_float=True) for f in run_dir.iterdir()}\n",
    "print(f'Found {len(dfs)} runs in {run_dir}:')\n",
    "for name in dfs:\n",
    "    print(f'  {name}')\n",
    "\n",
    "m = matcher.Matcher()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "2c2b6691",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = dfs['Qwen3-14B_answerable-full_all-tools_0-shot']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "a2da0288",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>question_template</th>\n",
       "      <th>question</th>\n",
       "      <th>actions</th>\n",
       "      <th>answer</th>\n",
       "      <th>slot_values</th>\n",
       "      <th>answerable</th>\n",
       "      <th>data_availability</th>\n",
       "      <th>answer_format</th>\n",
       "      <th>messages</th>\n",
       "      <th>tokens</th>\n",
       "      <th>pred</th>\n",
       "      <th>correct</th>\n",
       "      <th>error</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>859b3565-fdd3-4c76-9c40-a2fa528ec406</td>\n",
       "      <td>AverageChange</td>\n",
       "      <td>What was the average yearly change in the perc...</td>\n",
       "      <td>[{'name': 'get_country_code_from_name', 'argum...</td>\n",
       "      <td>-0.48044</td>\n",
       "      <td>{'property': 'SE.PRE.ENRR', 'subject': 'BEL', ...</td>\n",
       "      <td>True</td>\n",
       "      <td>full</td>\n",
       "      <td>float</td>\n",
       "      <td>[{'role': 'system', 'content': 'You are a help...</td>\n",
       "      <td>4236</td>\n",
       "      <td>-0.48044</td>\n",
       "      <td>True</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     id question_template  \\\n",
       "0  859b3565-fdd3-4c76-9c40-a2fa528ec406     AverageChange   \n",
       "\n",
       "                                            question  \\\n",
       "0  What was the average yearly change in the perc...   \n",
       "\n",
       "                                             actions   answer  \\\n",
       "0  [{'name': 'get_country_code_from_name', 'argum... -0.48044   \n",
       "\n",
       "                                         slot_values  answerable  \\\n",
       "0  {'property': 'SE.PRE.ENRR', 'subject': 'BEL', ...        True   \n",
       "\n",
       "  data_availability answer_format  \\\n",
       "0              full         float   \n",
       "\n",
       "                                            messages  tokens      pred  \\\n",
       "0  [{'role': 'system', 'content': 'You are a help...    4236  -0.48044   \n",
       "\n",
       "   correct  error  \n",
       "0     True    0.0  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "4f9fd115",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['gold_tool_calls'] = df.apply(lambda row: analysis.get_gold_tool_calls(row, []), axis=1)\n",
    "df['pred_tool_calls'] = df.apply(lambda row: analysis.get_pred_tool_calls(row), axis=1)\n",
    "df['true_positives'] = df.apply(lambda row: analysis.get_true_positives(row), axis=1)\n",
    "df['false_positives'] = df.apply(lambda row: analysis.get_false_positives(row), axis=1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "3d9df37d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'name': 'get_country_code_from_name',\n",
       "  'arguments': {'country_name': 'Belgium'},\n",
       "  'result': 'BEL',\n",
       "  'id': None},\n",
       " {'name': 'search_for_indicator_names',\n",
       "  'arguments': {'keywords': 'School enrollment, preprimary (% gross)'},\n",
       "  'result': [{'indicator_name': 'School enrollment, preprimary (% gross)',\n",
       "    'indicator_description': 'Gross enrollment ratio is the ratio of total enrollment, regardless of age, to the population of the age group that officially corresponds to the level of education shown. Preprimary education refers to programs at the initial stage of organized instruction, designed primarily to introduce very young children to a school-type environment and to provide a bridge between home and school.'}],\n",
       "  'id': None},\n",
       " {'name': 'get_indicator_code_from_name',\n",
       "  'arguments': {'indicator_name': 'School enrollment, preprimary (% gross)'},\n",
       "  'result': 'SE.PRE.ENRR',\n",
       "  'id': None},\n",
       " {'name': 'retrieve_value',\n",
       "  'arguments': {'country_code': 'BEL',\n",
       "   'indicator_code': 'SE.PRE.ENRR',\n",
       "   'year': '2009'},\n",
       "  'result': 118.35961,\n",
       "  'id': None},\n",
       " {'name': 'retrieve_value',\n",
       "  'arguments': {'country_code': 'BEL',\n",
       "   'indicator_code': 'SE.PRE.ENRR',\n",
       "   'year': '2010'},\n",
       "  'result': 119.37523,\n",
       "  'id': None},\n",
       " {'name': 'retrieve_value',\n",
       "  'arguments': {'country_code': 'BEL',\n",
       "   'indicator_code': 'SE.PRE.ENRR',\n",
       "   'year': '2011'},\n",
       "  'result': 120.61282,\n",
       "  'id': None},\n",
       " {'name': 'retrieve_value',\n",
       "  'arguments': {'country_code': 'BEL',\n",
       "   'indicator_code': 'SE.PRE.ENRR',\n",
       "   'year': '2012'},\n",
       "  'result': 120.08512,\n",
       "  'id': None},\n",
       " {'name': 'retrieve_value',\n",
       "  'arguments': {'country_code': 'BEL',\n",
       "   'indicator_code': 'SE.PRE.ENRR',\n",
       "   'year': '2013'},\n",
       "  'result': 118.83549,\n",
       "  'id': None},\n",
       " {'name': 'retrieve_value',\n",
       "  'arguments': {'country_code': 'BEL',\n",
       "   'indicator_code': 'SE.PRE.ENRR',\n",
       "   'year': '2014'},\n",
       "  'result': 118.24409,\n",
       "  'id': None},\n",
       " {'name': 'retrieve_value',\n",
       "  'arguments': {'country_code': 'BEL',\n",
       "   'indicator_code': 'SE.PRE.ENRR',\n",
       "   'year': '2015'},\n",
       "  'result': 117.01423,\n",
       "  'id': None},\n",
       " {'name': 'retrieve_value',\n",
       "  'arguments': {'country_code': 'BEL',\n",
       "   'indicator_code': 'SE.PRE.ENRR',\n",
       "   'year': '2016'},\n",
       "  'result': 116.09559,\n",
       "  'id': None},\n",
       " {'name': 'retrieve_value',\n",
       "  'arguments': {'country_code': 'BEL',\n",
       "   'indicator_code': 'SE.PRE.ENRR',\n",
       "   'year': '2017'},\n",
       "  'result': 115.14532,\n",
       "  'id': None},\n",
       " {'name': 'retrieve_value',\n",
       "  'arguments': {'country_code': 'BEL',\n",
       "   'indicator_code': 'SE.PRE.ENRR',\n",
       "   'year': '2018'},\n",
       "  'result': 114.03566,\n",
       "  'id': None},\n",
       " {'name': 'subtract',\n",
       "  'arguments': {'value_a': 119.37523, 'value_b': 118.35961},\n",
       "  'result': 1.01562,\n",
       "  'id': None},\n",
       " {'name': 'subtract',\n",
       "  'arguments': {'value_a': 120.61282, 'value_b': 119.37523},\n",
       "  'result': 1.23759,\n",
       "  'id': None},\n",
       " {'name': 'subtract',\n",
       "  'arguments': {'value_a': 120.08512, 'value_b': 120.61282},\n",
       "  'result': -0.5277,\n",
       "  'id': None},\n",
       " {'name': 'subtract',\n",
       "  'arguments': {'value_a': 118.83549, 'value_b': 120.08512},\n",
       "  'result': -1.24963,\n",
       "  'id': None},\n",
       " {'name': 'subtract',\n",
       "  'arguments': {'value_a': 118.24409, 'value_b': 118.83549},\n",
       "  'result': -0.5914,\n",
       "  'id': None},\n",
       " {'name': 'subtract',\n",
       "  'arguments': {'value_a': 117.01423, 'value_b': 118.24409},\n",
       "  'result': -1.22986,\n",
       "  'id': None},\n",
       " {'name': 'subtract',\n",
       "  'arguments': {'value_a': 116.09559, 'value_b': 117.01423},\n",
       "  'result': -0.91864,\n",
       "  'id': None},\n",
       " {'name': 'subtract',\n",
       "  'arguments': {'value_a': 115.14532, 'value_b': 116.09559},\n",
       "  'result': -0.95027,\n",
       "  'id': None},\n",
       " {'name': 'subtract',\n",
       "  'arguments': {'value_a': 114.03566, 'value_b': 115.14532},\n",
       "  'result': -1.10966,\n",
       "  'id': None},\n",
       " {'name': 'mean',\n",
       "  'arguments': {'values': [-1.24963,\n",
       "    -1.22986,\n",
       "    -1.10966,\n",
       "    -0.95027,\n",
       "    -0.91864,\n",
       "    -0.5914,\n",
       "    -0.5277,\n",
       "    1.01562,\n",
       "    1.23759]},\n",
       "  'result': -0.48044,\n",
       "  'id': None}]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['actions'].iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "1b96d966",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([{'name': 'get_country_code_from_name',\n",
       "   'arguments': {'country_name': 'Belgium'}},\n",
       "  {'name': 'search_for_indicator_names',\n",
       "   'arguments': {'keywords': 'School enrollment, preprimary (% gross)'}},\n",
       "  {'name': 'get_indicator_code_from_name',\n",
       "   'arguments': {'indicator_name': 'School enrollment, preprimary (% gross)'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'BEL',\n",
       "    'indicator_code': 'SE.PRE.ENRR',\n",
       "    'year': '2009'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'BEL',\n",
       "    'indicator_code': 'SE.PRE.ENRR',\n",
       "    'year': '2010'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'BEL',\n",
       "    'indicator_code': 'SE.PRE.ENRR',\n",
       "    'year': '2011'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'BEL',\n",
       "    'indicator_code': 'SE.PRE.ENRR',\n",
       "    'year': '2012'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'BEL',\n",
       "    'indicator_code': 'SE.PRE.ENRR',\n",
       "    'year': '2013'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'BEL',\n",
       "    'indicator_code': 'SE.PRE.ENRR',\n",
       "    'year': '2014'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'BEL',\n",
       "    'indicator_code': 'SE.PRE.ENRR',\n",
       "    'year': '2015'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'BEL',\n",
       "    'indicator_code': 'SE.PRE.ENRR',\n",
       "    'year': '2016'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'BEL',\n",
       "    'indicator_code': 'SE.PRE.ENRR',\n",
       "    'year': '2017'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'BEL',\n",
       "    'indicator_code': 'SE.PRE.ENRR',\n",
       "    'year': '2018'}},\n",
       "  {'name': 'subtract',\n",
       "   'arguments': {'value_a': 119.37523, 'value_b': 118.35961}},\n",
       "  {'name': 'subtract',\n",
       "   'arguments': {'value_a': 120.61282, 'value_b': 119.37523}},\n",
       "  {'name': 'subtract',\n",
       "   'arguments': {'value_a': 120.08512, 'value_b': 120.61282}},\n",
       "  {'name': 'subtract',\n",
       "   'arguments': {'value_a': 118.83549, 'value_b': 120.08512}},\n",
       "  {'name': 'subtract',\n",
       "   'arguments': {'value_a': 118.24409, 'value_b': 118.83549}},\n",
       "  {'name': 'subtract',\n",
       "   'arguments': {'value_a': 117.01423, 'value_b': 118.24409}},\n",
       "  {'name': 'subtract',\n",
       "   'arguments': {'value_a': 116.09559, 'value_b': 117.01423}},\n",
       "  {'name': 'subtract',\n",
       "   'arguments': {'value_a': 115.14532, 'value_b': 116.09559}},\n",
       "  {'name': 'subtract',\n",
       "   'arguments': {'value_a': 114.03566, 'value_b': 115.14532}},\n",
       "  {'name': 'mean',\n",
       "   'arguments': {'values': [-1.24963,\n",
       "     -1.22986,\n",
       "     -1.10966,\n",
       "     -0.95027,\n",
       "     -0.91864,\n",
       "     -0.5914,\n",
       "     -0.5277,\n",
       "     1.01562,\n",
       "     1.23759]}}],\n",
       " 23)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['gold_tool_calls'].iloc[0], len(df['gold_tool_calls'].iloc[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "294a6791",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([{'name': 'get_country_code_from_name',\n",
       "   'arguments': {'country_name': 'Saint Vincent and the Grenadines'}},\n",
       "  {'name': 'search_for_indicator_names',\n",
       "   'arguments': {'keywords': 'Lending interest rate (%)'}},\n",
       "  {'name': 'get_indicator_code_from_name',\n",
       "   'arguments': {'indicator_name': 'Lending interest rate (%)'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'VCT',\n",
       "    'indicator_code': 'FR.INR.LEND',\n",
       "    'year': '2003'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'VCT',\n",
       "    'indicator_code': 'FR.INR.LEND',\n",
       "    'year': '2004'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'VCT',\n",
       "    'indicator_code': 'FR.INR.LEND',\n",
       "    'year': '2005'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'VCT',\n",
       "    'indicator_code': 'FR.INR.LEND',\n",
       "    'year': '2006'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'VCT',\n",
       "    'indicator_code': 'FR.INR.LEND',\n",
       "    'year': '2007'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'VCT',\n",
       "    'indicator_code': 'FR.INR.LEND',\n",
       "    'year': '2008'}},\n",
       "  {'name': 'retrieve_value',\n",
       "   'arguments': {'country_code': 'VCT',\n",
       "    'indicator_code': 'FR.INR.LEND',\n",
       "    'year': '2009'}},\n",
       "  {'name': 'subtract', 'arguments': {'value_a': 9.69263, 'value_b': 11.83238}},\n",
       "  {'name': 'subtract', 'arguments': {'value_a': 9.58413, 'value_b': 9.69263}},\n",
       "  {'name': 'subtract', 'arguments': {'value_a': 9.73168, 'value_b': 9.58413}},\n",
       "  {'name': 'subtract', 'arguments': {'value_a': 9.61258, 'value_b': 9.73168}},\n",
       "  {'name': 'subtract', 'arguments': {'value_a': 9.52442, 'value_b': 9.61258}},\n",
       "  {'name': 'subtract', 'arguments': {'value_a': 9.18656, 'value_b': 9.52442}}],\n",
       " 16)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['true_positives'].iloc[1], len(df['true_positives'].iloc[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "200d688e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'name': 'add',\n",
       "  'arguments': {'values': [-2.13975,\n",
       "    -0.33786,\n",
       "    -0.1191,\n",
       "    -0.1085,\n",
       "    -0.08816,\n",
       "    0.14755]}},\n",
       " {'name': 'divide', 'arguments': {'value_a': -2.64582, 'value_b': 6}}]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['false_positives'].iloc[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "9532ae59",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['coverage'] = df.apply(analysis.get_coverage, axis=1)\n",
    "df['recall'] = df.apply(analysis.get_recall, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "11ff2d09",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "np.float64(0.5217391304347826)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['recall'].iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "625b0209",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['precision'] = df.apply(analysis.get_precision, axis=1)\n",
    "df['error_made'] = df.apply(analysis.get_error_made, axis=1)\n",
    "# df['correct_indicator_data_process'] = df.apply(analysis.get_correct_indicator_data_process, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "d226aa70",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['correct_indicator_data_process'] = df.apply(analysis.get_correct_indicator_data_process, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "730cb13f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "correct_indicator_data_process\n",
       "True     262\n",
       "False    138\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['correct_indicator_data_process'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "338af196",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['missing_tool_calls'] = df.apply(analysis.get_missing_tool_calls, axis=1)\n",
    "df['additional'] = df.apply(analysis.get_additional_tool_calls, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "424dbf39",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "additional\n",
       "retrieve_value                  155\n",
       "unknown                         136\n",
       "get_country_code_from_name       23\n",
       "get_indicator_code_from_name     22\n",
       "divide                           19\n",
       "add                              15\n",
       "search_for_indicator_names       13\n",
       "greater_than                     13\n",
       "count                             9\n",
       "rank                              4\n",
       "subtract                          1\n",
       "sort                              1\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "correct = df[df['correct'] == True]\n",
    "df_exploded = correct.explode('additional')\n",
    "df_exploded['additional'] = df_exploded['additional'].apply(lambda x: x.get('name') if isinstance(x, dict) else 'unknown')\n",
    "df_exploded['additional'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "c9dacbf8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "false_positives\n",
       "retrieve_value                  569\n",
       "unknown                         201\n",
       "get_indicator_code_from_name     83\n",
       "subtract                         42\n",
       "divide                           42\n",
       "get_country_code_from_name       39\n",
       "add                              27\n",
       "search_for_indicator_names       24\n",
       "greater_than                     22\n",
       "get_country_name_from_code       11\n",
       "count                            10\n",
       "mean                              8\n",
       "rank                              7\n",
       "minimum                           7\n",
       "index                             6\n",
       "sort                              4\n",
       "maximum                           4\n",
       "multiply                          3\n",
       "get_country_codes_in_region       2\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Take an overview of which functions comprised the false positives\n",
    "# First we need to explode the false positives column\n",
    "df_exploded = df.explode('false_positives')\n",
    "df_exploded['false_positives'] = df_exploded['false_positives'].apply(\n",
    "    lambda x: x.get('name', 'unknown') if isinstance(x, dict) else 'unknown'\n",
    ")\n",
    "df_exploded['false_positives'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "d4ecc940",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Error Made</th>\n",
       "      <th>False</th>\n",
       "      <th>True</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Correct Answer Given</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>121</td>\n",
       "      <td>48</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>203</td>\n",
       "      <td>28</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Error Made            False  True \n",
       "Correct Answer Given              \n",
       "False                   121     48\n",
       "True                    203     28"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.crosstab(\n",
    "    df['correct'],\n",
    "    df['error_made'],\n",
    "    rownames=['Correct Answer Given'],\n",
    "    colnames=['Error Made'],\n",
    ").round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "28f496a1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Correct Indicator Data Process</th>\n",
       "      <th>False</th>\n",
       "      <th>True</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Correct Answer Given</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>90</td>\n",
       "      <td>79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>48</td>\n",
       "      <td>183</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Correct Indicator Data Process  False  True \n",
       "Correct Answer Given                        \n",
       "False                              90     79\n",
       "True                               48    183"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.crosstab(\n",
    "    df['correct'],\n",
    "    df['correct_indicator_data_process'],\n",
    "    rownames=['Correct Answer Given'],\n",
    "    colnames=['Correct Indicator Data Process'],\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "frankenstein",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
