{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "### Configuration"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "# boolean_expressions run(s) selected by latest=True, kwargs max_length=320 and the argmax decoder\n",
    "r = results(\n",
    "    task=\"boolean_expressions\",\n",
    "    latest=True,\n",
    "    kwargs={\"max_length\": 320},\n",
    "    decoder=\"argmax\",\n",
    ")\n",
    "# headline metrics for that selection\n",
    "accuracy(r), num_samples(r), no_response(r)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `pd` is not imported anywhere in this notebook; it presumably only exists as a side effect of\n",
    "# `from results import *`. Import it explicitly so this cell does not depend on that.\n",
    "import pandas as pd\n",
    "\n",
    "# pandas show full column contents (no truncation of long cells)\n",
    "pd.set_option('display.max_colwidth', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make Jupyter Notebook in the browser full width\n",
    "# `display` is imported from IPython.display; importing it from IPython.core.display is deprecated\n",
    "from IPython.display import display, HTML\n",
    "display(HTML('<style>.container {width:100% !important;}</style>'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Boolean Expressions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from results import *\n",
    "# runs filtered by model=\"davinci\", shots=0 and a task name that contains \"bool\" but not \"mini\"\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, filename, evaluation_time, shots],\n",
    "    model=\"davinci\",\n",
    "    shots=0,\n",
    "    task=lambda t: \"bool\" in t and \"mini\" not in t,\n",
    ")\n",
    "named(r)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Date Understanding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "# zero-shot runs whose task name contains both \"date_understanding\" and \"mini\", ordered by decoder\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, evaluation_time, shots],\n",
    "    task=lambda t: \"date_understanding\" in t and \"mini\" in t,\n",
    "    shots=0,\n",
    ").sort_values(by=\"decoder\")\n",
    "# disabled alternative views of r:\n",
    "# r[r.num_samples > 1][[\"model\", \"decoder\", \"num_samples\", \"accuracy\"]].groupby([\"model\", \"decoder\"]).agg({\"num_samples\": \"sum\", \"accuracy\": \"max\"})\n",
    "# r[[\"filename\", \"num_samples\", \"accuracy\"]].values\n",
    "r\n",
    "# TODO: check CoT for date understanding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# one (task, decoder, kwargs, accuracy) tuple per row of r\n",
    "list(zip(r.task, r.decoder, r.kwargs, r.accuracy))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "# zero-shot runs whose task name contains both \"date_understanding\" and \"small\";\n",
    "# ordered by decoder (ascending), then timestamp (descending)\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, evaluation_time, shots, errors],\n",
    "    task=lambda t: \"date_understanding\" in t and \"small\" in t,\n",
    "    shots=0,\n",
    ").sort_values(by=[\"decoder\", \"timestamp\"], ascending=[True, False])\n",
    "r\n",
    "# TODO: check CoT for date understanding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dump every sample of run c98e5ffa: blank separator lines, then the query, then the model result\n",
    "for s in data(results(hash=\"c98e5ffa\")).iloc:\n",
    "    print(\"\\n\\n\", s[\"query\"], s.model_result, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# prints query and results for a specific run (the entry at position 3 of data(r))\n",
    "for s in data(r)[3].iloc:\n",
    "    print(\"\\n\\nQUERY:\", s[\"query\"], \"\\nMODEL_RESULT:\", s.model_result, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Information Essentiality"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "# davinci runs whose task name contains both \"evaluating_information_essentiality\" and \"small\"\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, evaluation_time, shots, errors],\n",
    "    task=lambda t: \"evaluating_information_essentiality\" in t and \"small\" in t,\n",
    "    model=\"davinci\",\n",
    ")\n",
    "# disabled alternative views of r:\n",
    "# r[r.num_samples > 1][[\"model\", \"decoder\", \"num_samples\", \"accuracy\"]].groupby([\"model\", \"decoder\"]).agg({\"num_samples\": \"sum\", \"accuracy\": \"max\"})\n",
    "# r[[\"filename\", \"num_samples\", \"accuracy\"]].values\n",
    "r"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# first model_result value of the entry at position 1 of data(r)\n",
    "# NOTE(review): position 1 depends on the row order of r — confirm after re-running results()\n",
    "data(r)[1].model_result.values[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tracking Shuffled Objects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "# runs selected with task=\"tracking_shuffled_objects\"\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, evaluation_time, shots],\n",
    "    task=\"tracking_shuffled_objects\",\n",
    ")\n",
    "# disabled alternative views of r:\n",
    "# r[r.num_samples > 1][[\"model\", \"decoder\", \"num_samples\", \"accuracy\"]].groupby([\"model\", \"decoder\"]).agg({\"num_samples\": \"sum\", \"accuracy\": \"max\"})\n",
    "# r[[\"filename\", \"num_samples\", \"accuracy\"]].values\n",
    "r"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Matrix Shapes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from results import *\n",
    "# runs whose task name contains both \"matrixshapes\" and \"mini\"\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, evaluation_time, shots],\n",
    "    task=lambda t: \"matrixshapes\" in t and \"mini\" in t,\n",
    ")\n",
    "# disabled alternative views of r:\n",
    "# r[r.num_samples > 1][[\"model\", \"decoder\", \"num_samples\", \"accuracy\"]].groupby([\"model\", \"decoder\"]).agg({\"num_samples\": \"sum\", \"accuracy\": \"max\"})\n",
    "# r[[\"filename\", \"num_samples\", \"accuracy\"]].values\n",
    "r"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Small dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from results import *\n",
    "# runs whose task name contains both \"matrixshapes\" and \"small\"\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, evaluation_time, shots, errors],\n",
    "    task=lambda t: \"matrixshapes\" in t and \"small\" in t,\n",
    ")\n",
    "# disabled alternative views of r:\n",
    "# r[r.num_samples > 1][[\"model\", \"decoder\", \"num_samples\", \"accuracy\"]].groupby([\"model\", \"decoder\"]).agg({\"num_samples\": \"sum\", \"accuracy\": \"max\"})\n",
    "# r[[\"filename\", \"num_samples\", \"accuracy\"]].values\n",
    "named(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Side-by-side error analysis of three runs picked from r by position.\n",
    "# NOTE(review): positions 2 / 0 / 3 depend on the row order of r — confirm after re-running results().\n",
    "run_frames = data(r)  # load once instead of once per run\n",
    "\n",
    "# sort every run on query_file so that the rows can be paired by position below\n",
    "cot_0_shot = run_frames[2].sort_values([\"query_file\"])\n",
    "cf_0_shot = run_frames[0].sort_values([\"query_file\"])\n",
    "ao_0_shot = run_frames[3].sort_values([\"query_file\"])\n",
    "\n",
    "for cot, cf, ao in zip(cot_0_shot.iloc, cf_0_shot.iloc, ao_0_shot.iloc):\n",
    "    # only show samples that cf gets right while both cot and ao get them wrong\n",
    "    if cot.target != cot.prediction and cf.target == cf.prediction and ao.target != ao.prediction:\n",
    "        print(\"\\n\\n\\n====== new example ====== \\n\\n\")\n",
    "        print(\"ao\", ao.prediction, \"cot\", cot.prediction, \"cf\", cf.prediction)\n",
    "        print(\"\\nAO\")\n",
    "        print(ao.model_result)\n",
    "        print(ao[\"query\"])\n",
    "        print(\"\\nCoT\")\n",
    "        print(cot[\"model_result\"])\n",
    "        print(cot[\"query\"])\n",
    "        print(\"\\nCF-G\")\n",
    "        print(cf[\"model_result\"])\n",
    "        print(cf[\"query\"])\n",
    "        # both targets are printed so a mispaired row is visible at a glance\n",
    "        print(cf[\"target\"], cot[\"target\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# The two runs are compared element-wise below, so their rows must refer to the same samples.\n",
    "# Sorting on \"target\" does not guarantee that (targets need not be unique, so tied rows may be\n",
    "# ordered differently in each frame); sort on \"query_file\", the key used to pair runs in the\n",
    "# comparison cell above.\n",
    "d_argmax = data(r)[0].sort_values(\"query_file\")\n",
    "d_var = data(r)[-1].sort_values(\"query_file\")\n",
    "var_correct = np.array(d_var[\"prediction\"] == d_var[\"target\"].values)\n",
    "argmax_correct = np.array(d_argmax[\"prediction\"] == d_argmax[\"target\"].values)\n",
    "\n",
    "# accuracy of each run, and of an oracle that counts a sample as correct when either run is correct\n",
    "var_correct.mean(), argmax_correct.mean(), np.logical_or(var_correct, argmax_correct).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# True when both frames list the same target at every position\n",
    "all(a == b for a, b in zip(d_var[\"target\"], d_argmax[\"target\"]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Multi-Step Arithmetic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from results import *\n",
    "from functools import partial\n",
    "\n",
    "# NOTE: the GSM8 cell further down defines another value_normalizer; whichever cell ran last wins.\n",
    "def value_normalizer(v):\n",
    "    \"\"\"Return `v` as a string without leading/trailing dots, spaces or single quotes.\"\"\"\n",
    "    text = str(v)\n",
    "    return text.strip(\". '\")\n",
    "# more lenient accuracy\n",
    "def acc(v):\n",
    "    \"\"\"Score `v` with accuracy(), passing value_normalizer as its `t` argument.\"\"\"\n",
    "    return accuracy(v, t=value_normalizer)\n",
    "\n",
    "# runs whose task name contains both \"multistep_arithmetic\" and \"mini\", scored with the lenient acc()\n",
    "r = results(\n",
    "    additional=[acc, num_samples, evaluation_time, shots, errors],\n",
    "    task=lambda t: \"multistep_arithmetic\" in t and \"mini\" in t,\n",
    ")\n",
    "# disabled alternative views of r:\n",
    "# r[r.num_samples > 1][[\"model\", \"decoder\", \"num_samples\", \"accuracy\"]].groupby([\"model\", \"decoder\"]).agg({\"num_samples\": \"sum\", \"accuracy\": \"max\"})\n",
    "# r[[\"filename\", \"num_samples\", \"accuracy\"]].values\n",
    "named(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# show the \"query\" entry with key 6 from the entry at position 1 of data(r)\n",
    "# NOTE(review): Series[6] is label-based on an integer index — confirm the frame has a default RangeIndex\n",
    "print(data(r)[1][\"query\"][6])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare the runs at positions 0 and 1 of data(r) sample by sample. The trailing bool on each line is\n",
    "# whether prediction and target agree after value_normalizer; the running sample index follows.\n",
    "run_frames = data(r)  # load once instead of once per zip argument\n",
    "for i, (cfg, ins) in enumerate(zip(run_frames[0].iloc, run_frames[1].iloc)):\n",
    "    print(\"[cfg]\", cfg.model_result, \"pred\", cfg.prediction, \"target\", cfg.target, value_normalizer(cfg.prediction) == value_normalizer(cfg.target), end=\"\\n\\n\")\n",
    "    print(\"[ins]\", ins.model_result, \"pred\", ins.prediction, \"target\", ins.target, value_normalizer(ins.prediction) == value_normalizer(ins.target), end=\"\\n\\n\")\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# data() is given a projection here, so it yields (task_name, model_result) pairs\n",
    "# NOTE(review): presumably one pair per run, with name[0] being the first task_name entry — confirm in results.py\n",
    "for name, mrs in data(r, lambda run: (run.task_name, run.model_result)):\n",
    "    for mr in mrs:\n",
    "        print(name[0], mr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### AddSub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "# zero-shot runs whose task name contains \"AddSub\"; ordered by decoder (ascending), then timestamp (descending)\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, evaluation_time, shots, errors],\n",
    "    task=lambda t: \"AddSub\" in t,\n",
    "    shots=0,\n",
    ").sort_values(by=[\"decoder\", \"timestamp\"], ascending=[True, False])\n",
    "r\n",
    "# TODO: check CoT (NOTE(review): this TODO originally named date understanding and looks copied — confirm it applies here)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# list the wrongly predicted samples of run 48371f73: target/prediction, query, model result\n",
    "for s in data(results(hash=\"48371f73\")).iloc:\n",
    "    if s.target == s.prediction:\n",
    "        continue  # correct predictions are skipped\n",
    "    print(s.target, s.prediction)\n",
    "    print(s[\"query\"])\n",
    "    print(s.model_result, end=\"\\n\\n\")\n",
    "    print(\"\\n\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### AQuA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "# zero-shot davinci runs with size=\"small\" whose task name contains \"AQUA\";\n",
    "# ordered by decoder (ascending), then timestamp (descending)\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, evaluation_time, shots, errors],\n",
    "    size=\"small\",\n",
    "    task=lambda t: \"AQUA\" in t,\n",
    "    shots=0,\n",
    "    model=\"davinci\",\n",
    ").sort_values(by=[\"decoder\", \"timestamp\"], ascending=[True, False])\n",
    "r\n",
    "# TODO: check CoT (NOTE(review): this TODO originally named date understanding and looks copied — confirm it applies here)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# walk through run 6e012731: model result, prediction and ground truth for every sample\n",
    "for s in data(results(hash=\"6e012731\")).iloc:\n",
    "    print(s.model_result, \"\\nPREDICTION:\", s.prediction, \"\\n        GT:\", s.target)\n",
    "    # tag the samples the model got right\n",
    "    if s.prediction == s.target:\n",
    "        print(\"[CORRECT]\")\n",
    "    # 120-character rule between samples\n",
    "    print(\"\\n\" + 120 * \"=\", end=\"\\n\\n\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): per the trailing comment this removes files, and it runs on every Run All — confirm that is intended\n",
    "clean_up_dangeling_json() # remove json result files without corresponding .csv files"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Strategy QA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "data": {
      "text/plain": "                       task                    model   decoder  \\\n1    StrategyQA@mvar_medium  openai/text-davinci-003  beam_var   \n33   StrategyQA@mvar_medium  openai/text-davinci-003  beam_var   \n87   StrategyQA@mvar_medium  openai/text-davinci-003    argmax   \n89    StrategyQA@cot_medium  openai/text-davinci-003    argmax   \n108    StrategyQA@ao_medium  openai/text-davinci-003    argmax   \n\n                                                kwargs  num_samples  \\\n1    {'kwargs': {'max_length': 512, 'top1_distribut...           15   \n33   {'kwargs': {'max_length': 512, 'top1_distribut...           14   \n87   {'kwargs': {'max_length': 512, 'top1_distribut...          100   \n89   {'kwargs': {'max_length': 512, 'top1_distribut...          100   \n108  {'kwargs': {'max_length': 512, 'top1_distribut...          100   \n\n                                               cost            timestamp  \\\n1                        {'tokens': -1, 'cost': -1}  2023:01:23_00:00:35   \n33                       {'tokens': -1, 'cost': -1}  2023:01:23_08:31:00   \n87              {'tokens': 147202, 'cost': 2.94404}  2023:01:22_21:30:00   \n89               {'tokens': 95152, 'cost': 1.90304}  2023:01:22_20:30:44   \n108  {'tokens': 2663, 'cost': 0.053259999999999995}  2023:01:22_20:10:30   \n\n           result      hash  accuracy  evaluation_time  shots  errors  \n1    <TaskResult>  43cc9dde      0.00   8h 52m 11s ago      0       0  \n33   <TaskResult>  632e80f9      0.00      21m 46s ago      0       0  \n87   <TaskResult>  afe24d59      0.68  11h 22m 46s ago      0       0  \n89   <TaskResult>  3d761642      0.68   12h 22m 2s ago      0       0  \n108  <TaskResult>  96c416c9      0.70  12h 42m 16s ago      0       0  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>task</th>\n      <th>model</th>\n      <th>decoder</th>\n      <th>kwargs</th>\n      <th>num_samples</th>\n      <th>cost</th>\n      <th>timestamp</th>\n      <th>result</th>\n      <th>hash</th>\n      <th>accuracy</th>\n      <th>evaluation_time</th>\n      <th>shots</th>\n      <th>errors</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>1</th>\n      <td>StrategyQA@mvar_medium</td>\n      <td>openai/text-davinci-003</td>\n      <td>beam_var</td>\n      <td>{'kwargs': {'max_length': 512, 'top1_distribut...</td>\n      <td>15</td>\n      <td>{'tokens': -1, 'cost': -1}</td>\n      <td>2023:01:23_00:00:35</td>\n      <td>&lt;TaskResult&gt;</td>\n      <td>43cc9dde</td>\n      <td>0.00</td>\n      <td>8h 52m 11s ago</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>33</th>\n      <td>StrategyQA@mvar_medium</td>\n      <td>openai/text-davinci-003</td>\n      <td>beam_var</td>\n      <td>{'kwargs': {'max_length': 512, 'top1_distribut...</td>\n      <td>14</td>\n      <td>{'tokens': -1, 'cost': -1}</td>\n      <td>2023:01:23_08:31:00</td>\n      <td>&lt;TaskResult&gt;</td>\n      <td>632e80f9</td>\n      <td>0.00</td>\n      <td>21m 46s ago</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>87</th>\n      <td>StrategyQA@mvar_medium</td>\n      <td>openai/text-davinci-003</td>\n      <td>argmax</td>\n      <td>{'kwargs': {'max_length': 512, 'top1_distribut...</td>\n      <td>100</td>\n      <td>{'tokens': 147202, 'cost': 2.94404}</td>\n      <td>2023:01:22_21:30:00</td>\n      <td>&lt;TaskResult&gt;</td>\n      <td>afe24d59</td>\n    
  <td>0.68</td>\n      <td>11h 22m 46s ago</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>89</th>\n      <td>StrategyQA@cot_medium</td>\n      <td>openai/text-davinci-003</td>\n      <td>argmax</td>\n      <td>{'kwargs': {'max_length': 512, 'top1_distribut...</td>\n      <td>100</td>\n      <td>{'tokens': 95152, 'cost': 1.90304}</td>\n      <td>2023:01:22_20:30:44</td>\n      <td>&lt;TaskResult&gt;</td>\n      <td>3d761642</td>\n      <td>0.68</td>\n      <td>12h 22m 2s ago</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>108</th>\n      <td>StrategyQA@ao_medium</td>\n      <td>openai/text-davinci-003</td>\n      <td>argmax</td>\n      <td>{'kwargs': {'max_length': 512, 'top1_distribut...</td>\n      <td>100</td>\n      <td>{'tokens': 2663, 'cost': 0.053259999999999995}</td>\n      <td>2023:01:22_20:10:30</td>\n      <td>&lt;TaskResult&gt;</td>\n      <td>96c416c9</td>\n      <td>0.70</td>\n      <td>12h 42m 16s ago</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "# zero-shot runs whose task name contains \"StrategyQA\" (left in the order results() returns them)\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, evaluation_time, shots, errors],\n",
    "    task=lambda t: \"StrategyQA\" in t,\n",
    "    shots=0,\n",
    ")\n",
    "# disabled: r = r.sort_values([\"decoder\", \"timestamp\"], ascending=[True, False])\n",
    "r\n",
    "# TODO: check CoT (NOTE(review): this TODO originally named date understanding and looks copied — confirm it applies here)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'list' object has no attribute 'iloc'",
     "output_type": "error",
     "traceback": [
      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[0;31mAttributeError\u001B[0m                            Traceback (most recent call last)",
      "Cell \u001B[0;32mIn[2], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m s \u001B[38;5;129;01min\u001B[39;00m \u001B[43mdata\u001B[49m\u001B[43m(\u001B[49m\u001B[43mresults\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mhash\u001B[39;49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43m3ce45a44\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43miloc\u001B[49m:\n\u001B[1;32m      2\u001B[0m     \u001B[38;5;28mprint\u001B[39m(s\u001B[38;5;241m.\u001B[39mtarget, s\u001B[38;5;241m.\u001B[39mprediction)\n\u001B[1;32m      3\u001B[0m     \u001B[38;5;28;01mif\u001B[39;00m s\u001B[38;5;241m.\u001B[39mtarget \u001B[38;5;241m==\u001B[39m s\u001B[38;5;241m.\u001B[39mprediction:\n",
      "\u001B[0;31mAttributeError\u001B[0m: 'list' object has no attribute 'iloc'"
     ]
    }
   ],
   "source": [
    "# data() returned a list rather than a single DataFrame for this hash, so `.iloc` raised\n",
    "# AttributeError (see the saved traceback). Accept both shapes so the loop runs either way.\n",
    "# NOTE(review): presumably a list means the hash matched zero or several runs — confirm in results.py.\n",
    "loaded = data(results(hash=\"3ce45a44\"))\n",
    "frames = loaded if isinstance(loaded, list) else [loaded]\n",
    "for frame in frames:\n",
    "    for s in frame.iloc:\n",
    "        print(s.target, s.prediction)\n",
    "        # correct predictions additionally get their query and model result printed\n",
    "        if s.target == s.prediction:\n",
    "            print(s.target, s.prediction)\n",
    "            print(s[\"query\"])\n",
    "            print(s.model_result, end=\"\\n\\n\")\n",
    "            print(\"\\n\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# inspect what data() returns for this hash (the loop over the same hash failed because it was a list)\n",
    "data(results(hash=\"3ce45a44\"))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GSM8K"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "\n",
    "# NOTE: the multi-step arithmetic cell above defines another value_normalizer; whichever cell ran last wins.\n",
    "def value_normalizer(v):\n",
    "    \"\"\"Normalize a prediction or target for lenient comparison.\n",
    "\n",
    "    Converts `v` to a string, strips dots, spaces and single quotes from both ends and\n",
    "    removes a redundant decimal tail, so that \"5.0\" and \"5\" compare equal.\n",
    "    \"\"\"\n",
    "    v = str(v).strip(\". '\")\n",
    "    # rstrip(\".0\") strips ANY trailing '.' or '0' characters, which turned \"100\" into \"1\"\n",
    "    # and \"0\" into \"\"; only trim zeros that sit behind a decimal point.\n",
    "    if \".\" in v:\n",
    "        v = v.rstrip(\"0\").rstrip(\".\")\n",
    "    return v\n",
    "# more lenient accuracy\n",
    "def acc(v):\n",
    "    \"\"\"Score `v` with accuracy(), passing value_normalizer as its `t` argument.\"\"\"\n",
    "    return accuracy(v, t=value_normalizer)\n",
    "\n",
    "# zero-shot runs whose task name contains \"GSM8\", scored with the lenient acc()\n",
    "r = results(\n",
    "    additional=[acc, num_samples, evaluation_time, shots, errors],\n",
    "    task=lambda t: \"GSM8\" in t,\n",
    "    shots=0,\n",
    ")\n",
    "# disabled: r = r.sort_values([\"decoder\", \"timestamp\"], ascending=[True, False])\n",
    "r\n",
    "# TODO: check CoT (NOTE(review): this TODO originally named date understanding and looks copied — confirm it applies here)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# per sample of run e353f066: last 10 characters of the model result (wrapped in a list, so its repr is\n",
    "# printed), prediction, target, and whether the two agree after value_normalizer\n",
    "for s in data(results(hash=\"e353f066\")).iloc:\n",
    "    tail = [s.model_result[-10:]]\n",
    "    lenient_match = value_normalizer(s.prediction) == value_normalizer(s.target)\n",
    "    print(tail, s.prediction, s.target, lenient_match)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "vscode": {
   "interpreter": {
    "hash": "1a3f742538928d7fe17d54779274ecff8afc8007fdeca0464d7b2ce3865992ba"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
