{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "ffa57366-a1c5-4fea-8d56-ae40c0a2c3f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show full column contents: the kwargs/cost columns hold long dict reprs\n",
    "# that pandas would otherwise truncate with an ellipsis.\n",
    "# pandas is imported here so this first cell also runs on a fresh kernel.\n",
    "import pandas as pd\n",
    "\n",
    "pd.set_option('display.max_colwidth', None)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c3d9f783-ee48-4645-8dec-fd1eef521281",
   "metadata": {},
   "source": [
    "### AQuA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1061,
   "id": "41c8a640-9b26-4d8f-8c36-886ffd9e7b2a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>model</th>\n",
       "      <th>decoder</th>\n",
       "      <th>kwargs</th>\n",
       "      <th>num_samples</th>\n",
       "      <th>cost</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>evaluation_time</th>\n",
       "      <th>shots</th>\n",
       "      <th>errors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>353</th>\n",
       "      <td>AQUA@ao_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-aqua.AquaCotAo'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 25009, 'cost': 0.050018}</td>\n",
       "      <td>2023:03:16_18:49:51</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>d2f1b245</td>\n",
       "      <td>0.23</td>\n",
       "      <td>16h 40m 32s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>375</th>\n",
       "      <td>AQUA@ao_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-aqua.AquaCotAo'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 25009, 'cost': 0.5001800000000001}</td>\n",
       "      <td>2023:03:15_15:59:52</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>56c0d98b</td>\n",
       "      <td>0.29</td>\n",
       "      <td>1d 19h 30m 31s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>261</th>\n",
       "      <td>AQUA@cot_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-aqua.AquaCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 209850, 'cost': 0.4197}</td>\n",
       "      <td>2023:03:15_18:17:04</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>f7609d38</td>\n",
       "      <td>0.17</td>\n",
       "      <td>1d 17h 13m 19s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>251</th>\n",
       "      <td>AQUA@cot_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-aqua.AquaCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 156645, 'cost': 0.31329}</td>\n",
       "      <td>2023:03:15_18:08:54</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>bd905576</td>\n",
       "      <td>0.16</td>\n",
       "      <td>1d 17h 21m 29s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>AQUA@cot_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-aqua.AquaCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 212132, 'cost': 4.24264}</td>\n",
       "      <td>2023:03:15_15:41:11</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>ff5c9699</td>\n",
       "      <td>0.45</td>\n",
       "      <td>1d 19h 49m 12s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>282</th>\n",
       "      <td>AQUA@dash_multivar_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-aqua.AquaDashMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 722934, 'cost': 1.445868}</td>\n",
       "      <td>2023:03:15_18:02:09</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>0e4af722</td>\n",
       "      <td>0.23</td>\n",
       "      <td>1d 17h 28m 14s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>AQUA@dash_multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-aqua.AquaDashMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 730634, 'cost': 14.612680000000001}</td>\n",
       "      <td>2023:03:15_15:42:50</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>648c0fc3</td>\n",
       "      <td>0.46</td>\n",
       "      <td>1d 19h 47m 33s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>384</th>\n",
       "      <td>AQUA@dash_multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-aqua.AquaDashMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 2776375, 'cost': 55.5275}</td>\n",
       "      <td>2023:03:15_16:06:32</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>de08b397</td>\n",
       "      <td>0.44</td>\n",
       "      <td>1d 19h 23m 51s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>385</th>\n",
       "      <td>AQUA@multivar_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>beam_var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-aqua.AquaMultiVarCot'&gt;}</td>\n",
       "      <td>1</td>\n",
       "      <td>{'tokens': -1, 'cost': -1}</td>\n",
       "      <td>2023:03:16_18:51:06</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>14a9f823</td>\n",
       "      <td>0.00</td>\n",
       "      <td>16h 39m 17s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                          task                    model   decoder  \\\n",
       "353             AQUA@ao_medium    openai/text-curie-001    argmax   \n",
       "375             AQUA@ao_medium  openai/text-davinci-003    argmax   \n",
       "261            AQUA@cot_medium    openai/text-curie-001    argmax   \n",
       "251            AQUA@cot_medium    openai/text-curie-001    argmax   \n",
       "12             AQUA@cot_medium  openai/text-davinci-003    argmax   \n",
       "282  AQUA@dash_multivar_medium    openai/text-curie-001    argmax   \n",
       "52   AQUA@dash_multivar_medium  openai/text-davinci-003    argmax   \n",
       "384  AQUA@dash_multivar_medium  openai/text-davinci-003       var   \n",
       "385       AQUA@multivar_medium    openai/text-curie-001  beam_var   \n",
       "\n",
       "                                                                                                                                                                               kwargs  \\\n",
       "353              {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-aqua.AquaCotAo'>}   \n",
       "375               {'kwargs': {'max_length': 512, 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-aqua.AquaCotAo'>}   \n",
       "261              {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-aqua.AquaCot'>}   \n",
       "251               {'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-aqua.AquaCot'>}   \n",
       "12               {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-aqua.AquaCot'>}   \n",
       "282  {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-aqua.AquaDashMultiVarCot'>}   \n",
       "52   {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-aqua.AquaDashMultiVarCot'>}   \n",
       "384  {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-aqua.AquaDashMultiVarCot'>}   \n",
       "385        {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-aqua.AquaMultiVarCot'>}   \n",
       "\n",
       "     num_samples                                            cost  \\\n",
       "353          100             {'tokens': 25009, 'cost': 0.050018}   \n",
       "375          100   {'tokens': 25009, 'cost': 0.5001800000000001}   \n",
       "261          100              {'tokens': 209850, 'cost': 0.4197}   \n",
       "251          100             {'tokens': 156645, 'cost': 0.31329}   \n",
       "12           100             {'tokens': 212132, 'cost': 4.24264}   \n",
       "282          100            {'tokens': 722934, 'cost': 1.445868}   \n",
       "52           100  {'tokens': 730634, 'cost': 14.612680000000001}   \n",
       "384          100            {'tokens': 2776375, 'cost': 55.5275}   \n",
       "385            1                      {'tokens': -1, 'cost': -1}   \n",
       "\n",
       "               timestamp        result      hash  accuracy  \\\n",
       "353  2023:03:16_18:49:51  <TaskResult>  d2f1b245      0.23   \n",
       "375  2023:03:15_15:59:52  <TaskResult>  56c0d98b      0.29   \n",
       "261  2023:03:15_18:17:04  <TaskResult>  f7609d38      0.17   \n",
       "251  2023:03:15_18:08:54  <TaskResult>  bd905576      0.16   \n",
       "12   2023:03:15_15:41:11  <TaskResult>  ff5c9699      0.45   \n",
       "282  2023:03:15_18:02:09  <TaskResult>  0e4af722      0.23   \n",
       "52   2023:03:15_15:42:50  <TaskResult>  648c0fc3      0.46   \n",
       "384  2023:03:15_16:06:32  <TaskResult>  de08b397      0.44   \n",
       "385  2023:03:16_18:51:06  <TaskResult>  14a9f823      0.00   \n",
       "\n",
       "        evaluation_time  shots  errors  \n",
       "353     16h 40m 32s ago      2       0  \n",
       "375  1d 19h 30m 31s ago      2       0  \n",
       "261  1d 17h 13m 19s ago      2       5  \n",
       "251  1d 17h 21m 29s ago      2      33  \n",
       "12   1d 19h 49m 12s ago      2       0  \n",
       "282  1d 17h 28m 14s ago      2       0  \n",
       "52   1d 19h 47m 33s ago      2       1  \n",
       "384  1d 19h 23m 51s ago      2       0  \n",
       "385     16h 39m 17s ago      2       1  "
      ]
     },
     "execution_count": 1061,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# %reload_ext loads autoreload on a fresh kernel and reloads it quietly on re-runs,\n",
    "# avoiding the \"already loaded\" notice that %load_ext prints every time.\n",
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "\n",
    "# Select runs whose task name contains both \"AQUA\" and \"medium\", restricted to shots=2,\n",
    "# and attach the extra accuracy / num_samples / evaluation_time / shots / errors columns.\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, evaluation_time, shots, errors],\n",
    "    task=lambda t: \"AQUA\" in t and \"medium\" in t,\n",
    "    shots=2,\n",
    ")\n",
    "# Order by task, then decoder; within each group the latest timestamp comes first.\n",
    "r = r.sort_values([\"task\", \"decoder\", \"timestamp\"], ascending=[True, True, False])\n",
    "r\n",
    "# TODO: make check CoT for date understanding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "370928b4-2ccc-4fbb-8e4e-31a4ce910127",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the `query` value of the first row returned for run hash ff5c9699\n",
    "# (the AQUA@cot_medium / text-davinci-003 / argmax row in the 2-shot table above).\n",
    "# NOTE(review): presumably this is the full prompt sent to the model - confirm against results.data.\n",
    "print(data(\"ff5c9699\")[\"query\"].values[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1060,
   "id": "2bebd092-3da0-49ea-925c-16fdabaf7c56",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>model</th>\n",
       "      <th>decoder</th>\n",
       "      <th>kwargs</th>\n",
       "      <th>num_samples</th>\n",
       "      <th>cost</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>evaluation_time</th>\n",
       "      <th>shots</th>\n",
       "      <th>errors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>173</th>\n",
       "      <td>AQUA@ao_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True},'size': medium,'suite': &lt;class 'eval-aqua.AquaCotAo'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 63864, 'cost': 1.27728}</td>\n",
       "      <td>2023:01:26_00:46:49</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>64968482</td>\n",
       "      <td>0.31</td>\n",
       "      <td>7w 1d 10h 43m 12s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>AQUA@cot_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': 1024, 'top1_distribution': True},'size': medium,'suite': &lt;class 'eval-aqua.AquaCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 83764, 'cost': 1.6752799999999999}</td>\n",
       "      <td>2023:01:25_23:07:00</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>bfbb09ea</td>\n",
       "      <td>0.37</td>\n",
       "      <td>7w 1d 12h 23m 1s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>AQUA@cot_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>beam_var</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True, 'num_beams': '2', 'prune': '1.05'},'size': medium,'suite': &lt;class 'eval-aqua.AquaCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 3368786, 'cost': 67.37572}</td>\n",
       "      <td>2023:01:26_01:28:31</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>aaa39758</td>\n",
       "      <td>0.35</td>\n",
       "      <td>7w 1d 10h 1m 30s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>134</th>\n",
       "      <td>AQUA@cot_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>bsseq</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True, 'b': '2', 'openai_chunksize': '128'},'size': medium,'suite': &lt;class 'eval-aqua.AquaCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 1206850, 'cost': 24.137}</td>\n",
       "      <td>2023:01:26_13:34:09</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>efc16fb7</td>\n",
       "      <td>0.37</td>\n",
       "      <td>7w 21h 55m 52s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>AQUA@cot_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True},'size': medium,'suite': &lt;class 'eval-aqua.AquaCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 228986, 'cost': 4.57972}</td>\n",
       "      <td>2023:01:26_01:19:59</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>dc5bd7ea</td>\n",
       "      <td>0.35</td>\n",
       "      <td>7w 1d 10h 10m 2s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>323</th>\n",
       "      <td>AQUA@dash_multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True},'size': medium,'suite': &lt;class 'eval-aqua.AquaDashMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 354821, 'cost': 7.096420000000001}</td>\n",
       "      <td>2023:01:26_00:45:43</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>e32dd7dd</td>\n",
       "      <td>0.40</td>\n",
       "      <td>7w 1d 10h 44m 18s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>AQUA@dash_multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True},'size': medium,'suite': &lt;class 'eval-aqua.AquaDashMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 320857, 'cost': 6.417140000000001}</td>\n",
       "      <td>2023:01:26_00:38:35</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>293f5e8b</td>\n",
       "      <td>0.31</td>\n",
       "      <td>7w 1d 10h 51m 26s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>160</th>\n",
       "      <td>AQUA@dash_multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True},'size': medium,'suite': &lt;class 'eval-aqua.AquaDashMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 363218, 'cost': 7.264360000000001}</td>\n",
       "      <td>2023:01:26_00:33:03</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>eb3e0eaa</td>\n",
       "      <td>0.31</td>\n",
       "      <td>7w 1d 10h 56m 58s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>432</th>\n",
       "      <td>AQUA@dash_multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True},'size': medium,'suite': &lt;class 'eval-aqua.AquaDashMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 228746, 'cost': 4.5749200000000005}</td>\n",
       "      <td>2023:01:25_23:24:55</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>9fe382e4</td>\n",
       "      <td>0.28</td>\n",
       "      <td>7w 1d 12h 5m 6s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>308</th>\n",
       "      <td>AQUA@dash_multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': 1024, 'top1_distribution': True},'size': medium,'suite': &lt;class 'eval-aqua.AquaDashMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 180878, 'cost': 3.6175599999999997}</td>\n",
       "      <td>2023:01:25_23:09:59</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>055cc408</td>\n",
       "      <td>0.28</td>\n",
       "      <td>7w 1d 12h 20m 2s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>226</th>\n",
       "      <td>AQUA@dash_multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>beam_var</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True, 'num_beams': '2', 'prune': '1.05'},'size': medium,'suite': &lt;class 'eval-aqua.AquaDashMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 1861174, 'cost': 37.22348}</td>\n",
       "      <td>2023:01:26_00:53:14</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>29e97f97</td>\n",
       "      <td>0.47</td>\n",
       "      <td>7w 1d 10h 36m 47s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>424</th>\n",
       "      <td>AQUA@dash_multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>beam_var</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True, 'num_beams': '2', 'prune': '1.05'},'size': medium,'suite': &lt;class 'eval-aqua.AquaDashMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 1825974, 'cost': 36.51948}</td>\n",
       "      <td>2023:01:26_00:38:14</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>f72b78bd</td>\n",
       "      <td>0.40</td>\n",
       "      <td>7w 1d 10h 51m 47s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>AQUA@dash_multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>bsseq</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True, 'b': '2', 'openai_chunksize': '128'},'size': medium,'suite': &lt;class 'eval-aqua.AquaDashMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 1898355, 'cost': 37.9671}</td>\n",
       "      <td>2023:01:26_18:28:01</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>07c78cdc</td>\n",
       "      <td>0.37</td>\n",
       "      <td>7w 17h 2m  ago</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>377</th>\n",
       "      <td>AQUA@dash_multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True},'size': medium,'suite': &lt;class 'eval-aqua.AquaDashMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 1639088, 'cost': 32.78176}</td>\n",
       "      <td>2023:01:26_00:53:36</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>d19382f3</td>\n",
       "      <td>0.35</td>\n",
       "      <td>7w 1d 10h 36m 25s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                          task                    model   decoder  \\\n",
       "173             AQUA@ao_medium  openai/text-davinci-003    argmax   \n",
       "28             AQUA@cot_medium  openai/text-davinci-003    argmax   \n",
       "51             AQUA@cot_medium  openai/text-davinci-003  beam_var   \n",
       "134            AQUA@cot_medium  openai/text-davinci-003     bsseq   \n",
       "15             AQUA@cot_medium  openai/text-davinci-003       var   \n",
       "323  AQUA@dash_multivar_medium  openai/text-davinci-003    argmax   \n",
       "36   AQUA@dash_multivar_medium  openai/text-davinci-003    argmax   \n",
       "160  AQUA@dash_multivar_medium  openai/text-davinci-003    argmax   \n",
       "432  AQUA@dash_multivar_medium  openai/text-davinci-003    argmax   \n",
       "308  AQUA@dash_multivar_medium  openai/text-davinci-003    argmax   \n",
       "226  AQUA@dash_multivar_medium  openai/text-davinci-003  beam_var   \n",
       "424  AQUA@dash_multivar_medium  openai/text-davinci-003  beam_var   \n",
       "18   AQUA@dash_multivar_medium  openai/text-davinci-003     bsseq   \n",
       "377  AQUA@dash_multivar_medium  openai/text-davinci-003       var   \n",
       "\n",
       "                                                                                                                                                              kwargs  \\\n",
       "173                                                 {'kwargs': {'max_length': 512, 'top1_distribution': True},'size': medium,'suite': <class 'eval-aqua.AquaCotAo'>}   \n",
       "28                                                   {'kwargs': {'max_length': 1024, 'top1_distribution': True},'size': medium,'suite': <class 'eval-aqua.AquaCot'>}   \n",
       "51                 {'kwargs': {'max_length': 512, 'top1_distribution': True, 'num_beams': '2', 'prune': '1.05'},'size': medium,'suite': <class 'eval-aqua.AquaCot'>}   \n",
       "134              {'kwargs': {'max_length': 512, 'top1_distribution': True, 'b': '2', 'openai_chunksize': '128'},'size': medium,'suite': <class 'eval-aqua.AquaCot'>}   \n",
       "15                                                    {'kwargs': {'max_length': 512, 'top1_distribution': True},'size': medium,'suite': <class 'eval-aqua.AquaCot'>}   \n",
       "323                                       {'kwargs': {'max_length': 512, 'top1_distribution': True},'size': medium,'suite': <class 'eval-aqua.AquaDashMultiVarCot'>}   \n",
       "36                                        {'kwargs': {'max_length': 512, 'top1_distribution': True},'size': medium,'suite': <class 'eval-aqua.AquaDashMultiVarCot'>}   \n",
       "160                                       {'kwargs': {'max_length': 512, 'top1_distribution': True},'size': medium,'suite': <class 'eval-aqua.AquaDashMultiVarCot'>}   \n",
       "432                                       {'kwargs': {'max_length': 512, 'top1_distribution': True},'size': medium,'suite': <class 'eval-aqua.AquaDashMultiVarCot'>}   \n",
       "308                                      {'kwargs': {'max_length': 1024, 'top1_distribution': True},'size': medium,'suite': <class 'eval-aqua.AquaDashMultiVarCot'>}   \n",
       "226    {'kwargs': {'max_length': 512, 'top1_distribution': True, 'num_beams': '2', 'prune': '1.05'},'size': medium,'suite': <class 'eval-aqua.AquaDashMultiVarCot'>}   \n",
       "424    {'kwargs': {'max_length': 512, 'top1_distribution': True, 'num_beams': '2', 'prune': '1.05'},'size': medium,'suite': <class 'eval-aqua.AquaDashMultiVarCot'>}   \n",
       "18   {'kwargs': {'max_length': 512, 'top1_distribution': True, 'b': '2', 'openai_chunksize': '128'},'size': medium,'suite': <class 'eval-aqua.AquaDashMultiVarCot'>}   \n",
       "377                                       {'kwargs': {'max_length': 512, 'top1_distribution': True},'size': medium,'suite': <class 'eval-aqua.AquaDashMultiVarCot'>}   \n",
       "\n",
       "     num_samples                                            cost  \\\n",
       "173          100              {'tokens': 63864, 'cost': 1.27728}   \n",
       "28           100   {'tokens': 83764, 'cost': 1.6752799999999999}   \n",
       "51           100           {'tokens': 3368786, 'cost': 67.37572}   \n",
       "134          100             {'tokens': 1206850, 'cost': 24.137}   \n",
       "15           100             {'tokens': 228986, 'cost': 4.57972}   \n",
       "323          100   {'tokens': 354821, 'cost': 7.096420000000001}   \n",
       "36           100   {'tokens': 320857, 'cost': 6.417140000000001}   \n",
       "160          100   {'tokens': 363218, 'cost': 7.264360000000001}   \n",
       "432          100  {'tokens': 228746, 'cost': 4.5749200000000005}   \n",
       "308          100  {'tokens': 180878, 'cost': 3.6175599999999997}   \n",
       "226          100           {'tokens': 1861174, 'cost': 37.22348}   \n",
       "424          100           {'tokens': 1825974, 'cost': 36.51948}   \n",
       "18           100            {'tokens': 1898355, 'cost': 37.9671}   \n",
       "377          100           {'tokens': 1639088, 'cost': 32.78176}   \n",
       "\n",
       "               timestamp        result      hash  accuracy  \\\n",
       "173  2023:01:26_00:46:49  <TaskResult>  64968482      0.31   \n",
       "28   2023:01:25_23:07:00  <TaskResult>  bfbb09ea      0.37   \n",
       "51   2023:01:26_01:28:31  <TaskResult>  aaa39758      0.35   \n",
       "134  2023:01:26_13:34:09  <TaskResult>  efc16fb7      0.37   \n",
       "15   2023:01:26_01:19:59  <TaskResult>  dc5bd7ea      0.35   \n",
       "323  2023:01:26_00:45:43  <TaskResult>  e32dd7dd      0.40   \n",
       "36   2023:01:26_00:38:35  <TaskResult>  293f5e8b      0.31   \n",
       "160  2023:01:26_00:33:03  <TaskResult>  eb3e0eaa      0.31   \n",
       "432  2023:01:25_23:24:55  <TaskResult>  9fe382e4      0.28   \n",
       "308  2023:01:25_23:09:59  <TaskResult>  055cc408      0.28   \n",
       "226  2023:01:26_00:53:14  <TaskResult>  29e97f97      0.47   \n",
       "424  2023:01:26_00:38:14  <TaskResult>  f72b78bd      0.40   \n",
       "18   2023:01:26_18:28:01  <TaskResult>  07c78cdc      0.37   \n",
       "377  2023:01:26_00:53:36  <TaskResult>  d19382f3      0.35   \n",
       "\n",
       "           evaluation_time  shots  errors  \n",
       "173  7w 1d 10h 43m 12s ago      0       2  \n",
       "28    7w 1d 12h 23m 1s ago      0       3  \n",
       "51    7w 1d 10h 1m 30s ago      0       1  \n",
       "134     7w 21h 55m 52s ago      0       7  \n",
       "15    7w 1d 10h 10m 2s ago      0       1  \n",
       "323  7w 1d 10h 44m 18s ago      0       1  \n",
       "36   7w 1d 10h 51m 26s ago      0       1  \n",
       "160  7w 1d 10h 56m 58s ago      0       1  \n",
       "432    7w 1d 12h 5m 6s ago      0       0  \n",
       "308   7w 1d 12h 20m 2s ago      0       0  \n",
       "226  7w 1d 10h 36m 47s ago      0       0  \n",
       "424  7w 1d 10h 51m 47s ago      0       0  \n",
       "18          7w 17h 2m  ago      0      10  \n",
       "377  7w 1d 10h 36m 25s ago      0      11  "
      ]
     },
     "execution_count": 1060,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "r = results(additional=[accuracy,num_samples,evaluation_time,shots,errors], task=lambda t: \"AQUA\" in t and \"medium\" in t, shots=0, model=\"davinci\")\n",
    "r = r.sort_values([\"task\", \"decoder\", \"timestamp\"], ascending=[True, True, False])\n",
    "r\n",
    "# TODO: check CoT for date understanding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "608fa0b3-2447-425f-92bc-f80b88a1fe05",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Q: A starts a business with Rs.40,000. After 2 months, B joined him with Rs.60,000. C joined them after some more time with Rs.120,000. At the end of the year, out of a total profit of Rs.375,000, C gets Rs.150,000 as his share. How many months after B joined the business, did C join?\n",
      "Answer Choices: (A) 2 months (B) 4 months (C) 23 months (D) 24 months (E) 84 months\n",
      "A: Among A through E, the answer is D: 24 months.\n"
     ]
    }
   ],
   "source": [
    "print(data(\"64968482\")[\"model_result\"].values[20])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7f1e33eb-c70f-449b-8580-4ecbb0f6850b",
   "metadata": {},
   "source": [
    "#### Curie"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1118,
   "id": "02298414-9b05-4703-9184-a40afe12f892",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>model</th>\n",
       "      <th>decoder</th>\n",
       "      <th>kwargs</th>\n",
       "      <th>num_samples</th>\n",
       "      <th>cost</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>evaluation_time</th>\n",
       "      <th>shots</th>\n",
       "      <th>errors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>176</th>\n",
       "      <td>AQUA@ao_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-aqua.AquaCotAo'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 9370, 'cost': 0.01874}</td>\n",
       "      <td>2023:03:16_18:54:24</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>b99bc1ff</td>\n",
       "      <td>0.16</td>\n",
       "      <td>17h 32m 36s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>212</th>\n",
       "      <td>AQUA@cot_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-aqua.AquaCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 65800, 'cost': 0.1316}</td>\n",
       "      <td>2023:03:15_17:40:40</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>e6642dd8</td>\n",
       "      <td>0.24</td>\n",
       "      <td>1d 18h 46m 20s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>229</th>\n",
       "      <td>AQUA@dash_multivar_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-aqua.AquaDashMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 402103, 'cost': 0.8042060000000001}</td>\n",
       "      <td>2023:03:15_17:58:32</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>27541b43</td>\n",
       "      <td>0.27</td>\n",
       "      <td>1d 18h 28m 28s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>283</th>\n",
       "      <td>AQUA@dash_multivar_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-aqua.AquaDashMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 214052, 'cost': 0.428104}</td>\n",
       "      <td>2023:03:15_17:52:26</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>2e088358</td>\n",
       "      <td>0.21</td>\n",
       "      <td>1d 18h 34m 34s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>206</th>\n",
       "      <td>AQUA@multivar_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-aqua.AquaMultiVarCot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 629590, 'cost': 1.2591800000000002}</td>\n",
       "      <td>2023:03:17_11:18:06</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>60cc1e52</td>\n",
       "      <td>0.17</td>\n",
       "      <td>1h 8m 54s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                          task                  model decoder  \\\n",
       "176             AQUA@ao_medium  openai/text-curie-001  argmax   \n",
       "212            AQUA@cot_medium  openai/text-curie-001  argmax   \n",
       "229  AQUA@dash_multivar_medium  openai/text-curie-001  argmax   \n",
       "283  AQUA@dash_multivar_medium  openai/text-curie-001  argmax   \n",
       "206       AQUA@multivar_medium  openai/text-curie-001     var   \n",
       "\n",
       "                                                                                                                                                                               kwargs  \\\n",
       "176              {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-aqua.AquaCotAo'>}   \n",
       "212               {'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-aqua.AquaCot'>}   \n",
       "229  {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-aqua.AquaDashMultiVarCot'>}   \n",
       "283  {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-aqua.AquaDashMultiVarCot'>}   \n",
       "206        {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-aqua.AquaMultiVarCot'>}   \n",
       "\n",
       "     num_samples                                            cost  \\\n",
       "176          100               {'tokens': 9370, 'cost': 0.01874}   \n",
       "212          100               {'tokens': 65800, 'cost': 0.1316}   \n",
       "229          100  {'tokens': 402103, 'cost': 0.8042060000000001}   \n",
       "283          100            {'tokens': 214052, 'cost': 0.428104}   \n",
       "206          100  {'tokens': 629590, 'cost': 1.2591800000000002}   \n",
       "\n",
       "               timestamp        result      hash  accuracy  \\\n",
       "176  2023:03:16_18:54:24  <TaskResult>  b99bc1ff      0.16   \n",
       "212  2023:03:15_17:40:40  <TaskResult>  e6642dd8      0.24   \n",
       "229  2023:03:15_17:58:32  <TaskResult>  27541b43      0.27   \n",
       "283  2023:03:15_17:52:26  <TaskResult>  2e088358      0.21   \n",
       "206  2023:03:17_11:18:06  <TaskResult>  60cc1e52      0.17   \n",
       "\n",
       "        evaluation_time  shots  errors  \n",
       "176     17h 32m 36s ago      0       0  \n",
       "212  1d 18h 46m 20s ago      0       2  \n",
       "229  1d 18h 28m 28s ago      0       1  \n",
       "283  1d 18h 34m 34s ago      0       1  \n",
       "206       1h 8m 54s ago      0       1  "
      ]
     },
     "execution_count": 1118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "r = results(additional=[accuracy,num_samples,evaluation_time,shots,errors], task=lambda t: \"AQUA\" in t and \"medium\" in t, model=\"curie\", shots=0)\n",
    "r = r.sort_values([\"task\", \"decoder\", \"timestamp\"], ascending=[True, True, False])\n",
    "r\n",
    "# TODO: check CoT for date understanding\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b4765f34-ffdf-4032-8e81-921bb15fb395",
   "metadata": {},
   "source": [
    "### StrategyQA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1119,
   "id": "942afb10-1422-447f-9581-952c66d71c80",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>model</th>\n",
       "      <th>decoder</th>\n",
       "      <th>kwargs</th>\n",
       "      <th>num_samples</th>\n",
       "      <th>cost</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>evaluation_time</th>\n",
       "      <th>shots</th>\n",
       "      <th>errors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>66</th>\n",
       "      <td>sqauni@ao_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAao'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 8265, 'cost': 0.1653}</td>\n",
       "      <td>2023:03:15_18:42:02</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>cf223ec2</td>\n",
       "      <td>0.67</td>\n",
       "      <td>1d 17h 45m 7s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>267</th>\n",
       "      <td>sqauni@cot_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAcot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 149912, 'cost': 2.99824}</td>\n",
       "      <td>2023:03:15_18:39:23</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>30f3d66e</td>\n",
       "      <td>0.74</td>\n",
       "      <td>1d 17h 47m 46s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>410</th>\n",
       "      <td>sqauni@multivar2_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAmvar2'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 277112, 'cost': 5.5422400000000005}</td>\n",
       "      <td>2023:03:15_18:40:38</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>c29b5c82</td>\n",
       "      <td>0.78</td>\n",
       "      <td>1d 17h 46m 31s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>382</th>\n",
       "      <td>sqauni@multivar2_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>beam_var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAmvar2'&gt;}</td>\n",
       "      <td>1</td>\n",
       "      <td>{'tokens': -1, 'cost': -1}</td>\n",
       "      <td>2023:03:15_18:54:01</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>78120b4f</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1d 17h 33m 8s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>127</th>\n",
       "      <td>sqauni@multivar2_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAmvar2'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 983177, 'cost': 19.66354}</td>\n",
       "      <td>2023:03:15_18:46:47</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>22a1913c</td>\n",
       "      <td>0.78</td>\n",
       "      <td>1d 17h 40m 22s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        task                    model   decoder  \\\n",
       "66          sqauni@ao_medium  openai/text-davinci-003    argmax   \n",
       "267        sqauni@cot_medium  openai/text-davinci-003    argmax   \n",
       "410  sqauni@multivar2_medium  openai/text-davinci-003    argmax   \n",
       "382  sqauni@multivar2_medium  openai/text-davinci-003  beam_var   \n",
       "127  sqauni@multivar2_medium  openai/text-davinci-003       var   \n",
       "\n",
       "                                                                                                                                                                                         kwargs  \\\n",
       "66      {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-strategyqa-uniform.StrategyQAao'>}   \n",
       "267    {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-strategyqa-uniform.StrategyQAcot'>}   \n",
       "410  {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-strategyqa-uniform.StrategyQAmvar2'>}   \n",
       "382  {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-strategyqa-uniform.StrategyQAmvar2'>}   \n",
       "127  {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-strategyqa-uniform.StrategyQAmvar2'>}   \n",
       "\n",
       "     num_samples                                            cost  \\\n",
       "66           100                {'tokens': 8265, 'cost': 0.1653}   \n",
       "267          100             {'tokens': 149912, 'cost': 2.99824}   \n",
       "410          100  {'tokens': 277112, 'cost': 5.5422400000000005}   \n",
       "382            1                      {'tokens': -1, 'cost': -1}   \n",
       "127          100            {'tokens': 983177, 'cost': 19.66354}   \n",
       "\n",
       "               timestamp        result      hash  accuracy  \\\n",
       "66   2023:03:15_18:42:02  <TaskResult>  cf223ec2      0.67   \n",
       "267  2023:03:15_18:39:23  <TaskResult>  30f3d66e      0.74   \n",
       "410  2023:03:15_18:40:38  <TaskResult>  c29b5c82      0.78   \n",
       "382  2023:03:15_18:54:01  <TaskResult>  78120b4f      1.00   \n",
       "127  2023:03:15_18:46:47  <TaskResult>  22a1913c      0.78   \n",
       "\n",
       "        evaluation_time  shots  errors  \n",
       "66    1d 17h 45m 7s ago      2       0  \n",
       "267  1d 17h 47m 46s ago      2       0  \n",
       "410  1d 17h 46m 31s ago      2       0  \n",
       "382   1d 17h 33m 8s ago      2       0  \n",
       "127  1d 17h 40m 22s ago      2       0  "
      ]
     },
     "execution_count": 1119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "r = results(additional=[accuracy,num_samples,evaluation_time,shots,errors], task=lambda t: \"sqauni\" in t and \"medium\" in t, model=\"davinci\", shots=2)\n",
    "r = r.sort_values([\"task\", \"decoder\", \"timestamp\"], ascending=[True, True, False])\n",
    "r\n",
    "# TODO: check CoT for date understanding\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "73cfd4e1-1f57-4d1f-9d2d-7b09bcf0fc98",
   "metadata": {},
   "source": [
    "#### Curie"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1064,
   "id": "dbe0748f-bf16-423a-b2b9-93db828ed97d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>model</th>\n",
       "      <th>decoder</th>\n",
       "      <th>kwargs</th>\n",
       "      <th>num_samples</th>\n",
       "      <th>cost</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>evaluation_time</th>\n",
       "      <th>shots</th>\n",
       "      <th>errors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>179</th>\n",
       "      <td>sqauni@ao_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '512', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAao'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 2765, 'cost': 0.00553}</td>\n",
       "      <td>2023:03:16_18:24:14</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>d20d051e</td>\n",
       "      <td>0.46</td>\n",
       "      <td>17h 42m 32s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>sqauni@cot_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '512', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAcot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 66718, 'cost': 0.133436}</td>\n",
       "      <td>2023:03:16_18:25:00</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>6a8b9a06</td>\n",
       "      <td>0.53</td>\n",
       "      <td>17h 41m 46s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>270</th>\n",
       "      <td>sqauni@multivar2_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '512', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAmvar2'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 137482, 'cost': 0.274964}</td>\n",
       "      <td>2023:03:16_18:27:28</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>d5ea2c80</td>\n",
       "      <td>0.58</td>\n",
       "      <td>17h 39m 18s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>247</th>\n",
       "      <td>sqauni@multivar2_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>beam_var</td>\n",
       "      <td>{'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '512', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAmvar2'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 2578444, 'cost': 5.156888}</td>\n",
       "      <td>2023:03:16_18:41:04</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>ef5ddf17</td>\n",
       "      <td>0.55</td>\n",
       "      <td>17h 25m 42s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>sqauni@multivar2_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '512'},'size': medium,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAmvar2'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 614914, 'cost': 1.229828}</td>\n",
       "      <td>2023:03:17_11:09:17</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>4828042e</td>\n",
       "      <td>0.52</td>\n",
       "      <td>57m 29s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        task                  model   decoder  \\\n",
       "179         sqauni@ao_medium  openai/text-curie-001    argmax   \n",
       "27         sqauni@cot_medium  openai/text-curie-001    argmax   \n",
       "270  sqauni@multivar2_medium  openai/text-curie-001    argmax   \n",
       "247  sqauni@multivar2_medium  openai/text-curie-001  beam_var   \n",
       "32   sqauni@multivar2_medium  openai/text-curie-001       var   \n",
       "\n",
       "                                                                                                                                                                                        kwargs  \\\n",
       "179      {'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '512', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-strategyqa-uniform.StrategyQAao'>}   \n",
       "27      {'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '512', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-strategyqa-uniform.StrategyQAcot'>}   \n",
       "270   {'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '512', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-strategyqa-uniform.StrategyQAmvar2'>}   \n",
       "247   {'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '512', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-strategyqa-uniform.StrategyQAmvar2'>}   \n",
       "32   {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '512'},'size': medium,'suite': <class 'eval-strategyqa-uniform.StrategyQAmvar2'>}   \n",
       "\n",
       "     num_samples                                   cost            timestamp  \\\n",
       "179          100      {'tokens': 2765, 'cost': 0.00553}  2023:03:16_18:24:14   \n",
       "27           100    {'tokens': 66718, 'cost': 0.133436}  2023:03:16_18:25:00   \n",
       "270          100   {'tokens': 137482, 'cost': 0.274964}  2023:03:16_18:27:28   \n",
       "247          100  {'tokens': 2578444, 'cost': 5.156888}  2023:03:16_18:41:04   \n",
       "32           100   {'tokens': 614914, 'cost': 1.229828}  2023:03:17_11:09:17   \n",
       "\n",
       "           result      hash  accuracy  evaluation_time  shots  errors  \n",
       "179  <TaskResult>  d20d051e      0.46  17h 42m 32s ago      0       0  \n",
       "27   <TaskResult>  6a8b9a06      0.53  17h 41m 46s ago      0       1  \n",
       "270  <TaskResult>  d5ea2c80      0.58  17h 39m 18s ago      0       0  \n",
       "247  <TaskResult>  ef5ddf17      0.55  17h 25m 42s ago      0       2  \n",
       "32   <TaskResult>  4828042e      0.52      57m 29s ago      0       0  "
      ]
     },
     "execution_count": 1064,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "r = results(additional=[accuracy,num_samples,evaluation_time,shots,errors], task=lambda t: \"sqauni\" in t and \"medium\" in t, model=\"curie\", shots=0)\n",
    "r = r.sort_values([\"task\", \"decoder\", \"timestamp\"], ascending=[True, True, False])\n",
    "r\n",
    "# TODO: check CoT for date understanding\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1135,
   "id": "b0188733-75e6-4b89-9f51-a5d7d318127a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>model</th>\n",
       "      <th>decoder</th>\n",
       "      <th>kwargs</th>\n",
       "      <th>num_samples</th>\n",
       "      <th>cost</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>evaluation_time</th>\n",
       "      <th>shots</th>\n",
       "      <th>errors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>sqauni@ao_large</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '8'},'size': large,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAao'&gt;}</td>\n",
       "      <td>1000</td>\n",
       "      <td>{'tokens': 27172, 'cost': 0.54344}</td>\n",
       "      <td>2023:03:17_12:05:34</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>943a2c5e</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>2h 6m 3s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>392</th>\n",
       "      <td>sqauni@cot_large</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '64'},'size': large,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAcot'&gt;}</td>\n",
       "      <td>1000</td>\n",
       "      <td>{'tokens': 829047, 'cost': 16.580940000000002}</td>\n",
       "      <td>2023:03:17_12:08:11</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>297e5a17</td>\n",
       "      <td>0.685000</td>\n",
       "      <td>2h 3m 26s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>274</th>\n",
       "      <td>sqauni@multivar2_large</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': large,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAmvar2'&gt;}</td>\n",
       "      <td>1000</td>\n",
       "      <td>{'tokens': 1402524, 'cost': 28.050479999999997}</td>\n",
       "      <td>2023:03:17_12:14:08</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>a92ffd7f</td>\n",
       "      <td>0.686000</td>\n",
       "      <td>1h 57m 29s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>sqauni@multivar2_large</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>beam_var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': large,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAmvar2'&gt;}</td>\n",
       "      <td>1</td>\n",
       "      <td>{'tokens': -1, 'cost': -1}</td>\n",
       "      <td>2023:03:17_14:09:35</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>72e7f63a</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2m 2s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>373</th>\n",
       "      <td>sqauni@multivar2_large</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '128'},'size': large,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAmvar2'&gt;}</td>\n",
       "      <td>1000</td>\n",
       "      <td>{'tokens': 5937162, 'cost': 118.74324000000001}</td>\n",
       "      <td>2023:03:17_12:27:43</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>8593478f</td>\n",
       "      <td>0.667000</td>\n",
       "      <td>1h 43m 54s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>327</th>\n",
       "      <td>sqauni@multivar2_large</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': large,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAmvar2'&gt;}</td>\n",
       "      <td>7</td>\n",
       "      <td>{'tokens': -1, 'cost': -1}</td>\n",
       "      <td>2023:03:17_12:18:18</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>1755007c</td>\n",
       "      <td>0.714286</td>\n",
       "      <td>1h 53m 19s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       task                    model   decoder  \\\n",
       "104         sqauni@ao_large  openai/text-davinci-003    argmax   \n",
       "392        sqauni@cot_large  openai/text-davinci-003    argmax   \n",
       "274  sqauni@multivar2_large  openai/text-davinci-003    argmax   \n",
       "31   sqauni@multivar2_large  openai/text-davinci-003  beam_var   \n",
       "373  sqauni@multivar2_large  openai/text-davinci-003       var   \n",
       "327  sqauni@multivar2_large  openai/text-davinci-003       var   \n",
       "\n",
       "                                                                                                                                                                                       kwargs  \\\n",
       "104       {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '8'},'size': large,'suite': <class 'eval-strategyqa-uniform.StrategyQAao'>}   \n",
       "392     {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '64'},'size': large,'suite': <class 'eval-strategyqa-uniform.StrategyQAcot'>}   \n",
       "274   {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': large,'suite': <class 'eval-strategyqa-uniform.StrategyQAmvar2'>}   \n",
       "31    {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': large,'suite': <class 'eval-strategyqa-uniform.StrategyQAmvar2'>}   \n",
       "373  {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '128'},'size': large,'suite': <class 'eval-strategyqa-uniform.StrategyQAmvar2'>}   \n",
       "327   {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': large,'suite': <class 'eval-strategyqa-uniform.StrategyQAmvar2'>}   \n",
       "\n",
       "     num_samples                                             cost  \\\n",
       "104         1000               {'tokens': 27172, 'cost': 0.54344}   \n",
       "392         1000   {'tokens': 829047, 'cost': 16.580940000000002}   \n",
       "274         1000  {'tokens': 1402524, 'cost': 28.050479999999997}   \n",
       "31             1                       {'tokens': -1, 'cost': -1}   \n",
       "373         1000  {'tokens': 5937162, 'cost': 118.74324000000001}   \n",
       "327            7                       {'tokens': -1, 'cost': -1}   \n",
       "\n",
       "               timestamp        result      hash  accuracy evaluation_time  \\\n",
       "104  2023:03:17_12:05:34  <TaskResult>  943a2c5e  0.625000    2h 6m 3s ago   \n",
       "392  2023:03:17_12:08:11  <TaskResult>  297e5a17  0.685000   2h 3m 26s ago   \n",
       "274  2023:03:17_12:14:08  <TaskResult>  a92ffd7f  0.686000  1h 57m 29s ago   \n",
       "31   2023:03:17_14:09:35  <TaskResult>  72e7f63a  0.000000       2m 2s ago   \n",
       "373  2023:03:17_12:27:43  <TaskResult>  8593478f  0.667000  1h 43m 54s ago   \n",
       "327  2023:03:17_12:18:18  <TaskResult>  1755007c  0.714286  1h 53m 19s ago   \n",
       "\n",
       "     shots  errors  \n",
       "104      0       0  \n",
       "392      0       0  \n",
       "274      0       0  \n",
       "31       0       0  \n",
       "373      0       0  \n",
       "327      0       0  "
      ]
     },
     "execution_count": 1135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# %reload_ext loads the extension on a fresh kernel and reloads it otherwise,\n",
    "# so re-running this cell no longer prints the 'already loaded' notice that\n",
    "# %load_ext emits.\n",
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "\n",
    "# StrategyQA-uniform ('sqauni') runs of the 'large' suite, filtered to\n",
    "# model='davinci' and shots=0, with the extra metric columns listed in `additional`.\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, evaluation_time, shots, errors],\n",
    "    task=lambda t: \"sqauni\" in t and \"large\" in t,\n",
    "    model=\"davinci\",\n",
    "    shots=0,\n",
    ")\n",
    "# Order by task, then decoder; within each (task, decoder) pair the run with the\n",
    "# latest timestamp comes first.\n",
    "r = r.sort_values([\"task\", \"decoder\", \"timestamp\"], ascending=[True, True, False])\n",
    "r\n",
    "# TODO: make check CoT for date understanding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1045,
   "id": "75d5beac-206f-4ccf-b7b5-b571a4c856be",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>model</th>\n",
       "      <th>decoder</th>\n",
       "      <th>kwargs</th>\n",
       "      <th>num_samples</th>\n",
       "      <th>cost</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>evaluation_time</th>\n",
       "      <th>shots</th>\n",
       "      <th>errors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>413</th>\n",
       "      <td>sqauni@ao_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '512', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAao'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 8265, 'cost': 0.016530000000000003}</td>\n",
       "      <td>2023:03:16_18:56:59</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>66914c8b</td>\n",
       "      <td>0.47</td>\n",
       "      <td>16h 12m 21s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>354</th>\n",
       "      <td>sqauni@cot_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '512', 'openai_chunksize': '512'},'size': medium,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAcot'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 143508, 'cost': 0.28701600000000005}</td>\n",
       "      <td>2023:03:16_18:57:31</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>92f6adff</td>\n",
       "      <td>0.58</td>\n",
       "      <td>16h 11m 49s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>151</th>\n",
       "      <td>sqauni@multivar2_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '512'},'size': medium,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAmvar2'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 270510, 'cost': 0.54102}</td>\n",
       "      <td>2023:03:16_18:59:17</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>a2c7fe8e</td>\n",
       "      <td>0.56</td>\n",
       "      <td>16h 10m 3s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116</th>\n",
       "      <td>sqauni@multivar2_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '512'},'size': medium,'suite': &lt;class 'eval-strategyqa-uniform.StrategyQAmvar2'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 1011702, 'cost': 2.023404}</td>\n",
       "      <td>2023:03:17_11:01:44</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>9f2378f8</td>\n",
       "      <td>0.60</td>\n",
       "      <td>7m 36s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        task                  model decoder  \\\n",
       "413         sqauni@ao_medium  openai/text-curie-001  argmax   \n",
       "354        sqauni@cot_medium  openai/text-curie-001  argmax   \n",
       "151  sqauni@multivar2_medium  openai/text-curie-001  argmax   \n",
       "116  sqauni@multivar2_medium  openai/text-curie-001     var   \n",
       "\n",
       "                                                                                                                                                                                        kwargs  \\\n",
       "413      {'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '512', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-strategyqa-uniform.StrategyQAao'>}   \n",
       "354      {'kwargs': {'max_length': '512', 'top1_distribution': True, 'step_budget': '512', 'openai_chunksize': '512'},'size': medium,'suite': <class 'eval-strategyqa-uniform.StrategyQAcot'>}   \n",
       "151  {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '512'},'size': medium,'suite': <class 'eval-strategyqa-uniform.StrategyQAmvar2'>}   \n",
       "116  {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '512'},'size': medium,'suite': <class 'eval-strategyqa-uniform.StrategyQAmvar2'>}   \n",
       "\n",
       "     num_samples                                             cost  \\\n",
       "413          100   {'tokens': 8265, 'cost': 0.016530000000000003}   \n",
       "354          100  {'tokens': 143508, 'cost': 0.28701600000000005}   \n",
       "151          100              {'tokens': 270510, 'cost': 0.54102}   \n",
       "116          100            {'tokens': 1011702, 'cost': 2.023404}   \n",
       "\n",
       "               timestamp        result      hash  accuracy  evaluation_time  \\\n",
       "413  2023:03:16_18:56:59  <TaskResult>  66914c8b      0.47  16h 12m 21s ago   \n",
       "354  2023:03:16_18:57:31  <TaskResult>  92f6adff      0.58  16h 11m 49s ago   \n",
       "151  2023:03:16_18:59:17  <TaskResult>  a2c7fe8e      0.56   16h 10m 3s ago   \n",
       "116  2023:03:17_11:01:44  <TaskResult>  9f2378f8      0.60       7m 36s ago   \n",
       "\n",
       "     shots  errors  \n",
       "413      2       0  \n",
       "354      2       0  \n",
       "151      2       0  \n",
       "116      2       0  "
      ]
     },
     "execution_count": 1045,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# %reload_ext loads the extension on a fresh kernel and reloads it otherwise,\n",
    "# so re-running this cell no longer prints the 'already loaded' notice that\n",
    "# %load_ext emits.\n",
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "\n",
    "# StrategyQA-uniform ('sqauni') runs of the 'medium' suite, filtered to\n",
    "# model='curie' and shots=2, with the extra metric columns listed in `additional`.\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, evaluation_time, shots, errors],\n",
    "    task=lambda t: \"sqauni\" in t and \"medium\" in t,\n",
    "    model=\"curie\",\n",
    "    shots=2,\n",
    ")\n",
    "# Order by task, then decoder; within each (task, decoder) pair the run with the\n",
    "# latest timestamp comes first.\n",
    "r = r.sort_values([\"task\", \"decoder\", \"timestamp\"], ascending=[True, True, False])\n",
    "r\n",
    "# TODO: make check CoT for date understanding\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 720,
   "id": "31469993-fac0-4b6f-9b51-09a8cf841ae1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<error>\n"
     ]
    }
   ],
   "source": [
    "# Print the first entry of the `model_result` values returned by data() for\n",
    "# the identifier 290a9e19 (presumably a run hash like those in the `hash`\n",
    "# column of the results tables -- TODO confirm against results.data).\n",
    "print(data(\"290a9e19\").model_result.values[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a9a6466-9510-4409-a040-275c01b53e2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from results import *\n",
    "\n",
    "# Walk over the pairs yielded by compare() for two runs and print the\n",
    "# `model_result` of each side, labelled COT and MVP.\n",
    "# NOTE(review): judging by the loop variable names, compare() presumably yields\n",
    "# samples where the first run failed and the second succeeded -- confirm against\n",
    "# results.compare before relying on this.\n",
    "\n",
    "# Disabled: count the pairs for another pair of run identifiers.\n",
    "# len(list(compare(\"6732d792\", \"d8d0c2fb\")))\n",
    "\n",
    "for r_fail,r_success in compare(\"30f3d66e\", \"c29b5c82\"):\n",
    "    # Disabled variant: print only the text after the last \"Q: \" marker.\n",
    "    # print(\"COT\", r_fail.model_result.rsplit(\"Q: \",1)[1], end=\"\\n\\n\\n\")\n",
    "    # print(\"MVP\", r_success.model_result.rsplit(\"Q: \",1)[1], end=\"\\n\\n\\n\")\n",
    "    print(\"COT\", r_fail.model_result)\n",
    "    print(\"MVP\", r_success.model_result)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1c84126-3c4a-40ec-b598-6a91c2ec80a9",
   "metadata": {},
   "source": [
    "### Tracking Shuffled Objects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 640,
   "id": "e940643f-3bd5-470e-9a10-75dcafcca1c3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>model</th>\n",
       "      <th>decoder</th>\n",
       "      <th>kwargs</th>\n",
       "      <th>num_samples</th>\n",
       "      <th>cost</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>evaluation_time</th>\n",
       "      <th>shots</th>\n",
       "      <th>errors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>126</th>\n",
       "      <td>tracking_shuffled_objects@ao_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-shuffled-objects.AnswerOnlyTrackingShuffledObjectsSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 167991, 'cost': 3.3598200000000005}</td>\n",
       "      <td>2023:03:16_15:01:53</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>4acb520f</td>\n",
       "      <td>0.10</td>\n",
       "      <td>3h 4m 43s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>tracking_shuffled_objects@cot_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-shuffled-objects.CotTrackingShuffledObjectsSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 696422, 'cost': 13.92844}</td>\n",
       "      <td>2023:03:16_15:49:53</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>6732d792</td>\n",
       "      <td>0.46</td>\n",
       "      <td>2h 16m 43s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>320</th>\n",
       "      <td>tracking_shuffled_objects@multivar2_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '128'},'size': medium,'suite': &lt;class 'eval-shuffled-objects.TrackingShuffledObjectsSuiteMV2'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 1631431, 'cost': 32.62862}</td>\n",
       "      <td>2023:03:16_15:55:53</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>d8d0c2fb</td>\n",
       "      <td>0.57</td>\n",
       "      <td>2h 10m 43s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>201</th>\n",
       "      <td>tracking_shuffled_objects@multivar2_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '128'},'size': medium,'suite': &lt;class 'eval-shuffled-objects.TrackingShuffledObjectsSuiteMV2'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 2282124, 'cost': 45.64248}</td>\n",
       "      <td>2023:03:16_15:18:56</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>fa6f9798</td>\n",
       "      <td>0.60</td>\n",
       "      <td>2h 47m 40s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>243</th>\n",
       "      <td>tracking_shuffled_objects@multivar2_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-shuffled-objects.TrackingShuffledObjectsSuiteMV2'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 5698632, 'cost': 113.97264}</td>\n",
       "      <td>2023:03:16_17:52:49</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>77e5751a</td>\n",
       "      <td>0.57</td>\n",
       "      <td>13m 47s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           task                    model  \\\n",
       "126         tracking_shuffled_objects@ao_medium  openai/text-davinci-003   \n",
       "51         tracking_shuffled_objects@cot_medium  openai/text-davinci-003   \n",
       "320  tracking_shuffled_objects@multivar2_medium  openai/text-davinci-003   \n",
       "201  tracking_shuffled_objects@multivar2_medium  openai/text-davinci-003   \n",
       "243  tracking_shuffled_objects@multivar2_medium  openai/text-davinci-003   \n",
       "\n",
       "    decoder  \\\n",
       "126  argmax   \n",
       "51   argmax   \n",
       "320  argmax   \n",
       "201  argmax   \n",
       "243     var   \n",
       "\n",
       "                                                                                                                                                                                                              kwargs  \\\n",
       "126  {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-shuffled-objects.AnswerOnlyTrackingShuffledObjectsSuite'>}   \n",
       "51          {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-shuffled-objects.CotTrackingShuffledObjectsSuite'>}   \n",
       "320          {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '128'},'size': medium,'suite': <class 'eval-shuffled-objects.TrackingShuffledObjectsSuiteMV2'>}   \n",
       "201          {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '128'},'size': medium,'suite': <class 'eval-shuffled-objects.TrackingShuffledObjectsSuiteMV2'>}   \n",
       "243           {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-shuffled-objects.TrackingShuffledObjectsSuiteMV2'>}   \n",
       "\n",
       "     num_samples                                            cost  \\\n",
       "126          100  {'tokens': 167991, 'cost': 3.3598200000000005}   \n",
       "51           100            {'tokens': 696422, 'cost': 13.92844}   \n",
       "320          100           {'tokens': 1631431, 'cost': 32.62862}   \n",
       "201          100           {'tokens': 2282124, 'cost': 45.64248}   \n",
       "243          100          {'tokens': 5698632, 'cost': 113.97264}   \n",
       "\n",
       "               timestamp        result      hash  accuracy evaluation_time  \\\n",
       "126  2023:03:16_15:01:53  <TaskResult>  4acb520f      0.10   3h 4m 43s ago   \n",
       "51   2023:03:16_15:49:53  <TaskResult>  6732d792      0.46  2h 16m 43s ago   \n",
       "320  2023:03:16_15:55:53  <TaskResult>  d8d0c2fb      0.57  2h 10m 43s ago   \n",
       "201  2023:03:16_15:18:56  <TaskResult>  fa6f9798      0.60  2h 47m 40s ago   \n",
       "243  2023:03:16_17:52:49  <TaskResult>  77e5751a      0.57     13m 47s ago   \n",
       "\n",
       "     shots  errors  \n",
       "126      2       0  \n",
       "51       2       0  \n",
       "320      2       0  \n",
       "201      2       0  \n",
       "243      2       0  "
      ]
     },
     "execution_count": 640,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# %reload_ext loads the extension on a fresh kernel and reloads it otherwise,\n",
    "# so re-running this cell no longer prints the 'already loaded' notice that\n",
    "# %load_ext emits.\n",
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "\n",
    "# tracking_shuffled_objects runs of the 'medium' suite, filtered to\n",
    "# model='davinci' and shots=2, with the extra metric columns listed in `additional`.\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, evaluation_time, shots, errors],\n",
    "    task=lambda t: \"tracking_shuffled_objects\" in t and \"medium\" in t,\n",
    "    model=\"davinci\",\n",
    "    shots=2,\n",
    ")\n",
    "# Order by task, then decoder; within each (task, decoder) pair the run with the\n",
    "# latest timestamp comes first.\n",
    "r = r.sort_values([\"task\", \"decoder\", \"timestamp\"], ascending=[True, True, False])\n",
    "r\n",
    "# TODO: make check CoT for date understanding\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "008722dc-bcf9-40eb-bb03-113e7d646ece",
   "metadata": {},
   "source": [
    "##### Curie"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1051,
   "id": "444da6ea-b7a0-434e-be4c-0d0d8d112fd4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>model</th>\n",
       "      <th>decoder</th>\n",
       "      <th>kwargs</th>\n",
       "      <th>num_samples</th>\n",
       "      <th>cost</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>evaluation_time</th>\n",
       "      <th>shots</th>\n",
       "      <th>errors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>284</th>\n",
       "      <td>tracking_shuffled_objects@ao_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-shuffled-objects.AnswerOnlyTrackingShuffledObjectsSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 34458, 'cost': 0.068916}</td>\n",
       "      <td>2023:03:16_18:31:08</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>0c0e6b3e</td>\n",
       "      <td>0.18</td>\n",
       "      <td>16h 47m 16s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>125</th>\n",
       "      <td>tracking_shuffled_objects@cot_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '256'},'size': medium,'suite': &lt;class 'eval-shuffled-objects.CotTrackingShuffledObjectsSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 187828, 'cost': 0.375656}</td>\n",
       "      <td>2023:03:16_18:33:35</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>1359a69f</td>\n",
       "      <td>0.19</td>\n",
       "      <td>16h 44m 49s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>236</th>\n",
       "      <td>tracking_shuffled_objects@multivar2_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-shuffled-objects.TrackingShuffledObjectsSuiteMV2'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 775803, 'cost': 1.551606}</td>\n",
       "      <td>2023:03:16_18:35:15</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>fda8ed89</td>\n",
       "      <td>0.22</td>\n",
       "      <td>16h 43m 9s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>220</th>\n",
       "      <td>tracking_shuffled_objects@multivar2_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-shuffled-objects.TrackingShuffledObjectsSuiteMV2'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 3020106, 'cost': 6.040212}</td>\n",
       "      <td>2023:03:17_11:01:12</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>90980c73</td>\n",
       "      <td>0.24</td>\n",
       "      <td>17m 12s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           task                  model  \\\n",
       "284         tracking_shuffled_objects@ao_medium  openai/text-curie-001   \n",
       "125        tracking_shuffled_objects@cot_medium  openai/text-curie-001   \n",
       "236  tracking_shuffled_objects@multivar2_medium  openai/text-curie-001   \n",
       "220  tracking_shuffled_objects@multivar2_medium  openai/text-curie-001   \n",
       "\n",
       "    decoder  \\\n",
       "284  argmax   \n",
       "125  argmax   \n",
       "236  argmax   \n",
       "220     var   \n",
       "\n",
       "                                                                                                                                                                                                            kwargs  \\\n",
       "284  {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-shuffled-objects.AnswerOnlyTrackingShuffledObjectsSuite'>}   \n",
       "125        {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '256'},'size': medium,'suite': <class 'eval-shuffled-objects.CotTrackingShuffledObjectsSuite'>}   \n",
       "236         {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-shuffled-objects.TrackingShuffledObjectsSuiteMV2'>}   \n",
       "220         {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-shuffled-objects.TrackingShuffledObjectsSuiteMV2'>}   \n",
       "\n",
       "     num_samples                                   cost            timestamp  \\\n",
       "284          100    {'tokens': 34458, 'cost': 0.068916}  2023:03:16_18:31:08   \n",
       "125          100   {'tokens': 187828, 'cost': 0.375656}  2023:03:16_18:33:35   \n",
       "236          100   {'tokens': 775803, 'cost': 1.551606}  2023:03:16_18:35:15   \n",
       "220          100  {'tokens': 3020106, 'cost': 6.040212}  2023:03:17_11:01:12   \n",
       "\n",
       "           result      hash  accuracy  evaluation_time  shots  errors  \n",
       "284  <TaskResult>  0c0e6b3e      0.18  16h 47m 16s ago      0       0  \n",
       "125  <TaskResult>  1359a69f      0.19  16h 44m 49s ago      0       0  \n",
       "236  <TaskResult>  fda8ed89      0.22   16h 43m 9s ago      0       0  \n",
       "220  <TaskResult>  90980c73      0.24      17m 12s ago      0       0  "
      ]
     },
     "execution_count": 1051,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "r = results(additional=[accuracy,num_samples,evaluation_time,shots,errors], task=lambda t: \"tracking_shuffled_objects\" in t and \"medium\" in t, model=\"curie\", shots=0)\n",
    "r = r.sort_values([\"task\", \"decoder\", \"timestamp\"], ascending=[True, True, False])\n",
    "r\n",
    "# TODO: make check CoT for date understanding\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ef91710-6e42-49d2-941c-852c7be96d58",
   "metadata": {},
   "outputs": [],
   "source": [
    "from results import *\n",
    "\n",
    "# Walk the pairs yielded by compare() for runs 6732d792 and d8d0c2fb and print the\n",
    "# model output of each side, tagged COT and MVP. The loop names suggest the first\n",
    "# run failed where the second succeeded -- TODO confirm against compare().\n",
    "#\n",
    "# Number of pairs: len(list(compare(\"6732d792\", \"d8d0c2fb\")))\n",
    "# To print only the text after the last \"Q: \", use\n",
    "# r_fail.model_result.rsplit(\"Q: \", 1)[1] (same for r_success) with end=\"\\n\\n\\n\".\n",
    "for r_fail, r_success in compare(\"6732d792\", \"d8d0c2fb\"):\n",
    "    for tag, row in zip((\"COT\", \"MVP\"), (r_fail, r_success)):\n",
    "        print(tag, row.model_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22416a82-9523-42e3-a4ca-6794f63997be",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the query file and model output of every row of data(\"d8d0c2fb\") whose\n",
    "# prediction equals its target, ordered by query_file.\n",
    "# iterrows() yields each row as a Series; this replaces looping over the `.iloc`\n",
    "# indexer itself, which only works through Python's legacy __getitem__/IndexError\n",
    "# iteration fallback.\n",
    "for row_label, r in data(\"d8d0c2fb\").sort_values(\"query_file\").iterrows():\n",
    "    if r.prediction == r.target:\n",
    "        print(r.query_file)\n",
    "        print(r.model_result, end=\"\\n\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca180f75-40f0-4786-bcd1-4de4f8231d6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print query file, target and model output of every row of data(\"6732d792\") whose\n",
    "# prediction differs from its target, ordered by query_file.\n",
    "# iterrows() yields each row as a Series; this replaces looping over the `.iloc`\n",
    "# indexer itself, which only works through Python's legacy __getitem__/IndexError\n",
    "# iteration fallback.\n",
    "for row_label, r in data(\"6732d792\").sort_values(\"query_file\").iterrows():\n",
    "    if r.prediction != r.target:\n",
    "        print(r.query_file, r.target)\n",
    "        print(r.model_result, end=\"\\n\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f8918d8-95c3-4c46-b112-34d422ce0141",
   "metadata": {},
   "source": [
    "### Matrix Shapes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1050,
   "id": "9c814dbb-a28c-4fe5-ae9d-f1b0b46c4d39",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>model</th>\n",
       "      <th>decoder</th>\n",
       "      <th>kwargs</th>\n",
       "      <th>num_samples</th>\n",
       "      <th>cost</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>evaluation_time</th>\n",
       "      <th>shots</th>\n",
       "      <th>errors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>215</th>\n",
       "      <td>matrixshapes@cot_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>beam_var</td>\n",
       "      <td>{'kwargs': {'max_length': 1024, 'top1_distribution': True, 'num_beams': '2', 'prune': '1.05'},'size': medium,'suite': &lt;class 'eval-matrixshapes.VanillaCoTExpressionEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 236440, 'cost': 4.7288}</td>\n",
       "      <td>2023:01:25_18:08:15</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>ebb0c67e</td>\n",
       "      <td>0.71</td>\n",
       "      <td>7w 1d 17h 10m 6s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>matrixshapes@cot_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': 1024, 'top1_distribution': True},'size': medium,'suite': &lt;class 'eval-matrixshapes.VanillaCoTExpressionEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 213671, 'cost': 4.27342}</td>\n",
       "      <td>2023:01:25_18:05:10</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>b71518c0</td>\n",
       "      <td>0.76</td>\n",
       "      <td>7w 1d 17h 13m 11s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>170</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>beam_var</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True, 'num_beams': '2', 'prune': '1.05'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 203708, 'cost': 4.07416}</td>\n",
       "      <td>2023:01:25_17:15:38</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>f1b66902</td>\n",
       "      <td>0.79</td>\n",
       "      <td>7w 1d 18h 2m 43s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>329</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '2'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 186660, 'cost': 3.7332}</td>\n",
       "      <td>2023:03:17_00:43:31</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>51173e96</td>\n",
       "      <td>0.84</td>\n",
       "      <td>10h 34m 50s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>330</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '2'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 187357, 'cost': 3.74714}</td>\n",
       "      <td>2023:03:17_00:40:40</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>18244b61</td>\n",
       "      <td>0.80</td>\n",
       "      <td>10h 37m 41s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '2'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 275849, 'cost': 5.51698}</td>\n",
       "      <td>2023:03:17_00:35:01</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>1711003c</td>\n",
       "      <td>0.71</td>\n",
       "      <td>10h 43m 20s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '4'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 278060, 'cost': 5.5612}</td>\n",
       "      <td>2023:03:17_00:30:22</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>8c9b7254</td>\n",
       "      <td>0.63</td>\n",
       "      <td>10h 47m 59s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>275</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 260156, 'cost': 5.20312}</td>\n",
       "      <td>2023:03:17_00:22:33</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>e2f3a650</td>\n",
       "      <td>0.75</td>\n",
       "      <td>10h 55m 48s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 274064, 'cost': 5.481280000000001}</td>\n",
       "      <td>2023:03:16_20:03:07</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>398fa890</td>\n",
       "      <td>0.73</td>\n",
       "      <td>15h 15m 14s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>309</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 0, 'cost': 0.0}</td>\n",
       "      <td>2023:03:16_20:01:47</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>79fd6b95</td>\n",
       "      <td>0.00</td>\n",
       "      <td>15h 16m 34s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>192</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': 1024, 'top1_distribution': True},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 271485, 'cost': 5.4297}</td>\n",
       "      <td>2023:01:25_17:46:28</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>8c6d2180</td>\n",
       "      <td>0.85</td>\n",
       "      <td>7w 1d 17h 31m 53s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             task                    model   decoder  \\\n",
       "215       matrixshapes@cot_medium  openai/text-davinci-003  beam_var   \n",
       "95        matrixshapes@cot_medium  openai/text-davinci-003       var   \n",
       "170  matrixshapes@multivar_medium  openai/text-davinci-003  beam_var   \n",
       "329  matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "330  matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "5    matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "108  matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "275  matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "33   matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "309  matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "192  matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "\n",
       "                                                                                                                                                                                                       kwargs  \\\n",
       "215                    {'kwargs': {'max_length': 1024, 'top1_distribution': True, 'num_beams': '2', 'prune': '1.05'},'size': medium,'suite': <class 'eval-matrixshapes.VanillaCoTExpressionEvaluationSuite'>}   \n",
       "95                                                        {'kwargs': {'max_length': 1024, 'top1_distribution': True},'size': medium,'suite': <class 'eval-matrixshapes.VanillaCoTExpressionEvaluationSuite'>}   \n",
       "170                             {'kwargs': {'max_length': 512, 'top1_distribution': True, 'num_beams': '2', 'prune': '1.05'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "329  {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '2'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "330  {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '2'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "5    {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '2'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "108  {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '4'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "275            {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "33             {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "309            {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "192                                                               {'kwargs': {'max_length': 1024, 'top1_distribution': True},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "\n",
       "     num_samples                                           cost  \\\n",
       "215          100             {'tokens': 236440, 'cost': 4.7288}   \n",
       "95           100            {'tokens': 213671, 'cost': 4.27342}   \n",
       "170          100            {'tokens': 203708, 'cost': 4.07416}   \n",
       "329          100             {'tokens': 186660, 'cost': 3.7332}   \n",
       "330          100            {'tokens': 187357, 'cost': 3.74714}   \n",
       "5            100            {'tokens': 275849, 'cost': 5.51698}   \n",
       "108          100             {'tokens': 278060, 'cost': 5.5612}   \n",
       "275          100            {'tokens': 260156, 'cost': 5.20312}   \n",
       "33           100  {'tokens': 274064, 'cost': 5.481280000000001}   \n",
       "309          100                     {'tokens': 0, 'cost': 0.0}   \n",
       "192          100             {'tokens': 271485, 'cost': 5.4297}   \n",
       "\n",
       "               timestamp        result      hash  accuracy  \\\n",
       "215  2023:01:25_18:08:15  <TaskResult>  ebb0c67e      0.71   \n",
       "95   2023:01:25_18:05:10  <TaskResult>  b71518c0      0.76   \n",
       "170  2023:01:25_17:15:38  <TaskResult>  f1b66902      0.79   \n",
       "329  2023:03:17_00:43:31  <TaskResult>  51173e96      0.84   \n",
       "330  2023:03:17_00:40:40  <TaskResult>  18244b61      0.80   \n",
       "5    2023:03:17_00:35:01  <TaskResult>  1711003c      0.71   \n",
       "108  2023:03:17_00:30:22  <TaskResult>  8c9b7254      0.63   \n",
       "275  2023:03:17_00:22:33  <TaskResult>  e2f3a650      0.75   \n",
       "33   2023:03:16_20:03:07  <TaskResult>  398fa890      0.73   \n",
       "309  2023:03:16_20:01:47  <TaskResult>  79fd6b95      0.00   \n",
       "192  2023:01:25_17:46:28  <TaskResult>  8c6d2180      0.85   \n",
       "\n",
       "           evaluation_time  shots  errors  \n",
       "215   7w 1d 17h 10m 6s ago      0       0  \n",
       "95   7w 1d 17h 13m 11s ago      0       0  \n",
       "170   7w 1d 18h 2m 43s ago      0       0  \n",
       "329        10h 34m 50s ago      0       0  \n",
       "330        10h 37m 41s ago      0       0  \n",
       "5          10h 43m 20s ago      0       0  \n",
       "108        10h 47m 59s ago      0       0  \n",
       "275        10h 55m 48s ago      0       0  \n",
       "33         15h 15m 14s ago      0       0  \n",
       "309        15h 16m 34s ago      0     100  \n",
       "192  7w 1d 17h 31m 53s ago      0       0  "
      ]
     },
     "execution_count": 1050,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "r = results(additional=[accuracy,num_samples,evaluation_time,shots,errors], task=lambda t: \"matrixshapes\" in t and \"medium\" in t, model=\"davinci\", shots=0, decoder=\"var\")\n",
    "r = r.sort_values([\"task\", \"decoder\", \"timestamp\"], ascending=[True, True, False])\n",
    "r\n",
    "# TODO: make check CoT for date understanding\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c1d65cf-8e47-4483-b440-23694b828c40",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each pair yielded by compare() for runs 18244b61 and 8c6d2180, show the two model\n",
    "# outputs only when the first side is wrong and the second side is right.\n",
    "# Each output is wrapped in a list so its repr (with escapes) is printed.\n",
    "for r1, r2 in compare(\"18244b61\", \"8c6d2180\"):\n",
    "    only_second_correct = r1.prediction != r1.target and r2.prediction == r2.target\n",
    "    if not only_second_correct:\n",
    "        continue\n",
    "    print([r1.model_result], [r2.model_result], sep=\"\\n\", end=\"\\n\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1063,
   "id": "637d13d5-0c2f-414d-84cb-e0398a77d392",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>model</th>\n",
       "      <th>decoder</th>\n",
       "      <th>kwargs</th>\n",
       "      <th>num_samples</th>\n",
       "      <th>cost</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>evaluation_time</th>\n",
       "      <th>shots</th>\n",
       "      <th>errors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>174</th>\n",
       "      <td>matrixshapes@ao_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '16'},'size': medium,'suite': &lt;class 'eval-matrixshapes.AnswerOnlyMatrixshapes'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 21361, 'cost': 0.42722000000000004}</td>\n",
       "      <td>2023:03:16_17:16:49</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>9c2eaa46</td>\n",
       "      <td>0.67</td>\n",
       "      <td>18h 20m 32s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>179</th>\n",
       "      <td>matrixshapes@cot_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-matrixshapes.VanillaCoTExpressionEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 110962, 'cost': 2.21924}</td>\n",
       "      <td>2023:03:16_17:17:37</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>9afda2d7</td>\n",
       "      <td>0.76</td>\n",
       "      <td>18h 19m 44s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>196</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 356736, 'cost': 7.13472}</td>\n",
       "      <td>2023:03:16_17:18:31</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>344b97e2</td>\n",
       "      <td>0.81</td>\n",
       "      <td>18h 18m 50s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 832406, 'cost': 16.64812}</td>\n",
       "      <td>2023:03:16_18:02:22</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>812ef35a</td>\n",
       "      <td>0.77</td>\n",
       "      <td>17h 34m 59s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             task                    model decoder  \\\n",
       "174        matrixshapes@ao_medium  openai/text-davinci-003  argmax   \n",
       "179       matrixshapes@cot_medium  openai/text-davinci-003  argmax   \n",
       "196  matrixshapes@multivar_medium  openai/text-davinci-003  argmax   \n",
       "119  matrixshapes@multivar_medium  openai/text-davinci-003     var   \n",
       "\n",
       "                                                                                                                                                                                                       kwargs  \\\n",
       "174                 {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '16'},'size': medium,'suite': <class 'eval-matrixshapes.AnswerOnlyMatrixshapes'>}   \n",
       "179  {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-matrixshapes.VanillaCoTExpressionEvaluationSuite'>}   \n",
       "196          {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '1024'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "119            {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "\n",
       "     num_samples                                            cost  \\\n",
       "174          100  {'tokens': 21361, 'cost': 0.42722000000000004}   \n",
       "179          100             {'tokens': 110962, 'cost': 2.21924}   \n",
       "196          100             {'tokens': 356736, 'cost': 7.13472}   \n",
       "119          100            {'tokens': 832406, 'cost': 16.64812}   \n",
       "\n",
       "               timestamp        result      hash  accuracy  evaluation_time  \\\n",
       "174  2023:03:16_17:16:49  <TaskResult>  9c2eaa46      0.67  18h 20m 32s ago   \n",
       "179  2023:03:16_17:17:37  <TaskResult>  9afda2d7      0.76  18h 19m 44s ago   \n",
       "196  2023:03:16_17:18:31  <TaskResult>  344b97e2      0.81  18h 18m 50s ago   \n",
       "119  2023:03:16_18:02:22  <TaskResult>  812ef35a      0.77  17h 34m 59s ago   \n",
       "\n",
       "     shots  errors  \n",
       "174      2       0  \n",
       "179      2       0  \n",
       "196      2       0  \n",
       "119      2       0  "
      ]
     },
     "execution_count": 1063,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "r = results(additional=[accuracy,num_samples,evaluation_time,shots,errors], task=lambda t: \"matrixshapes\" in t and \"medium\" in t, model=\"davinci\", shots=2)\n",
    "r = r.sort_values([\"task\", \"decoder\", \"timestamp\"], ascending=[True, True, False])\n",
    "r\n",
    "# TODO: make check CoT for date understanding\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1022,
   "id": "966e8e3b-20c3-407a-8b9c-4e4b7bf6c19c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1000"
      ]
     },
     "execution_count": 1022,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files_1000 = set()\n",
    "rows_by_id = {}\n",
    "for r in data(\"4fc43122\").iloc:\n",
    "    i = r.query_file.rsplit(\"/\",1)[-1]\n",
    "    rows_by_id[i] = r\n",
    "    files_1000.add(i)\n",
    "len(rows_by_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1025,
   "id": "bb353cd8-c4d1-4733-a4ed-f085b10798c7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NEW ['Compute the kronecker product of a matrix of shape (2,2,3,3) with a matrix of shape (4,3,3,3). The result is a matrix of shape (8,6,9,9). Sum the result over the second axis. The result is a matrix of shape (8,9,9). Multiply the result with a matrix of shape (8,9,3). The result is a matrix of shape (8,9,3).. The result is a matrix of shape (8,9,3).']\n",
      "OLD ['Compute the kronecker product of a matrix of shape (2,2,3,3) with a matrix of shape (4,3,3,3). The result is a matrix of shape (8,6,6,6). Sum the result over the second axis. The result is a matrix of shape (8,6,6). Multiply the result with a matrix of shape (8,9,3). The result is a matrix of shape (8,6,3).. The result is a matrix of shape (8,6,3).']\n",
      "\n",
      "NEW ['Compute the hadamard product of a matrix of shape (3,2,2) with a matrix of shape (3,2,2). The result is a matrix of shape (3,2,2). Compute the kronecker product of the result with a matrix of shape (2,3,4). The result is a matrix of shape (6,6,8).. The result is a matrix of shape (6,6,8).']\n",
      "OLD ['Compute the hadamard product of a matrix of shape (3,2,2) with a matrix of shape (3,2,2). The result is a matrix of shape (3,2,2). Compute the kronecker product of the result with a matrix of shape (2,3,4). The result is a matrix of shape (6,4,8).. The result is a matrix of shape (6,4,8).']\n",
      "\n",
      "NEW ['Add a matrix of shape (2,3,2,3) to a matrix of shape (2,3,2,3). The result is a matrix of shape (2,3,2,3). Sum the result over the first axis. The result is a matrix of shape (3,2,3). Compute the kronecker product of the result with a matrix of shape (4,3,4). The result is a matrix of shape (12,8,12).. The result is a matrix of shape (12,8,12).']\n",
      "OLD ['Add a matrix of shape (2,3,2,3) to a matrix of shape (2,3,2,3). The result is a matrix of shape (2,3,2,3). Sum the result over the first axis. The result is a matrix of shape (3,2,3). Compute the kronecker product of the result with a matrix of shape (4,3,4). The result is a matrix of shape (12,6,12).. The result is a matrix of shape (12,6,12).']\n",
      "\n",
      "NEW ['Add a matrix of shape (3,2,2,3) to a matrix of shape (3,2,2,3). The result is a matrix of shape (3,2,2,3). Multiply the result with a matrix of shape (3,2,3,2). The result is a matrix of shape (3,2,2,2). Compute the kronecker product of the result with a matrix of shape (3,2,4,3). The result is a matrix of shape (3,2,8,6). Multiply the result with a matrix of shape (9,4,6,2). The result is a matrix of shape (3,2,8,2).. The result is a matrix of shape (3,2,8,2).']\n",
      "OLD ['Add a matrix of shape (3,2,2,3) to a matrix of shape (3,2,2,3). The result is a matrix of shape (3,2,2,3). Multiply the result with a matrix of shape (3,2,3,2). The result is a matrix of shape (3,2,2,2). Compute the kronecker product of the result with a matrix of shape (3,2,4,3). The result is a matrix of shape (3,2,8,6). Multiply the result with a matrix of shape (9,4,6,2). The result is a matrix of shape (9,4,8,2).. The result is a matrix of shape (9,4,8,2).']\n",
      "\n",
      "NEW ['Compute the hadamard product of a matrix of shape (3,2,2,3) with a matrix of shape (3,2,2,3). The result is a matrix of shape (3,2,2,3). Subtract the result from a matrix of shape (3,2,2,3). The result is a matrix of shape (3,2,2,3). Sum the result over the second axis. The result is a matrix of shape (3,2,3). Sum the result over the third axis. The result is a matrix of shape (3,3). Compute the hadamard product of the result with a matrix of shape (3,2). The result is a matrix of shape (3,2). Compute the kronecker product of the result with a matrix of shape (4,3). The result is a matrix of shape (12,6).. The result is a matrix of shape (12,6).']\n",
      "OLD ['Compute the hadamard product of a matrix of shape (3,2,2,3) with a matrix of shape (3,2,2,3). The result is a matrix of shape (3,2,2,3). Subtract the result from a matrix of shape (3,2,2,3). The result is a matrix of shape (3,2,2,3). Sum the result over the second axis. The result is a matrix of shape (3,2,3). Sum the result over the third axis. The result is a matrix of shape (3,3). Compute the hadamard product of the result with a matrix of shape (3,2). The result is a matrix of shape (3,3). Compute the kronecker product of the result with a matrix of shape (4,3). The result is a matrix of shape (12,9).. The result is a matrix of shape (12,9).']\n",
      "\n",
      "NEW ['Multiply a matrix of shape (3,3) with a matrix of shape (3,1). The result is a matrix of shape (3,1). Compute the kronecker product of the result with a matrix of shape (2,2). The result is a matrix of shape (6,2). Multiply the result with a matrix of shape (2,2). The result is a matrix of shape (6,2). Subtract the result from a matrix of shape (6,2). The result is a matrix of shape (6,2). Subtract the result from a matrix of shape (6,2). The result is a matrix of shape (6,2). Compute the kronecker product of the result with a matrix of shape (4,4). The result is a matrix of shape (24,4).. The result is a matrix of shape (24,4).']\n",
      "OLD ['Multiply a matrix of shape (3,3) with a matrix of shape (3,1). The result is a matrix of shape (3,1). Compute the kronecker product of the result with a matrix of shape (2,2). The result is a matrix of shape (6,2). Multiply the result with a matrix of shape (2,2). The result is a matrix of shape (6,2). Subtract the result from a matrix of shape (6,2). The result is a matrix of shape (6,2). Subtract the result from a matrix of shape (6,2). The result is a matrix of shape (6,2). Compute the kronecker product of the result with a matrix of shape (4,4). The result is a matrix of shape (24,8).. The result is a matrix of shape (24,8).']\n",
      "\n",
      "NEW ['Compute the kronecker product of a matrix of shape (2,2,2) with a matrix of shape (2,3,4). The result is a matrix of shape (4,6,8). Sum the result over the third axis. The result is a matrix of shape (4,6). Add the result to a matrix of shape (4,6). The result is a matrix of shape (4,6). Compute the kronecker product of the result with a matrix of shape (3,4). The result is a matrix of shape (12,24). Compute the hadamard product of the result with a matrix of shape (12,24). The result is a matrix of shape (12,24). Add the result to a matrix of shape (12,24). The result is a matrix of shape (12,24).. The result is a matrix of shape (12,24).']\n",
      "OLD ['Compute the kronecker product of a matrix of shape (2,2,2) with a matrix of shape (2,3,4). The result is a matrix of shape (2,2,2,2,3,4). Sum the result over the third axis. The result is a matrix of shape (2,2,2,3,4). Add the result to a matrix of shape (4,6). The result is a matrix of shape (4,6). Compute the kronecker product of the result with a matrix of shape (3,4). The result is a matrix of shape (4,6,3,4). Compute the hadamard product of the result with a matrix of shape (12,24). The result is a matrix of shape (4,6,3,4,12,24). Add the result to a matrix of shape (12,24). The result is a matrix of shape (4,6,3,4,12,24).. The result is a matrix of shape (4,6,3,4,12,24).']\n",
      "\n",
      "NEW ['Compute the kronecker product of a matrix of shape (2,3,2) with a matrix of shape (2,4,2). The result is a matrix of shape (2,12,4).. The result is a matrix of shape (2,12,4).']\n",
      "OLD ['Compute the kronecker product of a matrix of shape (2,3,2) with a matrix of shape (2,4,2). The result is a matrix of shape (4,12,4).. The result is a matrix of shape (4,12,4).']\n",
      "\n",
      "NEW ['Compute the kronecker product of a matrix of shape (2,3,3,3) with a matrix of shape (2,2,3,3). The result is a matrix of shape (4,6,9,9).. The result is a matrix of shape (4,6,9,9).']\n",
      "OLD ['Compute the kronecker product of a matrix of shape (2,3,3,3) with a matrix of shape (2,2,3,3). The result is a matrix of shape (2,2,3,3,3,3).. The result is a matrix of shape (2, 2, 3, 3, 3, 3).']\n",
      "\n",
      "NEW ['Add a matrix of shape (3,3,2,3) to a matrix of shape (3,3,2,3). The result is a matrix of shape (3,3,2,3). Multiply the result with a matrix of shape (3,3,3,4). The result is a matrix of shape (3,3,2,4). Multiply the result with a matrix of shape (3,3,4,1). The result is a matrix of shape (3,3,2,1). Sum the result over the first axis. The result is a matrix of shape (2,1). Multiply the result with a matrix of shape (3,1,3). The result is a matrix of shape (2,3).. The result is a matrix of shape (2,3).']\n",
      "OLD ['Add a matrix of shape (3,3,2,3) to a matrix of shape (3,3,2,3). The result is a matrix of shape (3,3,2,3). Multiply the result with a matrix of shape (3,3,3,4). The result is a matrix of shape (3,3,2,4). Multiply the result with a matrix of shape (3,3,4,1). The result is a matrix of shape (3,3,2,1). Sum the result over the first axis. The result is a matrix of shape (3,2,1). Multiply the result with a matrix of shape (3,1,3). The result is a matrix of shape (3,2,3).. The result is a matrix of shape (3,2,3).']\n",
      "\n",
      "NEW ['Add a matrix of shape (3,2,2,3) to a matrix of shape (3,2,2,3). The result is a matrix of shape (3,2,2,3). Subtract the result from a matrix of shape (3,2,2,3). The result is a matrix of shape (3,2,2,3). Transpose the result. The result is a matrix of shape (3,3,2,2). Transpose the result. The result is a matrix of shape (3,2,2,3).. The result is a matrix of shape (3,2,2,3).']\n",
      "OLD ['Add a matrix of shape (3,2,2,3) to a matrix of shape (3,2,2,3). The result is a matrix of shape (3,2,2,3). Subtract the result from a matrix of shape (3,2,2,3). The result is a matrix of shape (3,2,2,3). Transpose the result. The result is a matrix of shape (3,3,2,2). Transpose the result. The result is a matrix of shape (2,2,3,3).. The result is a matrix of shape (2,2,3,3).']\n",
      "\n",
      "NEW ['Multiply a matrix of shape (2,3,3) with a matrix of shape (2,3,1). The result is a matrix of shape (2,3,1). Compute the kronecker product of the result with a matrix of shape (4,2,2). The result is a matrix of shape (8,6,2).. The result is a matrix of shape (8,6,2).']\n",
      "OLD ['Multiply a matrix of shape (2,3,3) with a matrix of shape (2,3,1). The result is a matrix of shape (2,3,1). Compute the kronecker product of the result with a matrix of shape (4,2,2). The result is a matrix of shape (4,6,2).. The result is a matrix of shape (4,6,2).']\n",
      "\n",
      "NEW ['Transpose a matrix of shape (2,3,3). The result is a matrix of shape (3,2,3). Transpose the result. The result is a matrix of shape (3,3,2). Sum the result over the second axis. The result is a matrix of shape (3,2). Compute the kronecker product of the result with a matrix of shape (3,3). The result is a matrix of shape (9,6). Compute the kronecker product of the result with a matrix of shape (2,3). The result is a matrix of shape (18,18).. The result is a matrix of shape (18,18).']\n",
      "OLD ['Transpose a matrix of shape (2,3,3). The result is a matrix of shape (3,3,2). Transpose the result. The result is a matrix of shape (2,3,3). Sum the result over the second axis. The result is a matrix of shape (2,3). Compute the kronecker product of the result with a matrix of shape (3,3). The result is a matrix of shape (6,9). Compute the kronecker product of the result with a matrix of shape (2,3). The result is a matrix of shape (12,27).. The result is a matrix of shape (12,27).']\n",
      "\n",
      "NEW ['Subtract a matrix of shape (3,3,3) from a matrix of shape (3,3,3). The result is a matrix of shape (3,3,3). Compute the kronecker product of the result with a matrix of shape (2,3,3). The result is a matrix of shape (6,3,3). Compute the kronecker product of the result with a matrix of shape (3,4,2). The result is a matrix of shape (18,12,6).. The result is a matrix of shape (18,12,6).']\n",
      "OLD ['Subtract a matrix of shape (3,3,3) from a matrix of shape (3,3,3). The result is a matrix of shape (3,3,3). Compute the kronecker product of the result with a matrix of shape (2,3,3). The result is a matrix of shape (6,9,9). Compute the kronecker product of the result with a matrix of shape (3,4,2). The result is a matrix of shape (18,36,18).. The result is a matrix of shape (18,36,18).']\n",
      "\n",
      "NEW ['Compute the kronecker product of a matrix of shape (2,3,2,3) with a matrix of shape (2,2,3,3). The result is a matrix of shape (4,6,6,6). Compute the kronecker product of the result with a matrix of shape (2,2,3,4). The result is a matrix of shape (8,12,18,24).. The result is a matrix of shape (8,12,18,24).']\n",
      "OLD ['Compute the kronecker product of a matrix of shape (2,3,2,3) with a matrix of shape (2,2,3,3). The result is a matrix of shape (4,6,6,9). Compute the kronecker product of the result with a matrix of shape (2,2,3,4). The result is a matrix of shape (8,12,18,36).. The result is a matrix of shape (8,12,18,36).']\n",
      "\n",
      "NEW ['Transpose a matrix of shape (2,3,3,3). The result is a matrix of shape (3,3,2,3). Subtract the result from a matrix of shape (3,3,3,2). The result is a matrix of shape (3,3,2,2). Sum the result over the third axis. The result is a matrix of shape (3,3,2). Compute the kronecker product of the result with a matrix of shape (4,4,4). The result is a matrix of shape (12,12,8).. The result is a matrix of shape (12,12,8).']\n",
      "OLD ['Transpose a matrix of shape (2,3,3,3). The result is a matrix of shape (3,3,2,3). Subtract the result from a matrix of shape (3,3,3,2). The result is a matrix of shape (3,3,2,2). Sum the result over the third axis. The result is a matrix of shape (3,3,2). Compute the kronecker product of the result with a matrix of shape (4,4,4). The result is a matrix of shape (12,12,4).. The result is a matrix of shape (12,12,4).']\n",
      "\n",
      "NEW ['Multiply a matrix of shape (3,3,2,3) with a matrix of shape (3,3,3,1). The result is a matrix of shape (3,3,2,1). Transpose the result. The result is a matrix of shape (1,2,3,3). Compute the kronecker product of the result with a matrix of shape (4,2,4,4). The result is a matrix of shape (4,8,12,12). Subtract the result from a matrix of shape (4,4,12,12). The result is a matrix of shape (4,8,12,12). Multiply the result with a matrix of shape (4,4,12,1). The result is a matrix of shape (4,8,12,1).. The result is a matrix of shape (4,8,12,1).']\n",
      "OLD ['Multiply a matrix of shape (3,3,2,3) with a matrix of shape (3,3,3,1). The result is a matrix of shape (3,3,2,1). Transpose the result. The result is a matrix of shape (1,2,3,3). Compute the kronecker product of the result with a matrix of shape (4,2,4,4). The result is a matrix of shape (4,8,3,3). Subtract the result from a matrix of shape (4,4,12,12). The result is a matrix of shape (4,4,12,12). Multiply the result with a matrix of shape (4,4,12,1). The result is a matrix of shape (4,4,12,1).. The result is a matrix of shape (4, 4, 12, 1).']\n",
      "\n",
      "NEW ['Compute the kronecker product of a matrix of shape (3,3,3,2) with a matrix of shape (3,2,2,2). The result is a matrix of shape (3,3,3,2,3,2,2,2).. The result is a matrix of shape (3,3,3,2,3,2,2,2).']\n",
      "OLD ['Compute the kronecker product of a matrix of shape (3,3,3,2) with a matrix of shape (3,2,2,2). The result is a matrix of shape (9,6,6,4).. The result is a matrix of shape (9,6,6,4).']\n",
      "\n",
      "NEW ['Compute the kronecker product of a matrix of shape (2,3,2,2) with a matrix of shape (2,3,3,3). The result is a matrix of shape (2,3,2,2,2,3,3,3). Subtract the result from a matrix of shape (4,9,6,6). The result is a matrix of shape (4,9,6,6). Transpose the result. The result is a matrix of shape (6,6,4,9).. The result is a matrix of shape (6,6,4,9).']\n",
      "OLD ['Compute the kronecker product of a matrix of shape (2,3,2,2) with a matrix of shape (2,3,3,3). The result is a matrix of shape (2,3,2,2,2,3,3,3) Subtract the result from a matrix of shape (4,9,6,6). The result is a matrix of shape (4,9,6,6). Transpose the result. The result is a matrix of shape (6,6,9,4).. The result is a matrix of shape (6,6,9,4).']\n",
      "\n",
      "NEW ['Multiply a matrix of shape (2,2,3) with a matrix of shape (2,3,3). The result is a matrix of shape (2,2,3). Compute the hadamard product of the result with a matrix of shape (2,2,3). The result is a matrix of shape (2,2,3). Compute the hadamard product of the result with a matrix of shape (2,2,3). The result is a matrix of shape (2,2,3). Compute the kronecker product of the result with a matrix of shape (3,4,4). The result is a matrix of shape (6,8,12).. The result is a matrix of shape (6, 8, 12).']\n",
      "OLD ['Multiply a matrix of shape (2,2,3) with a matrix of shape (2,3,3). The result is a matrix of shape (2,2,3). Compute the hadamard product of the result with a matrix of shape (2,2,3). The result is a matrix of shape (2,2,3). Compute the hadamard product of the result with a matrix of shape (2,2,3). The result is a matrix of shape (2,2,3). Compute the kronecker product of the result with a matrix of shape (3,4,4). The result is a matrix of shape (6,8,8).. The result is a matrix of shape (6,8,8).']\n",
      "\n",
      "NEW ['Transpose a matrix of shape (2,3,2,3). The result is a matrix of shape (3,2,3,2). Compute the kronecker product of the result with a matrix of shape (3,2,3,3). The result is a matrix of shape (9,6,9,6). Sum the result over the third axis. The result is a matrix of shape (9,6,6). Sum the result over the first axis. The result is a matrix of shape (6,6).. The result is a matrix of shape (6,6).']\n",
      "OLD ['Transpose a matrix of shape (2,3,2,3). The result is a matrix of shape (3,2,3,2). Compute the kronecker product of the result with a matrix of shape (3,2,3,3). The result is a matrix of shape (9,4,9,6). Sum the result over the third axis. The result is a matrix of shape (9,4,6). Sum the result over the first axis. The result is a matrix of shape (4,6).. The result is a matrix of shape (4,6).']\n",
      "\n",
      "NEW ['Transpose a matrix of shape (3,3,2,3). The result is a matrix of shape (3,2,3,3). Sum the result over the second axis. The result is a matrix of shape (3,3,3).. The result is a matrix of shape (3,3,3).']\n",
      "OLD ['Transpose a matrix of shape (3,3,2,3). The result is a matrix of shape (2,3,3,3). Sum the result over the second axis. The result is a matrix of shape (2,3,3).. The result is a matrix of shape (2,3,3).']\n",
      "\n",
      "NEW ['Compute the hadamard product of a matrix of shape (3,3,3) with a matrix of shape (3,3,3). The result is a matrix of shape (3,3,3). Multiply the result with a matrix of shape (3,3,1). The result is a matrix of shape (3,3,1). Subtract the result from a matrix of shape (3,3,1). The result is a matrix of shape (3,3,1). Add the result to a matrix of shape (3,3,1). The result is a matrix of shape (3,3,1). Sum the result over the first axis. The result is a matrix of shape (3,1). Compute the kronecker product of the result with a matrix of shape (4,3). The result is a matrix of shape (4,3).. The result is a matrix of shape (4,3).']\n",
      "OLD ['Compute the hadamard product of a matrix of shape (3,3,3) with a matrix of shape (3,3,3). The result is a matrix of shape (3,3,3). Multiply the result with a matrix of shape (3,3,1). The result is a matrix of shape (3,3,3). Subtract the result from a matrix of shape (3,3,1). The result is a matrix of shape (3,3,1). Add the result to a matrix of shape (3,3,1). The result is a matrix of shape (3,3,1). Sum the result over the first axis. The result is a matrix of shape (3,1). Compute the kronecker product of the result with a matrix of shape (4,3). The result is a matrix of shape (12,3).. The result is a matrix of shape (12,3).']\n",
      "\n",
      "NEW ['Transpose a matrix of shape (2,3,3). The result is a matrix of shape (3,2,3).. The result is a matrix of shape (3,2,3).']\n",
      "OLD ['Transpose a matrix of shape (2,3,3). The result is a matrix of shape (3,3,2). The result is a matrix of shape (3,3,2)']\n",
      "\n",
      "NEW ['Transpose a matrix of shape (2,3,2,2). The result is a matrix of shape (2,2,3,2).. The result is a matrix of shape (2,2,3,2).']\n",
      "OLD ['Transpose a matrix of shape (2,3,2,2). The result is a matrix of shape (2,2,2,3).. The result is a matrix of shape (2,2,2,3)']\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "25"
      ]
     },
     "execution_count": 1025,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d = 0\n",
    "files_100 = set()\n",
    "for r in data(\"9f8b248c\").iloc:\n",
    "    correct = r.prediction == r.target\n",
    "    i = r.query_file.rsplit(\"/\",1)[-1]\n",
    "    files_100.add(i)\n",
    "    new_r = rows_by_id[i]\n",
    "    new_correct = new_r.prediction == r.target\n",
    "    if correct != new_correct:\n",
    "        d += 1\n",
    "        print(\"NEW\", [new_r.model_result])\n",
    "        print(\"OLD\", [r.model_result])\n",
    "        print()\n",
    "d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1024,
   "id": "e29ad871-74f6-455b-b35b-03040663442e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "100"
      ]
     },
     "execution_count": 1024,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(files_1000) - len(files_1000 - files_100)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "64b94d0c-4526-40df-82fe-b0646156a6fc",
   "metadata": {},
   "source": [
    "##### Curie"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 811,
   "id": "f8f90ae8-d2f3-4e3e-b8a4-d71f2ff26d00",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>model</th>\n",
       "      <th>decoder</th>\n",
       "      <th>kwargs</th>\n",
       "      <th>num_samples</th>\n",
       "      <th>cost</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>evaluation_time</th>\n",
       "      <th>shots</th>\n",
       "      <th>errors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>233</th>\n",
       "      <td>matrixshapes@ao_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.AnswerOnlyMatrixshapes'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 9740, 'cost': 0.01948}</td>\n",
       "      <td>2023:03:16_18:15:43</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>36b7d8fa</td>\n",
       "      <td>0.04</td>\n",
       "      <td>55m 40s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>matrixshapes@cot_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.VanillaCoTExpressionEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 109671, 'cost': 0.219342}</td>\n",
       "      <td>2023:03:16_18:14:48</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>6731bafa</td>\n",
       "      <td>0.07</td>\n",
       "      <td>56m 35s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>243</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 78404, 'cost': 0.156808}</td>\n",
       "      <td>2023:03:16_18:10:53</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>1e9a462b</td>\n",
       "      <td>0.01</td>\n",
       "      <td>1h 30s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>beam_var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 540158, 'cost': 1.080316}</td>\n",
       "      <td>2023:03:16_18:20:39</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>23119bd4</td>\n",
       "      <td>0.00</td>\n",
       "      <td>50m 44s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 284397, 'cost': 0.568794}</td>\n",
       "      <td>2023:03:16_18:16:16</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>b13a91fe</td>\n",
       "      <td>0.00</td>\n",
       "      <td>55m 7s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             task                  model   decoder  \\\n",
       "233        matrixshapes@ao_medium  openai/text-curie-001    argmax   \n",
       "29        matrixshapes@cot_medium  openai/text-curie-001    argmax   \n",
       "243  matrixshapes@multivar_medium  openai/text-curie-001    argmax   \n",
       "84   matrixshapes@multivar_medium  openai/text-curie-001  beam_var   \n",
       "116  matrixshapes@multivar_medium  openai/text-curie-001       var   \n",
       "\n",
       "                                                                                                                                                                                                     kwargs  \\\n",
       "233               {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.AnswerOnlyMatrixshapes'>}   \n",
       "29   {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.VanillaCoTExpressionEvaluationSuite'>}   \n",
       "243          {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "84           {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "116          {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "\n",
       "     num_samples                                  cost            timestamp  \\\n",
       "233          100     {'tokens': 9740, 'cost': 0.01948}  2023:03:16_18:15:43   \n",
       "29           100  {'tokens': 109671, 'cost': 0.219342}  2023:03:16_18:14:48   \n",
       "243          100   {'tokens': 78404, 'cost': 0.156808}  2023:03:16_18:10:53   \n",
       "84           100  {'tokens': 540158, 'cost': 1.080316}  2023:03:16_18:20:39   \n",
       "116          100  {'tokens': 284397, 'cost': 0.568794}  2023:03:16_18:16:16   \n",
       "\n",
       "           result      hash  accuracy evaluation_time  shots  errors  \n",
       "233  <TaskResult>  36b7d8fa      0.04     55m 40s ago      0       0  \n",
       "29   <TaskResult>  6731bafa      0.07     56m 35s ago      0       0  \n",
       "243  <TaskResult>  1e9a462b      0.01      1h 30s ago      0       0  \n",
       "84   <TaskResult>  23119bd4      0.00     50m 44s ago      0       0  \n",
       "116  <TaskResult>  b13a91fe      0.00      55m 7s ago      0       0  "
      ]
     },
     "execution_count": 811,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "r = results(additional=[accuracy,num_samples,evaluation_time,shots,errors], task=lambda t: \"matrixshapes\" in t and \"medium\" in t, model=\"curie\", shots=0)\n",
    "r = r.sort_values([\"task\", \"decoder\", \"timestamp\"], ascending=[True, True, False])\n",
    "r\n",
    "# TODO: make check CoT for date understanding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 777,
   "id": "704ecff9-c246-48de-bb21-ccbd05b6f1a6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>model</th>\n",
       "      <th>decoder</th>\n",
       "      <th>kwargs</th>\n",
       "      <th>num_samples</th>\n",
       "      <th>cost</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>evaluation_time</th>\n",
       "      <th>shots</th>\n",
       "      <th>errors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>278</th>\n",
       "      <td>matrixshapes@ao_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.AnswerOnlyMatrixshapes'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 21673, 'cost': 0.043345999999999996}</td>\n",
       "      <td>2023:03:16_18:41:53</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>6d6831aa</td>\n",
       "      <td>0.02</td>\n",
       "      <td>7m 21s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>matrixshapes@cot_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.VanillaCoTExpressionEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 249338, 'cost': 0.498676}</td>\n",
       "      <td>2023:03:16_18:45:23</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>5aaf9448</td>\n",
       "      <td>0.06</td>\n",
       "      <td>3m 51s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>163</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-curie-001</td>\n",
       "      <td>argmax</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 355204, 'cost': 0.710408}</td>\n",
       "      <td>2023:03:16_18:12:40</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>c6edb7c4</td>\n",
       "      <td>0.03</td>\n",
       "      <td>36m 34s ago</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             task                  model decoder  \\\n",
       "278        matrixshapes@ao_medium  openai/text-curie-001  argmax   \n",
       "43        matrixshapes@cot_medium  openai/text-curie-001  argmax   \n",
       "163  matrixshapes@multivar_medium  openai/text-curie-001  argmax   \n",
       "\n",
       "                                                                                                                                                                                                     kwargs  \\\n",
       "278               {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.AnswerOnlyMatrixshapes'>}   \n",
       "43   {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.VanillaCoTExpressionEvaluationSuite'>}   \n",
       "163          {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "\n",
       "     num_samples                                             cost  \\\n",
       "278          100  {'tokens': 21673, 'cost': 0.043345999999999996}   \n",
       "43           100             {'tokens': 249338, 'cost': 0.498676}   \n",
       "163          100             {'tokens': 355204, 'cost': 0.710408}   \n",
       "\n",
       "               timestamp        result      hash  accuracy evaluation_time  \\\n",
       "278  2023:03:16_18:41:53  <TaskResult>  6d6831aa      0.02      7m 21s ago   \n",
       "43   2023:03:16_18:45:23  <TaskResult>  5aaf9448      0.06      3m 51s ago   \n",
       "163  2023:03:16_18:12:40  <TaskResult>  c6edb7c4      0.03     36m 34s ago   \n",
       "\n",
       "     shots  errors  \n",
       "278      2       0  \n",
       "43       2       0  \n",
       "163      2       0  "
      ]
     },
     "execution_count": 777,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "\n",
    "# results(...) is called with a task filter (names containing both \"matrixshapes\"\n",
    "# and \"medium\"), model=\"curie\" and shots=2; `additional` lists the extra columns\n",
    "# (accuracy, num_samples, evaluation_time, shots, errors) shown in the table below.\n",
    "# Rows are ordered by task and decoder ascending, then newest timestamp first.\n",
    "r = results(\n",
    "    additional=[accuracy, num_samples, evaluation_time, shots, errors],\n",
    "    task=lambda t: \"matrixshapes\" in t and \"medium\" in t,\n",
    "    model=\"curie\",\n",
    "    shots=2,\n",
    ").sort_values([\"task\", \"decoder\", \"timestamp\"], ascending=[True, True, False])\n",
    "r\n",
    "# TODO: make check CoT for date understanding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c63d19b2-66e4-4312-8c95-29cd38d1c231",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print model_result, target and prediction for each row of data(\"23119bd4\"),\n",
    "# separated by blank lines. The argument looks like a value from the `hash`\n",
    "# column of the results table -- TODO confirm.\n",
    "# The loop variable is `row`, not `r`: a top-level loop variable stays bound after\n",
    "# the loop and would overwrite the results table `r` built by the previous cell.\n",
    "for row in data(\"23119bd4\").iloc:\n",
    "    print(row.model_result, row.target, row.prediction, end=\"\\n\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
