{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 266,
   "id": "e181fa1c-d822-4263-a9f1-826f89039b81",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "# Reload edited modules automatically before each cell runs (autoreload mode 2).\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "# NOTE(review): the helpers used in this notebook (accuracy, table_all, results, compare, ...)\n",
    "# and presumably `pd` as well arrive through this star import -- confirm against results.py.\n",
    "from results import *\n",
    "# pandas: show full column contents instead of truncating long cells\n",
    "pd.set_option('display.max_colwidth', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "id": "7947e2ff-8701-4756-a6d5-3972a6bb9a42",
   "metadata": {},
   "outputs": [],
   "source": [
    "def value_normalizer(v):\n",
    "    \"\"\"Canonicalize an answer value before comparison (used for GSM8K evaluation).\n",
    "\n",
    "    The value is converted to a string, leading/trailing periods, spaces and\n",
    "    single quotes are stripped, and one trailing \".0\" is dropped, so that\n",
    "    5.0, \"5.\" and \" '5' \" all normalize to \"5\".\n",
    "    \"\"\"\n",
    "    text = str(v).strip(\". '\")\n",
    "    # Drop the \".0\" suffix explicitly: rstrip(\".0\") strips a character set\n",
    "    # and would turn \"100\" into \"1\".\n",
    "    return text[:-2] if text.endswith(\".0\") else text\n",
    "\n",
    "\n",
    "def acc(v):\n",
    "    \"\"\"More lenient accuracy: call `accuracy` with `value_normalizer` passed as `t`.\"\"\"\n",
    "    return accuracy(v, t=value_normalizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 392,
   "id": "73c07b45-df85-44cb-9528-2787f5b1454d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Date Understanding ,       0.48,      0.70,      0.75,      0.75,      0.69,      0.72,      0.73,      0.66,      0.69\n",
      "Information Essentiality ,       0.01,      0.25,      0.06,      0.15\n",
      "AQuA ,       0.31,      0.37,      0.37,      0.35,      0.35,      0.40,      0.47,      0.35,      0.37\n",
      "StrategyQA ,       0.68,      0.71,      0.72,      0.67,      0.67,      0.69,      0.77,      0.66,      0.65\n",
      "multistep ,       0.20,      0.43,      0.44,      0.49,      0.44,      0.45,      0.48,      0.38,      0.45\n",
      "GSM8K ,       0.08,      0.56,      0.58,      0.65,      0.58,      0.57,      0.53,      0.59,      0.63\n",
      "Shuffled Obj. ,       0.19,      0.62,      0.47,      0.52,      0.52,      0.64,      0.62,      0.66,      0.33\n",
      "Matrix Shapes. ,       0.61,      0.77,      0.77,      0.71,      0.76,      0.81,      0.79,      0.85,      0.79\n"
     ]
    }
   ],
   "source": [
    "# Main results table: one printed row per task, one value per listed result hash.\n",
    "# Unless noted otherwise, each list is grouped as ao (1 run), cot (4 runs), mvp (4 runs).\n",
    "table_all({\n",
    "    \"Date Understanding\": [\n",
    "        # ao\n",
    "        \"a10edc95\",\n",
    "        # cot\n",
    "        \"c4e70e4b\", \"39f97e86\", \"917ab73e\", \"d303e93d\", \n",
    "        # mvp\n",
    "        \"d472bfd5\", \"87b4b0e2\", \"712d9396\", \"6cda71e6\"\n",
    "    ],\n",
    "    \"Information Essentiality\": [\n",
    "        # NOTE(review): only four runs and no ao/cot/mvp labels here -- confirm which configuration each hash is\n",
    "        \"2f5cec53\", \"181ca06b\", \"b9dfe2d6\", \"28da5e8b\"\n",
    "    ],\n",
    "    \"AQuA\": [\n",
    "        # ao\n",
    "        \"64968482\", \n",
    "        # cot\n",
    "        \"bfbb09ea\", \"efc16fb7\", \"aaa39758\", \"dc5bd7ea\",\n",
    "        # mvp\n",
    "        \"e32dd7dd\", \"29e97f97\", \"d19382f3\", \"07c78cdc\"\n",
    "    ],\n",
    "    \"StrategyQA\": [\n",
    "        # ao\n",
    "        \"9c78e2be\", \n",
    "        # cot\n",
    "        \"e4ba3a5a\", \"d0e19cc9\", \"21665325\", \"921c8605\",\n",
    "        # mvp\n",
    "        \"5178a90c\", \"40a82b05\", \"a8b1ecf3\", \"a297d325\"\n",
    "    ],\n",
    "    \"multistep\": [\n",
    "        # ao\n",
    "        \"57e0a00d\",\n",
    "        # cot\n",
    "        \"4b8b85ff\", \"5b930750\", \"0d5662d3\", \"f7e244b9\",\n",
    "        # mvp\n",
    "        \"b4ef7d92\", \"8fd9ef97\", \"0408ff36\", \"74fb1c39\"\n",
    "    ],\n",
    "    \"GSM8K\": [\n",
    "        acc, # a callable instead of a hash: will be used to compute accuracy for this row\n",
    "        # ao\n",
    "        \"1ec5e546\",\n",
    "        # cot\n",
    "        \"96d57a9b\", \"bb25c840\", \"093a15a8\", \"0c3a8fb3\",\n",
    "        # mvp\n",
    "        \"299eb5b8\", \"1519f3ba\", \"06e576af\", \"44d0ce6d\"\n",
    "    ],\n",
    "    \"Shuffled Obj.\": [\n",
    "        # ao\n",
    "        \"555ef80a\",\n",
    "        # cot\n",
    "        \"26f4511c\", \"8e79838b\", \"2f27ca41\", \"70257bc3\", \n",
    "        # mvp\n",
    "        \"f81b3fd0\", \"db180ab9\", \"fe1e1a46\", \"18b2ada0\"\n",
    "    ], \n",
    "    \"Matrix Shapes.\": [\n",
    "        # ao\n",
    "        \"bb7cf17a\",\n",
    "        # cot\n",
    "        \"491acd13\", \"95722aa2\", \"ebb0c67e\", \"b71518c0\",\n",
    "        # mvp\n",
    "        \"859b2d5d\", \"f1b66902\", \"8c6d2180\", \"3df264e0\"\n",
    "    ]\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 431,
   "id": "26242b99-e06e-4133-a7b7-a479421666a4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>model</th>\n",
       "      <th>decoder</th>\n",
       "      <th>kwargs</th>\n",
       "      <th>num_samples</th>\n",
       "      <th>cost</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "      <th>acc</th>\n",
       "      <th>evaluation_time</th>\n",
       "      <th>shots</th>\n",
       "      <th>errors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '2'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 275849, 'cost': 5.51698}</td>\n",
       "      <td>2023:03:17_00:35:01</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>1711003c</td>\n",
       "      <td>0.71</td>\n",
       "      <td>11m 1s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 274064, 'cost': 5.481280000000001}</td>\n",
       "      <td>2023:03:16_20:03:07</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>398fa890</td>\n",
       "      <td>0.73</td>\n",
       "      <td>4h 42m 55s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '4'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 278060, 'cost': 5.5612}</td>\n",
       "      <td>2023:03:17_00:30:22</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>8c9b7254</td>\n",
       "      <td>0.63</td>\n",
       "      <td>15m 40s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>166</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>beam_var</td>\n",
       "      <td>{'kwargs': {'max_length': 512, 'top1_distribution': True, 'num_beams': '2', 'prune': '1.05'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 203708, 'cost': 4.07416}</td>\n",
       "      <td>2023:01:25_17:15:38</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>f1b66902</td>\n",
       "      <td>0.79</td>\n",
       "      <td>7w 1d 7h 30m 24s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>188</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': 1024, 'top1_distribution': True},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 271485, 'cost': 5.4297}</td>\n",
       "      <td>2023:01:25_17:46:28</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>8c6d2180</td>\n",
       "      <td>0.85</td>\n",
       "      <td>7w 1d 6h 59m 34s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>268</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 260156, 'cost': 5.20312}</td>\n",
       "      <td>2023:03:17_00:22:33</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>e2f3a650</td>\n",
       "      <td>0.75</td>\n",
       "      <td>23m 29s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>301</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 0, 'cost': 0.0}</td>\n",
       "      <td>2023:03:16_20:01:47</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>79fd6b95</td>\n",
       "      <td>0.00</td>\n",
       "      <td>4h 44m 15s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '2'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 186660, 'cost': 3.7332}</td>\n",
       "      <td>2023:03:17_00:43:31</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>51173e96</td>\n",
       "      <td>0.84</td>\n",
       "      <td>2m 31s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>322</th>\n",
       "      <td>matrixshapes@multivar_medium</td>\n",
       "      <td>openai/text-davinci-003</td>\n",
       "      <td>var</td>\n",
       "      <td>{'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '2'},'size': medium,'suite': &lt;class 'eval-matrixshapes.MatrixShapesEvaluationSuite'&gt;}</td>\n",
       "      <td>100</td>\n",
       "      <td>{'tokens': 187357, 'cost': 3.74714}</td>\n",
       "      <td>2023:03:17_00:40:40</td>\n",
       "      <td>&lt;TaskResult&gt;</td>\n",
       "      <td>18244b61</td>\n",
       "      <td>0.80</td>\n",
       "      <td>5m 22s ago</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             task                    model   decoder  \\\n",
       "5    matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "31   matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "106  matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "166  matrixshapes@multivar_medium  openai/text-davinci-003  beam_var   \n",
       "188  matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "268  matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "301  matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "321  matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "322  matrixshapes@multivar_medium  openai/text-davinci-003       var   \n",
       "\n",
       "                                                                                                                                                                                                       kwargs  \\\n",
       "5    {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '2'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "31             {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "106  {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '4'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "166                             {'kwargs': {'max_length': 512, 'top1_distribution': True, 'num_beams': '2', 'prune': '1.05'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "188                                                               {'kwargs': {'max_length': 1024, 'top1_distribution': True},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "268            {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "301            {'kwargs': {'max_length': '1024', 'top1_distribution': True, 'step_budget': '1024', 'openai_chunksize': '32'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "321  {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '2'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "322  {'kwargs': {'max_length': '2048', 'top1_distribution': True, 'step_budget': '2048', 'openai_chunksize': '32', 'n': '2'},'size': medium,'suite': <class 'eval-matrixshapes.MatrixShapesEvaluationSuite'>}   \n",
       "\n",
       "     num_samples                                           cost  \\\n",
       "5            100            {'tokens': 275849, 'cost': 5.51698}   \n",
       "31           100  {'tokens': 274064, 'cost': 5.481280000000001}   \n",
       "106          100             {'tokens': 278060, 'cost': 5.5612}   \n",
       "166          100            {'tokens': 203708, 'cost': 4.07416}   \n",
       "188          100             {'tokens': 271485, 'cost': 5.4297}   \n",
       "268          100            {'tokens': 260156, 'cost': 5.20312}   \n",
       "301          100                     {'tokens': 0, 'cost': 0.0}   \n",
       "321          100             {'tokens': 186660, 'cost': 3.7332}   \n",
       "322          100            {'tokens': 187357, 'cost': 3.74714}   \n",
       "\n",
       "               timestamp        result      hash   acc       evaluation_time  \\\n",
       "5    2023:03:17_00:35:01  <TaskResult>  1711003c  0.71            11m 1s ago   \n",
       "31   2023:03:16_20:03:07  <TaskResult>  398fa890  0.73        4h 42m 55s ago   \n",
       "106  2023:03:17_00:30:22  <TaskResult>  8c9b7254  0.63           15m 40s ago   \n",
       "166  2023:01:25_17:15:38  <TaskResult>  f1b66902  0.79  7w 1d 7h 30m 24s ago   \n",
       "188  2023:01:25_17:46:28  <TaskResult>  8c6d2180  0.85  7w 1d 6h 59m 34s ago   \n",
       "268  2023:03:17_00:22:33  <TaskResult>  e2f3a650  0.75           23m 29s ago   \n",
       "301  2023:03:16_20:01:47  <TaskResult>  79fd6b95  0.00        4h 44m 15s ago   \n",
       "321  2023:03:17_00:43:31  <TaskResult>  51173e96  0.84            2m 31s ago   \n",
       "322  2023:03:17_00:40:40  <TaskResult>  18244b61  0.80            5m 22s ago   \n",
       "\n",
       "     shots  errors  \n",
       "5        0       0  \n",
       "31       0       0  \n",
       "106      0       0  \n",
       "166      0       0  \n",
       "188      0       0  \n",
       "268      0       0  \n",
       "301      0     100  \n",
       "321      0       0  \n",
       "322      0       0  "
      ]
     },
     "execution_count": 431,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Zero-shot text-davinci runs of the medium matrix-shapes multivar task.\n",
    "# NOTE(review): the stored output also contains a beam_var row, so decoder=\"var\"\n",
    "# appears to match by substring rather than exactly -- confirm in results.py.\n",
    "r = results(\n",
    "    additional=[acc, num_samples, evaluation_time, shots, errors],\n",
    "    task=lambda name: all(part in name for part in (\"matrix\", \"multi\", \"medium\")),\n",
    "    shots=0,\n",
    "    decoder=\"var\",\n",
    "    model=\"davinci\",\n",
    ")\n",
    "r"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 476,
   "id": "3946efa3-00b6-4411-9afa-db3deaa546a4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['results/sqauni@cot_medium-openai-text-davinci-003-var-2023:01:26_19:14:46.json',\n",
       " 'results/sqauni@cot_medium-openai-text-davinci-003-beam_var-2023:01:26_19:31:22.json']"
      ]
     },
     "execution_count": 476,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Zero-shot text-davinci runs of the medium sqauni chain-of-thought task.\n",
    "r = results(\n",
    "    additional=[acc, num_samples, evaluation_time, shots, errors],\n",
    "    task=lambda name: all(part in name for part in (\"sqauni\", \"cot\", \"medium\")),\n",
    "    shots=0,\n",
    "    decoder=\"var\",\n",
    "    model=\"davinci\",\n",
    ")\n",
    "# List the result file behind every matching run.\n",
    "[run.path for run in r.result]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9226f52-25c3-492c-ab3b-3b41d44f5a6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Side-by-side dump of two matrix-shapes runs: e2f3a650 (2023-03-17) vs. 8c6d2180 (2023-01-25).\n",
    "# NOTE(review): compare() presumably yields aligned per-sample result pairs -- confirm in results.py.\n",
    "for r1,r2 in compare(\"e2f3a650\", \"8c6d2180\"):\n",
    "    # wrapping the value in a list makes print() show its repr, so quotes and escapes stay visible\n",
    "    print(\"NEW\", [r1.model_result])\n",
    "    print(\"OLD\", [r2.model_result], end=\"\\n\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 468,
   "id": "b280434a-0475-4423-804e-eb14964f2f1b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Two-Shot Results\n",
      "ao @ argmax, cot @ argmax , mvp @ argmax , mvp @ var\n",
      "AQuA ,       0.29,      0.45,      0.46,      0.44\n",
      "StrategyQA ,       0.67,      0.74,      0.78,      0.78\n",
      "Shuffled Obj. ,       0.10,      0.46,      0.57,      0.57\n",
      "Matrix Shapes. ,       0.67,      0.76,      0.81,      0.77\n",
      "\n",
      "Files:\n",
      "AQuA\n",
      "results/AQUA@ao_medium-openai-text-davinci-003-argmax-2023:03:15_15:59:52.csv\n",
      "results/AQUA@cot_medium-openai-text-davinci-003-argmax-2023:03:15_15:41:11.csv\n",
      "results/AQUA@dash_multivar_medium-openai-text-davinci-003-argmax-2023:03:15_15:42:50.csv\n",
      "results/AQUA@dash_multivar_medium-openai-text-davinci-003-var-2023:03:15_16:06:32.csv\n",
      "StrategyQA\n",
      "results/sqauni@ao_medium-openai-text-davinci-003-argmax-2023:03:15_18:42:02.csv\n",
      "results/sqauni@cot_medium-openai-text-davinci-003-argmax-2023:03:15_18:39:23.csv\n",
      "results/sqauni@multivar2_medium-openai-text-davinci-003-argmax-2023:03:15_18:40:38.csv\n",
      "results/sqauni@multivar2_medium-openai-text-davinci-003-var-2023:03:15_18:46:47.csv\n",
      "Shuffled Obj.\n",
      "results/tracking_shuffled_objects@ao_medium-openai-text-davinci-003-argmax-2023:03:16_15:01:53.csv\n",
      "results/tracking_shuffled_objects@cot_medium-openai-text-davinci-003-argmax-2023:03:16_15:49:53.csv\n",
      "results/tracking_shuffled_objects@multivar2_medium-openai-text-davinci-003-argmax-2023:03:16_15:55:53.csv\n",
      "results/tracking_shuffled_objects@multivar2_medium-openai-text-davinci-003-var-2023:03:16_17:52:49.csv\n",
      "Matrix Shapes.\n",
      "results/matrixshapes@ao_medium-openai-text-davinci-003-argmax-2023:03:16_17:16:49.csv\n",
      "results/matrixshapes@cot_medium-openai-text-davinci-003-argmax-2023:03:16_17:17:37.csv\n",
      "results/matrixshapes@multivar_medium-openai-text-davinci-003-argmax-2023:03:16_17:18:31.csv\n",
      "results/matrixshapes@multivar_medium-openai-text-davinci-003-var-2023:03:16_18:02:22.csv\n"
     ]
    }
   ],
   "source": [
    "print(\"Two-Shot Results\")\n",
    "print(\"ao @ argmax, cot @ argmax , mvp @ argmax , mvp @ var\")\n",
    "# Each row lists its hashes in header order: ao, cot, mvp (argmax), mvp (var).\n",
    "table_all({\n",
    "    \"AQuA\":           [\"56c0d98b\", \"ff5c9699\", \"648c0fc3\", \"de08b397\"],\n",
    "    \"StrategyQA\":     [\"cf223ec2\", \"30f3d66e\", \"c29b5c82\", \"22a1913c\"],\n",
    "    \"Shuffled Obj.\":  [\"4acb520f\", \"6732d792\", \"d8d0c2fb\", \"77e5751a\"],\n",
    "    \"Matrix Shapes.\": [\"9c2eaa46\", \"9afda2d7\", \"344b97e2\", \"812ef35a\"],\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 472,
   "id": "3800496e-fb16-436f-8c3a-08c0724c7c34",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "openai/text-curie-001 results\n",
      "ao @ argmax, cot @ argmax , mvp @ argmax , mvp @ var\n",
      "AQuA ,       0.16,      0.24,      0.27,      0.17\n",
      "StrategyQA ,       0.46,      0.53,      0.58,      0.52\n",
      "Shuffled Obj. ,       0.18,      0.19,      0.22,      0.24\n",
      "Matrix Shapes. ,       0.04,      0.07,      0.01,      0.00\n",
      "\n",
      "Files:\n",
      "AQuA\n",
      "results/AQUA@ao_medium-openai-text-curie-001-argmax-2023:03:16_18:54:24.csv\n",
      "results/AQUA@cot_medium-openai-text-curie-001-argmax-2023:03:15_17:40:40.csv\n",
      "results/AQUA@multivar_medium-openai-text-curie-001-var-2023:03:17_11:18:06.csv\n",
      "results/AQUA@dash_multivar_medium-openai-text-curie-001-argmax-2023:03:15_17:58:32.csv\n",
      "StrategyQA\n",
      "results/sqauni@ao_medium-openai-text-curie-001-argmax-2023:03:16_18:24:14.csv\n",
      "results/sqauni@cot_medium-openai-text-curie-001-argmax-2023:03:16_18:25:00.csv\n",
      "results/sqauni@multivar2_medium-openai-text-curie-001-argmax-2023:03:16_18:27:28.csv\n",
      "results/sqauni@multivar2_medium-openai-text-curie-001-var-2023:03:17_11:09:17.csv\n",
      "Shuffled Obj.\n",
      "results/tracking_shuffled_objects@ao_medium-openai-text-curie-001-argmax-2023:03:16_18:31:08.csv\n",
      "results/tracking_shuffled_objects@cot_medium-openai-text-curie-001-argmax-2023:03:16_18:33:35.csv\n",
      "results/tracking_shuffled_objects@multivar2_medium-openai-text-curie-001-argmax-2023:03:16_18:35:15.csv\n",
      "results/tracking_shuffled_objects@multivar2_medium-openai-text-curie-001-var-2023:03:17_11:01:12.csv\n",
      "Matrix Shapes.\n",
      "results/matrixshapes@ao_medium-openai-text-curie-001-argmax-2023:03:16_18:15:43.csv\n",
      "results/matrixshapes@cot_medium-openai-text-curie-001-argmax-2023:03:16_18:14:48.csv\n",
      "results/matrixshapes@multivar_medium-openai-text-curie-001-argmax-2023:03:16_18:10:53.csv\n",
      "results/matrixshapes@multivar_medium-openai-text-curie-001-var-2023:03:16_18:16:16.csv\n"
     ]
    }
   ],
   "source": [
    "print(\"openai/text-curie-001 results\")\n",
    "print(\"ao @ argmax, cot @ argmax , mvp @ argmax , mvp @ var\")\n",
    "# Hashes in every row must follow the header's column order:\n",
    "# ao @ argmax, cot @ argmax, mvp @ argmax, mvp @ var.\n",
    "table_all({\n",
    "    \"AQuA\": [\n",
    "        # ao\n",
    "        \"b99bc1ff\",\n",
    "        # cot\n",
    "        \"e6642dd8\",\n",
    "        # mvp: argmax first, then var. The previously recorded file list showed 27541b43 as the\n",
    "        # var run and 60cc1e52 as the argmax run, i.e. the two were listed in swapped columns.\n",
    "        # Re-run this cell to refresh the stored output.\n",
    "        # NOTE(review): 27541b43 is an AQUA@multivar run while 60cc1e52 is AQUA@dash_multivar;\n",
    "        # confirm whether the var column should use a dash_multivar run instead.\n",
    "        \"60cc1e52\", \"27541b43\"\n",
    "    ],\n",
    "    \"StrategyQA\": [\n",
    "        # ao\n",
    "        \"d20d051e\",\n",
    "        # cot\n",
    "        \"6a8b9a06\",\n",
    "        # mvp\n",
    "        \"d5ea2c80\", \"4828042e\",\n",
    "    ],\n",
    "    \"Shuffled Obj.\": [\n",
    "        # ao\n",
    "        \"0c0e6b3e\",\n",
    "        # cot\n",
    "        \"1359a69f\",\n",
    "        # mvp\n",
    "        \"fda8ed89\", \"90980c73\"\n",
    "    ],\n",
    "    \"Matrix Shapes.\": [\n",
    "        # ao\n",
    "        \"36b7d8fa\",\n",
    "        # cot\n",
    "        \"6731bafa\",\n",
    "        # mvp\n",
    "        \"1e9a462b\", \"b13a91fe\"\n",
    "    ]\n",
    "})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "toc-autonumbering": true
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
