{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Work from the parent directory: the later cells refer to files by paths\n",
    "# relative to it (e.g. \"results/...\").\n",
    "# The starting directory is remembered the first time this cell runs, so\n",
    "# re-running the cell always lands in the same place instead of climbing one\n",
    "# more level on every execution.\n",
    "if \"_NOTEBOOK_START_DIR\" not in globals():\n",
    "    _NOTEBOOK_START_DIR = os.getcwd()\n",
    "os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🚀 Starting Evaluation...\n",
      "Loading results/wtq-cot-all/result_5.jsonl...\n",
      "Loading results/wtq-cot-all/result_5.jsonl...\n",
      "Loaded 4344 results.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Progress: 100%|██████████| 100/100 [00:22<00:00,  4.39batch/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🏁 Evaluation Complete.\n",
      "📊 Statistical Summary of 100 Trials on 4344 Examples from Combined Checkpoints\n",
      "Min Accuracy: 63.42% (2755.0/4344)\n",
      "Max Accuracy: 64.20% (2789.0/4344)\n",
      "Mean Accuracy: 63.84% (2773.0/4344)\n",
      "Standard Deviation: 0.17%\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "from evaluate import eval_wtq\n",
    "\n",
    "# CoT with 5-way self-consistency; statistics are gathered over 100 shuffled trials\n",
    "eval_wtq(checkpoints=\"results/wtq-cot-all/result_5.jsonl\", n_times=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🚀 Starting Evaluation...\n",
      "Loading ['results/wtq-agent-all/result_sc1.jsonl', 'results/wtq-agent-all/result_sc2.jsonl', 'results/wtq-agent-all/result_sc3.jsonl', 'results/wtq-agent-all/result_sc4.jsonl', 'results/wtq-agent-all/result_sc5.jsonl']...\n",
      "Loading results/wtq-agent-all/result_sc1.jsonl...\n",
      "Loaded 4344 results.\n",
      "Loading results/wtq-agent-all/result_sc2.jsonl...\n",
      "Loaded 4344 results.\n",
      "Loading results/wtq-agent-all/result_sc3.jsonl...\n",
      "Loaded 4344 results.\n",
      "Loading results/wtq-agent-all/result_sc4.jsonl...\n",
      "Loaded 4344 results.\n",
      "Loading results/wtq-agent-all/result_sc5.jsonl...\n",
      "Loaded 4344 results.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Progress: 100%|██████████| 100/100 [00:34<00:00,  2.88batch/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🏁 Evaluation Complete.\n",
      "📊 Statistical Summary of 100 Trials on 4344 Examples from Combined Checkpoints\n",
      "Min Accuracy: 60.64% (2634.0/4344)\n",
      "Max Accuracy: 61.56% (2674.0/4344)\n",
      "Mean Accuracy: 61.08% (2653.0/4344)\n",
      "Standard Deviation: 0.17%\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# PyAgent with 5-way self-consistency (result files sc1 through sc5);\n",
    "# statistics are gathered over 100 shuffled trials\n",
    "eval_wtq(\n",
    "    checkpoints=[f\"results/wtq-agent-all/result_sc{run}.jsonl\" for run in range(1, 6)],\n",
    "    n_times=100,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🚀 Starting Evaluation...\n",
      "Loading ['results/wtq-cot-all/result_5.jsonl', 'results/wtq-agent-all/result_sc1.jsonl', 'results/wtq-agent-all/result_sc2.jsonl', 'results/wtq-agent-all/result_sc3.jsonl', 'results/wtq-agent-all/result_sc4.jsonl', 'results/wtq-agent-all/result_sc5.jsonl']...\n",
      "Loading results/wtq-cot-all/result_5.jsonl...\n",
      "Loaded 4344 results.\n",
      "Loading results/wtq-agent-all/result_sc1.jsonl...\n",
      "Loaded 4344 results.\n",
      "Loading results/wtq-agent-all/result_sc2.jsonl...\n",
      "Loaded 4344 results.\n",
      "Loading results/wtq-agent-all/result_sc3.jsonl...\n",
      "Loaded 4344 results.\n",
      "Loading results/wtq-agent-all/result_sc4.jsonl...\n",
      "Loaded 4344 results.\n",
      "Loading results/wtq-agent-all/result_sc5.jsonl...\n",
      "Loaded 4344 results.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Progress: 100%|██████████| 1/1 [00:00<00:00,  2.18batch/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🏁 Evaluation Complete.\n",
      "📊 Statistical Summary of 1 Trials on 4344 Examples from Combined Checkpoints\n",
      "Min Accuracy: 73.64% (3199.0/4344)\n",
      "Max Accuracy: 73.64% (3199.0/4344)\n",
      "Mean Accuracy: 73.64% (3199.0/4344)\n",
      "Standard Deviation: 0.00%\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# 5+5 self-consistency: the CoT result file combined with the five PyAgent\n",
    "# result files (sc1 through sc5); a single trial, i.e. no shuffling\n",
    "eval_wtq(\n",
    "    checkpoints=[\n",
    "        \"results/wtq-cot-all/result_5.jsonl\",\n",
    "        *(f\"results/wtq-agent-all/result_sc{run}.jsonl\" for run in range(1, 6)),\n",
    "    ],\n",
    "    n_times=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🚀 Starting Evaluation...\n",
      "Loading ['results/wtq-cot-all/result_5.jsonl', 'results/wtq-agent-all/result_sc1.jsonl', 'results/wtq-agent-all/result_sc2.jsonl']...\n",
      "Loading results/wtq-cot-all/result_5.jsonl...\n",
      "Loaded 4344 results.\n",
      "Loading results/wtq-agent-all/result_sc1.jsonl...\n",
      "Loaded 4344 results.\n",
      "Loading results/wtq-agent-all/result_sc2.jsonl...\n",
      "Loaded 4344 results.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Progress: 100%|██████████| 1/1 [00:00<00:00,  3.73batch/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🏁 Evaluation Complete.\n",
      "📊 Statistical Summary of 1 Trials on 4344 Examples from Combined Checkpoints\n",
      "Min Accuracy: 69.15% (3004.0/4344)\n",
      "Max Accuracy: 69.15% (3004.0/4344)\n",
      "Mean Accuracy: 69.15% (3004.0/4344)\n",
      "Standard Deviation: 0.00%\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Any combination of checkpoints can be evaluated together.\n",
    "# NOTE(review): elements_per_checkpoint has one entry per checkpoint file,\n",
    "# presumably how many samples to take from each - confirm against eval_wtq.\n",
    "eval_wtq(\n",
    "    checkpoints=[\n",
    "        \"results/wtq-cot-all/result_5.jsonl\",\n",
    "        \"results/wtq-agent-all/result_sc1.jsonl\",\n",
    "        \"results/wtq-agent-all/result_sc2.jsonl\",\n",
    "    ],\n",
    "    elements_per_checkpoint=[3, 1, 1],\n",
    "    n_times=1,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "table",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
