{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "34d07ea6",
   "metadata": {},
   "source": [
    "## Box Plots for MCQA Figure 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d6d0f84",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import matplotlib.pyplot as plt\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbb8b1b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Load the Experiment 1a statistics used by the option-set box plot below.\n",
    "# JSON is UTF-8 by specification, so pin the encoding instead of relying on\n",
    "# the platform's locale default.\n",
    "with open('skate/figures/figure_8/MCstats_Experiment1a_sonnet.json', 'r', encoding='utf-8') as f:\n",
    "    mcstats_sonnet = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e7981f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one list of scores per question, then draw one box per question.\n",
    "NUM_QUESTIONS = 10  # number of questions plotted\n",
    "# Divisor that turns the stored value for the correct answer into a fraction;\n",
    "# presumably the number of model runs per option set -- TODO confirm.\n",
    "RUNS_PER_OPTION_SET = 50.0\n",
    "\n",
    "all_scores = []\n",
    "# Use a real name for the index: `_` is IPython's last-output variable, and\n",
    "# binding it in user code overrides that.\n",
    "for question_idx in range(NUM_QUESTIONS):\n",
    "    gt_answer = mcstats_sonnet['gt_answers'][question_idx]\n",
    "    # Each result entry unpacks to (answer_dict, mc_options); only the dict is\n",
    "    # needed here. A missing ground-truth key scores 0.0.\n",
    "    all_scores.append([\n",
    "        answer_dict.get(gt_answer, 0) / RUNS_PER_OPTION_SET\n",
    "        for answer_dict, _mc_options in mcstats_sonnet['results'][str(question_idx)]\n",
    "    ])\n",
    "\n",
    "# Black-on-white boxes, one per question, at x positions 1..len(all_scores).\n",
    "plt.figure(figsize=(12, 6))\n",
    "box = plt.boxplot(\n",
    "    all_scores,\n",
    "    positions=range(1, len(all_scores) + 1),\n",
    "    patch_artist=True,\n",
    "    boxprops=dict(facecolor='white', color='black'),\n",
    "    medianprops=dict(color='black', linewidth=2),\n",
    "    whiskerprops=dict(color='black', linewidth=2),\n",
    "    capprops=dict(color='black', linewidth=2),\n",
    "    flierprops=dict(marker='o', color='black', alpha=0.7)\n",
    ")\n",
    "plt.xlabel('Question Number', fontsize=22)\n",
    "plt.ylabel('Fraction Correct', fontsize=22)\n",
    "plt.title('LLM sensitivity to MCQA Option Set', fontsize=22, color='#333333')\n",
    "plt.xticks(range(1, len(all_scores) + 1), fontsize=12)\n",
    "plt.yticks(fontsize=12)\n",
    "plt.grid(True, linestyle='--', alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d1c1f068",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Load the Experiment 2a statistics used by the option-ordering box plot below.\n",
    "# NOTE: this rebinds `mcstats_sonnet`, replacing the Experiment 1a data loaded\n",
    "# earlier, so the cells must be run in order.\n",
    "# JSON is UTF-8 by specification, so pin the encoding instead of relying on\n",
    "# the platform's locale default.\n",
    "with open('skate/figures/figure_8/MCstats_Experiment2a_sonnet.json', 'r', encoding='utf-8') as f:\n",
    "    mcstats_sonnet = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f0a1d8d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one list of scores per question, then draw one box per question.\n",
    "NUM_QUESTIONS = 10  # number of questions plotted\n",
    "# Divisor that turns the stored value for the correct answer into a fraction;\n",
    "# presumably the number of model runs per option ordering -- TODO confirm.\n",
    "RUNS_PER_OPTION_SET = 50.0\n",
    "\n",
    "all_scores = []\n",
    "# Use a real name for the index: `_` is IPython's last-output variable, and\n",
    "# binding it in user code overrides that.\n",
    "for question_idx in range(NUM_QUESTIONS):\n",
    "    gt_answer = mcstats_sonnet['gt_answers'][question_idx]\n",
    "    # Each result entry unpacks to (answer_dict, mc_options); only the dict is\n",
    "    # needed here. A missing ground-truth key scores 0.0.\n",
    "    all_scores.append([\n",
    "        answer_dict.get(gt_answer, 0) / RUNS_PER_OPTION_SET\n",
    "        for answer_dict, _mc_options in mcstats_sonnet['results'][str(question_idx)]\n",
    "    ])\n",
    "\n",
    "# Black-on-white boxes, one per question, at x positions 1..len(all_scores).\n",
    "plt.figure(figsize=(12, 6))\n",
    "box = plt.boxplot(\n",
    "    all_scores,\n",
    "    positions=range(1, len(all_scores) + 1),\n",
    "    patch_artist=True,\n",
    "    boxprops=dict(facecolor='white', color='black'),\n",
    "    medianprops=dict(color='black', linewidth=2),\n",
    "    whiskerprops=dict(color='black', linewidth=2),\n",
    "    capprops=dict(color='black', linewidth=2),\n",
    "    flierprops=dict(marker='o', color='black', alpha=0.7)\n",
    ")\n",
    "plt.xlabel('Question Number', fontsize=22)\n",
    "plt.ylabel('Fraction Correct', fontsize=22)\n",
    "plt.title('LLM sensitivity to MCQA Option Ordering', fontsize=22, color='#333333')\n",
    "plt.xticks(range(1, len(all_scores) + 1), fontsize=12)\n",
    "plt.yticks(fontsize=12)\n",
    "plt.grid(True, linestyle='--', alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
