{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>accuracy_complex_cot</th>\n",
       "      <th>total_complex_cot</th>\n",
       "      <th>accuracy_complex_cot_php</th>\n",
       "      <th>total_complex_cot_php</th>\n",
       "      <th>accuracy_cr4shot</th>\n",
       "      <th>total_cr4shot</th>\n",
       "      <th>accuracy_cr4shot_php</th>\n",
       "      <th>total_cr4shot_php</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.790698</td>\n",
       "      <td>43</td>\n",
       "      <td>0.837209</td>\n",
       "      <td>43</td>\n",
       "      <td>0.837209</td>\n",
       "      <td>43</td>\n",
       "      <td>0.906977</td>\n",
       "      <td>43</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.722222</td>\n",
       "      <td>90</td>\n",
       "      <td>0.866667</td>\n",
       "      <td>90</td>\n",
       "      <td>0.788889</td>\n",
       "      <td>90</td>\n",
       "      <td>0.866667</td>\n",
       "      <td>90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.628571</td>\n",
       "      <td>105</td>\n",
       "      <td>0.638095</td>\n",
       "      <td>105</td>\n",
       "      <td>0.628571</td>\n",
       "      <td>105</td>\n",
       "      <td>0.704762</td>\n",
       "      <td>105</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.382812</td>\n",
       "      <td>128</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>128</td>\n",
       "      <td>0.429688</td>\n",
       "      <td>128</td>\n",
       "      <td>0.492188</td>\n",
       "      <td>128</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.223881</td>\n",
       "      <td>134</td>\n",
       "      <td>0.238806</td>\n",
       "      <td>134</td>\n",
       "      <td>0.320896</td>\n",
       "      <td>134</td>\n",
       "      <td>0.268657</td>\n",
       "      <td>134</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   accuracy_complex_cot  total_complex_cot  accuracy_complex_cot_php  \\\n",
       "1              0.790698                 43                  0.837209   \n",
       "2              0.722222                 90                  0.866667   \n",
       "3              0.628571                105                  0.638095   \n",
       "4              0.382812                128                  0.437500   \n",
       "5              0.223881                134                  0.238806   \n",
       "\n",
       "   total_complex_cot_php  accuracy_cr4shot  total_cr4shot  \\\n",
       "1                     43          0.837209             43   \n",
       "2                     90          0.788889             90   \n",
       "3                    105          0.628571            105   \n",
       "4                    128          0.429688            128   \n",
       "5                    134          0.320896            134   \n",
       "\n",
       "   accuracy_cr4shot_php  total_cr4shot_php  \n",
       "1              0.906977                 43  \n",
       "2              0.866667                 90  \n",
       "3              0.704762                105  \n",
       "4              0.492188                128  \n",
       "5              0.268657                134  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "def tally_accuracy(path, group_key):\n",
    "    \"\"\"Count correct and total answers per group in one JSONL results file.\n",
    "\n",
    "    Args:\n",
    "        path: Path of the JSONL file to read, one JSON record per line.\n",
    "        group_key: Record field whose value selects the bucket (here \"level\").\n",
    "\n",
    "    Returns:\n",
    "        Dict mapping each group value to {\"correct\": int, \"total\": int}.\n",
    "        A record counts as correct only when its \"correctness\" field\n",
    "        equals \"Correct\".\n",
    "    \"\"\"\n",
    "    counts = {}\n",
    "    skipped = 0\n",
    "    with open(path, \"r\") as file:\n",
    "        for line in file:\n",
    "            if not line.strip():\n",
    "                continue  # blank lines are not records\n",
    "            try:\n",
    "                record = json.loads(line)\n",
    "                group = record[group_key]\n",
    "                correctness = record[\"correctness\"]\n",
    "                bucket = counts.setdefault(group, {\"correct\": 0, \"total\": 0})\n",
    "            except (json.JSONDecodeError, KeyError, TypeError):\n",
    "                # Malformed or incomplete record: still skipped (best effort),\n",
    "                # but counted so dropped data is reported instead of vanishing.\n",
    "                skipped += 1\n",
    "                continue\n",
    "            bucket[\"total\"] += 1\n",
    "            if correctness == \"Correct\":\n",
    "                bucket[\"correct\"] += 1\n",
    "    if skipped:\n",
    "        print(f\"Skipped {skipped} unusable line(s) in {path}\")\n",
    "    return counts\n",
    "\n",
    "\n",
    "# Per-level correct/total counts for each of the four result files\n",
    "accuracy_data_level_complex_cot = tally_accuracy(\"./results/results-math-complex-cot-gpt-4.jsonl\", \"level\")\n",
    "accuracy_data_level_complex_cot_php = tally_accuracy(\"./results/results-math-php-gpt-4.jsonl\", \"level\")\n",
    "accuracy_data_level_cr4shot = tally_accuracy(\"./results/results-math-cr-4shot-gpt-4.jsonl\", \"level\")\n",
    "accuracy_data_level_cr4shot_php = tally_accuracy(\"./results/results-math-cr-4shot-php-gpt-4.jsonl\", \"level\")\n",
    "\n",
    "# Method suffix used in the column names -> counts for that method\n",
    "counts_by_method = {\n",
    "    \"complex_cot\": accuracy_data_level_complex_cot,\n",
    "    \"complex_cot_php\": accuracy_data_level_complex_cot_php,\n",
    "    \"cr4shot\": accuracy_data_level_cr4shot,\n",
    "    \"cr4shot_php\": accuracy_data_level_cr4shot_php,\n",
    "}\n",
    "\n",
    "# Union over all four files so a level seen in only one file still gets a row\n",
    "all_levels = set().union(*counts_by_method.values())\n",
    "\n",
    "# Calculate accuracy for each level\n",
    "accuracy_results_level = {}\n",
    "for level in all_levels:\n",
    "    row = {}\n",
    "    for method, counts in counts_by_method.items():\n",
    "        stats = counts.get(level, {\"correct\": 0, \"total\": 0})\n",
    "        total = stats[\"total\"]\n",
    "        # A level absent from a file is reported as accuracy 0 over 0 problems\n",
    "        row[f\"accuracy_{method}\"] = stats[\"correct\"] / total if total else 0\n",
    "        row[f\"total_{method}\"] = total\n",
    "    accuracy_results_level[level] = row\n",
    "\n",
    "# Convert to a DataFrame for better presentation; reindex fixes the column\n",
    "# order and keeps the columns present even when no rows were loaded\n",
    "column_order = [f\"{stat}_{method}\" for method in counts_by_method for stat in (\"accuracy\", \"total\")]\n",
    "df_level_accuracy = pd.DataFrame.from_dict(accuracy_results_level, orient=\"index\").reindex(columns=column_order)\n",
    "df_level_accuracy.sort_values(by=\"accuracy_complex_cot\", ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>accuracy_complex_cot</th>\n",
       "      <th>total_complex_cot</th>\n",
       "      <th>accuracy_complex_cot_php</th>\n",
       "      <th>total_complex_cot_php</th>\n",
       "      <th>accuracy_cr4shot</th>\n",
       "      <th>total_cr4shot</th>\n",
       "      <th>accuracy_cr4shot_php</th>\n",
       "      <th>total_cr4shot_php</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Prealgebra</th>\n",
       "      <td>0.707317</td>\n",
       "      <td>82</td>\n",
       "      <td>0.841463</td>\n",
       "      <td>82</td>\n",
       "      <td>0.792683</td>\n",
       "      <td>82</td>\n",
       "      <td>0.865854</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Algebra</th>\n",
       "      <td>0.620968</td>\n",
       "      <td>124</td>\n",
       "      <td>0.685484</td>\n",
       "      <td>124</td>\n",
       "      <td>0.717742</td>\n",
       "      <td>124</td>\n",
       "      <td>0.717742</td>\n",
       "      <td>124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Counting &amp; Probability</th>\n",
       "      <td>0.473684</td>\n",
       "      <td>38</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>38</td>\n",
       "      <td>0.578947</td>\n",
       "      <td>38</td>\n",
       "      <td>0.631579</td>\n",
       "      <td>38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Number Theory</th>\n",
       "      <td>0.467742</td>\n",
       "      <td>62</td>\n",
       "      <td>0.532258</td>\n",
       "      <td>62</td>\n",
       "      <td>0.548387</td>\n",
       "      <td>62</td>\n",
       "      <td>0.596774</td>\n",
       "      <td>62</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Geometry</th>\n",
       "      <td>0.341463</td>\n",
       "      <td>41</td>\n",
       "      <td>0.439024</td>\n",
       "      <td>41</td>\n",
       "      <td>0.390244</td>\n",
       "      <td>41</td>\n",
       "      <td>0.439024</td>\n",
       "      <td>41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Precalculus</th>\n",
       "      <td>0.339286</td>\n",
       "      <td>56</td>\n",
       "      <td>0.303571</td>\n",
       "      <td>56</td>\n",
       "      <td>0.303571</td>\n",
       "      <td>56</td>\n",
       "      <td>0.357143</td>\n",
       "      <td>56</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Intermediate Algebra</th>\n",
       "      <td>0.298969</td>\n",
       "      <td>97</td>\n",
       "      <td>0.288660</td>\n",
       "      <td>97</td>\n",
       "      <td>0.288660</td>\n",
       "      <td>97</td>\n",
       "      <td>0.319588</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        accuracy_complex_cot  total_complex_cot  \\\n",
       "Prealgebra                          0.707317                 82   \n",
       "Algebra                             0.620968                124   \n",
       "Counting & Probability              0.473684                 38   \n",
       "Number Theory                       0.467742                 62   \n",
       "Geometry                            0.341463                 41   \n",
       "Precalculus                         0.339286                 56   \n",
       "Intermediate Algebra                0.298969                 97   \n",
       "\n",
       "                        accuracy_complex_cot_php  total_complex_cot_php  \\\n",
       "Prealgebra                              0.841463                     82   \n",
       "Algebra                                 0.685484                    124   \n",
       "Counting & Probability                  0.500000                     38   \n",
       "Number Theory                           0.532258                     62   \n",
       "Geometry                                0.439024                     41   \n",
       "Precalculus                             0.303571                     56   \n",
       "Intermediate Algebra                    0.288660                     97   \n",
       "\n",
       "                        accuracy_cr4shot  total_cr4shot  accuracy_cr4shot_php  \\\n",
       "Prealgebra                      0.792683             82              0.865854   \n",
       "Algebra                         0.717742            124              0.717742   \n",
       "Counting & Probability          0.578947             38              0.631579   \n",
       "Number Theory                   0.548387             62              0.596774   \n",
       "Geometry                        0.390244             41              0.439024   \n",
       "Precalculus                     0.303571             56              0.357143   \n",
       "Intermediate Algebra            0.288660             97              0.319588   \n",
       "\n",
       "                        total_cr4shot_php  \n",
       "Prealgebra                             82  \n",
       "Algebra                               124  \n",
       "Counting & Probability                 38  \n",
       "Number Theory                          62  \n",
       "Geometry                               41  \n",
       "Precalculus                            56  \n",
       "Intermediate Algebra                   97  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "def tally_accuracy(path, group_key):\n",
    "    \"\"\"Count correct and total answers per group in one JSONL results file.\n",
    "\n",
    "    Args:\n",
    "        path: Path of the JSONL file to read, one JSON record per line.\n",
    "        group_key: Record field whose value selects the bucket\n",
    "            (here \"problem_subject\").\n",
    "\n",
    "    Returns:\n",
    "        Dict mapping each group value to {\"correct\": int, \"total\": int}.\n",
    "        A record counts as correct only when its \"correctness\" field\n",
    "        equals \"Correct\".\n",
    "    \"\"\"\n",
    "    counts = {}\n",
    "    skipped = 0\n",
    "    with open(path, \"r\") as file:\n",
    "        for line in file:\n",
    "            if not line.strip():\n",
    "                continue  # blank lines are not records\n",
    "            try:\n",
    "                record = json.loads(line)\n",
    "                group = record[group_key]\n",
    "                correctness = record[\"correctness\"]\n",
    "                bucket = counts.setdefault(group, {\"correct\": 0, \"total\": 0})\n",
    "            except (json.JSONDecodeError, KeyError, TypeError):\n",
    "                # Malformed or incomplete record: still skipped (best effort),\n",
    "                # but counted so dropped data is reported instead of vanishing.\n",
    "                skipped += 1\n",
    "                continue\n",
    "            bucket[\"total\"] += 1\n",
    "            if correctness == \"Correct\":\n",
    "                bucket[\"correct\"] += 1\n",
    "    if skipped:\n",
    "        print(f\"Skipped {skipped} unusable line(s) in {path}\")\n",
    "    return counts\n",
    "\n",
    "\n",
    "# Per-subject correct/total counts for each of the four result files\n",
    "accuracy_data_subject_complex_cot = tally_accuracy(\"./results/results-math-complex-cot-gpt-4.jsonl\", \"problem_subject\")\n",
    "accuracy_data_subject_complex_cot_php = tally_accuracy(\"./results/results-math-php-gpt-4.jsonl\", \"problem_subject\")\n",
    "accuracy_data_subject_cr4shot = tally_accuracy(\"./results/results-math-cr-4shot-gpt-4.jsonl\", \"problem_subject\")\n",
    "accuracy_data_subject_cr4shot_php = tally_accuracy(\"./results/results-math-cr-4shot-php-gpt-4.jsonl\", \"problem_subject\")\n",
    "\n",
    "# Method suffix used in the column names -> counts for that method\n",
    "counts_by_method = {\n",
    "    \"complex_cot\": accuracy_data_subject_complex_cot,\n",
    "    \"complex_cot_php\": accuracy_data_subject_complex_cot_php,\n",
    "    \"cr4shot\": accuracy_data_subject_cr4shot,\n",
    "    \"cr4shot_php\": accuracy_data_subject_cr4shot_php,\n",
    "}\n",
    "\n",
    "# Union over all four files so a subject seen in only one file still gets a row\n",
    "all_subjects = set().union(*counts_by_method.values())\n",
    "\n",
    "# Calculate accuracy for each problem_subject\n",
    "accuracy_results_subject = {}\n",
    "for subject in all_subjects:\n",
    "    row = {}\n",
    "    for method, counts in counts_by_method.items():\n",
    "        stats = counts.get(subject, {\"correct\": 0, \"total\": 0})\n",
    "        total = stats[\"total\"]\n",
    "        # A subject absent from a file is reported as accuracy 0 over 0 problems\n",
    "        row[f\"accuracy_{method}\"] = stats[\"correct\"] / total if total else 0\n",
    "        row[f\"total_{method}\"] = total\n",
    "    accuracy_results_subject[subject] = row\n",
    "\n",
    "# Convert to a DataFrame for better presentation; reindex fixes the column\n",
    "# order and keeps the columns present even when no rows were loaded\n",
    "column_order = [f\"{stat}_{method}\" for method in counts_by_method for stat in (\"accuracy\", \"total\")]\n",
    "df_subject_accuracy = pd.DataFrame.from_dict(accuracy_results_subject, orient=\"index\").reindex(columns=column_order)\n",
    "df_subject_accuracy.sort_values(by=\"accuracy_complex_cot\", ascending=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "guidance",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
