{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import random\n",
    "import json\n",
    "import os\n",
    "random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "subjects = sorted([f.split(\"_test.csv\")[0] for f in os.listdir(os.path.join(\"data\", \"test\")) if \"_test.csv\" in f])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "subcategories = {\n",
    "    \"abstract_algebra\": [\"math\"],\n",
    "    \"anatomy\": [\"health\"],\n",
    "    \"astronomy\": [\"physics\"],\n",
    "    \"business_ethics\": [\"business\"],\n",
    "    \"clinical_knowledge\": [\"health\"],\n",
    "    \"college_biology\": [\"biology\"],\n",
    "    \"college_chemistry\": [\"chemistry\"],\n",
    "    \"college_computer_science\": [\"computer science\"],\n",
    "    \"college_mathematics\": [\"math\"],\n",
    "    \"college_medicine\": [\"health\"],\n",
    "    \"college_physics\": [\"physics\"],\n",
    "    \"computer_security\": [\"computer science\"],\n",
    "    \"conceptual_physics\": [\"physics\"],\n",
    "    \"econometrics\": [\"economics\"],\n",
    "    \"electrical_engineering\": [\"engineering\"],\n",
    "    \"elementary_mathematics\": [\"math\"],\n",
    "    \"formal_logic\": [\"philosophy\"],\n",
    "    \"global_facts\": [\"other\"],\n",
    "    \"high_school_biology\": [\"biology\"],\n",
    "    \"high_school_chemistry\": [\"chemistry\"],\n",
    "    \"high_school_computer_science\": [\"computer science\"],\n",
    "    \"high_school_european_history\": [\"history\"],\n",
    "    \"high_school_geography\": [\"geography\"],\n",
    "    \"high_school_government_and_politics\": [\"politics\"],\n",
    "    \"high_school_macroeconomics\": [\"economics\"],\n",
    "    \"high_school_mathematics\": [\"math\"],\n",
    "    \"high_school_microeconomics\": [\"economics\"],\n",
    "    \"high_school_physics\": [\"physics\"],\n",
    "    \"high_school_psychology\": [\"psychology\"],\n",
    "    \"high_school_statistics\": [\"math\"],\n",
    "    \"high_school_us_history\": [\"history\"],\n",
    "    \"high_school_world_history\": [\"history\"],\n",
    "    \"human_aging\": [\"health\"],\n",
    "    \"human_sexuality\": [\"culture\"],\n",
    "    \"international_law\": [\"law\"],\n",
    "    \"jurisprudence\": [\"law\"],\n",
    "    \"logical_fallacies\": [\"philosophy\"],\n",
    "    \"machine_learning\": [\"computer science\"],\n",
    "    \"management\": [\"business\"],\n",
    "    \"marketing\": [\"business\"],\n",
    "    \"medical_genetics\": [\"health\"],\n",
    "    \"miscellaneous\": [\"other\"],\n",
    "    \"moral_disputes\": [\"philosophy\"],\n",
    "    \"moral_scenarios\": [\"philosophy\"],\n",
    "    \"nutrition\": [\"health\"],\n",
    "    \"philosophy\": [\"philosophy\"],\n",
    "    \"prehistory\": [\"history\"],\n",
    "    \"professional_accounting\": [\"other\"],\n",
    "    \"professional_law\": [\"law\"],\n",
    "    \"professional_medicine\": [\"health\"],\n",
    "    \"professional_psychology\": [\"psychology\"],\n",
    "    \"public_relations\": [\"politics\"],\n",
    "    \"security_studies\": [\"politics\"],\n",
    "    \"sociology\": [\"culture\"],\n",
    "    \"us_foreign_policy\": [\"politics\"],\n",
    "    \"virology\": [\"health\"],\n",
    "    \"world_religions\": [\"philosophy\"],\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "categories = {\n",
    "    \"STEM\": [\"physics\", \"chemistry\", \"biology\", \"computer science\", \"math\", \"engineering\"],\n",
    "    \"humanities\": [\"history\", \"philosophy\", \"law\"],\n",
    "    \"social sciences\": [\"politics\", \"culture\", \"economics\", \"geography\", \"psychology\"],\n",
    "    \"other (business, health, misc.)\": [\"other\", \"business\", \"health\"],\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "human = []\n",
    "social = []\n",
    "stem = []\n",
    "other = []\n",
    "\n",
    "for subject in subjects:\n",
    "    test_df = pd.read_csv(os.path.join(\"data\", \"test\", subject + \"_test.csv\"), header=None)\n",
    "    mid_subject = subcategories[subject][0]\n",
    "    super_subject = None\n",
    "    for k, v in categories.items():\n",
    "        if mid_subject in v:\n",
    "            super_subject = k\n",
    "            break\n",
    "\n",
    "    assert super_subject != None\n",
    "\n",
    "    for i in range(len(test_df)):\n",
    "        row = test_df.iloc[i]\n",
    "        ex = {}\n",
    "        ex['source'] = 'MMLU'\n",
    "        ex['task'] = 'QA'\n",
    "        ex['subject'] = subject\n",
    "        ex['subcategory'] = mid_subject\n",
    "        ex['supcategory'] = super_subject\n",
    "        ex['question'] = row[0]\n",
    "        ex['choices'] = {}\n",
    "\n",
    "        chs = [row[1], row[2], row[3], row[4]]\n",
    "        \n",
    "        fg = False\n",
    "        for ans in chs:\n",
    "            if \"none of the above\" in str(ans).lower():\n",
    "                # print(ans)\n",
    "                fg = True\n",
    "                break\n",
    "        if fg:\n",
    "            continue\n",
    "\n",
    "        chs.append(\"I don't know\")\n",
    "        chs.append(\"None of the above\")\n",
    "        letters = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"]\n",
    "        for j in range(len(letters)):\n",
    "            ex['choices'][letters[j]] = chs[j]\n",
    "        ex['answer'] = row[5]\n",
    "\n",
    "        assert row[5] in letters\n",
    "\n",
    "        if super_subject == \"humanities\":\n",
    "            human.append(ex)\n",
    "        elif super_subject == \"social sciences\":\n",
    "            social.append(ex)\n",
    "        elif super_subject == \"STEM\":\n",
    "            stem.append(ex)\n",
    "        elif super_subject == \"other (business, health, misc.)\":\n",
    "            other.append(ex)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4612, 3060, 2981, 3232)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(human), len(social), len(stem), len(other)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10000"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sampled_data = random.sample(human, 2500) + random.sample(social, 2500) + random.sample(stem, 2500) + random.sample(other, 2500)\n",
    "random.shuffle(sampled_data)\n",
    "len(sampled_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "for idx, dd in enumerate(sampled_data):\n",
    "    dd['id'] = idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"data/mmlu_10k.json\", 'w') as f:\n",
    "    json.dump(sampled_data, f, indent=2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
