{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/opt/dlami/nvme/anonymous/codes/med-sipf\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "os.chdir(\"../..\")\n",
    "print(os.getcwd())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import dotenv\n",
    "\n",
    "dotenv.load_dotenv(override=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/dlami/nvme/anonymous/misc/miniconda3/envs/anonymous-med_sipf/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "import json\n",
    "from typing import List, Union\n",
    "from pprint import pprint\n",
    "import seaborn as sns\n",
    "import datasets\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def stats_domain(repo_id, domain_repo_id, repo_id_split, domain_repo_id_split):\n",
    "    dataset = datasets.load_dataset(repo_id, split=repo_id_split)\n",
    "    prompt_domain_dataset = datasets.load_dataset(\n",
    "    domain_repo_id, split=domain_repo_id_split\n",
    ")\n",
    "    prompt_domain_mapping = (\n",
    "    prompt_domain_dataset.to_pandas().set_index(\"prompt\").to_dict(orient=\"index\")\n",
    ")\n",
    "\n",
    "\n",
    "    def add_domain(sample):\n",
    "        return prompt_domain_mapping[sample[\"prompt\"]]\n",
    "\n",
    "\n",
    "    dataset = dataset.map(add_domain)\n",
    "    kept_columns = [\"prompt\", \"domain_code\", \"domain_name\", \"source\"]\n",
    "    removed_columns = [col for col in dataset.column_names if col not in kept_columns]\n",
    "    domain_dataset = dataset.remove_columns(removed_columns)\n",
    "\n",
    "    df = domain_dataset.to_pandas()\n",
    "    dict(df.value_counts(\"domain_name\"))\n",
    "    count_table = {}\n",
    "    for source in df[\"source\"].unique():\n",
    "        count_table[source] = {}\n",
    "        for domain_name in df[\"domain_name\"].unique():\n",
    "            count_table[source][domain_name] = len(\n",
    "            df[(df[\"domain_name\"] == domain_name) & (df[\"source\"] == source)]\n",
    "        )\n",
    "    count_df = pd.DataFrame(count_table)\n",
    "    count_df.sort_index(inplace=True)\n",
    "    pd.set_option(\"display.max_rows\", None)\n",
    "    display(count_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GBaker/MedQA-USMLE-4-options</th>\n",
       "      <th>qiaojin/PubMedQA:pqa_labeled</th>\n",
       "      <th>openlifescienceai/headqa</th>\n",
       "      <th>openlifescienceai/medmcqa</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Abnormalities</th>\n",
       "      <td>24</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>646</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Administration &amp; Dosage</th>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>590</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Adverse Effects</th>\n",
       "      <td>116</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>936</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Agonists</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Analogs &amp; Derivatives</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Analysis</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>14</td>\n",
       "      <td>222</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Anatomy &amp; Histology</th>\n",
       "      <td>17</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2328</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Antagonists &amp; Inhibitors</th>\n",
       "      <td>14</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>113</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Biosynthesis</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>73</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Blood</th>\n",
       "      <td>45</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>360</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Blood Supply</th>\n",
       "      <td>19</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>228</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cerebrospinal Fluid</th>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chemical Synthesis</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chemically Induced</th>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chemistry</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>21</td>\n",
       "      <td>342</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Classification</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>341</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Complications</th>\n",
       "      <td>73</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Congenital</th>\n",
       "      <td>59</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>382</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cytology</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Deficiency</th>\n",
       "      <td>58</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>245</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Diagnosis</th>\n",
       "      <td>229</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>1681</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Diagnostic Imaging</th>\n",
       "      <td>25</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>720</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Diet Therapy</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Drug Effects</th>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>107</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Drug Therapy</th>\n",
       "      <td>164</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>509</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Economics</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Education</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Embryology</th>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>245</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Enzymology</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Epidemiology</th>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>265</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ethics</th>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ethnology</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Etiology</th>\n",
       "      <td>133</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>657</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Genetics</th>\n",
       "      <td>66</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>293</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Growth &amp; Development</th>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>245</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>History</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>72</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Immunology</th>\n",
       "      <td>17</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>199</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Injuries</th>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>430</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Innervation</th>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>157</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Instrumentation</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>151</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Isolation &amp; Purification</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Legislation &amp; Jurisprudence</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>204</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Metabolism</th>\n",
       "      <td>15</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>158</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Methods</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>260</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Microbiology</th>\n",
       "      <td>27</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>358</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Mortality</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Nursing</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Organization &amp; Administration</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Parasitology</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>209</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pathogenicity</th>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>71</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pathology</th>\n",
       "      <td>68</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pharmacokinetics</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pharmacology</th>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>315</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Physiology</th>\n",
       "      <td>45</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>1168</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Physiopathology</th>\n",
       "      <td>92</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1093</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Poisoning</th>\n",
       "      <td>17</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Prevention &amp; Control</th>\n",
       "      <td>17</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>131</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Psychology</th>\n",
       "      <td>20</td>\n",
       "      <td>1</td>\n",
       "      <td>21</td>\n",
       "      <td>165</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Radiation Effects</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>56</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Radiotherapy</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Rehabilitation</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Secondary</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Standards</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>67</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Statistics &amp; Numerical Data</th>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>53</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Supply &amp; Distribution</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Surgery</th>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>749</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Therapeutic Use</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>230</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Therapy</th>\n",
       "      <td>45</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "      <td>373</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Toxicity</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Transmission</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>86</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Transplantation</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Trends</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ultrastructure</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Urine</th>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>77</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Veterinary</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Virology</th>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>90</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               GBaker/MedQA-USMLE-4-options  \\\n",
       "Abnormalities                                            24   \n",
       "Administration & Dosage                                   9   \n",
       "Adverse Effects                                         116   \n",
       "Agonists                                                  2   \n",
       "Analogs & Derivatives                                     1   \n",
       "Analysis                                                  2   \n",
       "Anatomy & Histology                                      17   \n",
       "Antagonists & Inhibitors                                 14   \n",
       "Biosynthesis                                              4   \n",
       "Blood                                                    45   \n",
       "Blood Supply                                             19   \n",
       "Cerebrospinal Fluid                                       7   \n",
       "Chemical Synthesis                                        0   \n",
       "Chemically Induced                                       16   \n",
       "Chemistry                                                 0   \n",
       "Classification                                            0   \n",
       "Complications                                            73   \n",
       "Congenital                                               59   \n",
       "Cytology                                                  3   \n",
       "Deficiency                                               58   \n",
       "Diagnosis                                               229   \n",
       "Diagnostic Imaging                                       25   \n",
       "Diet Therapy                                              3   \n",
       "Drug Effects                                             12   \n",
       "Drug Therapy                                            164   \n",
       "Economics                                                 1   \n",
       "Education                                                 0   \n",
       "Embryology                                               12   \n",
       "Enzymology                                                5   \n",
       "Epidemiology                                              8   \n",
       "Ethics                                                    8   \n",
       "Ethnology                                                 0   \n",
       "Etiology                                                133   \n",
       "Genetics                                                 66   \n",
       "Growth & Development                                      8   \n",
       "History                                                   0   \n",
       "Immunology                                               17   \n",
       "Injuries                                                 12   \n",
       "Innervation                                              10   \n",
       "Instrumentation                                           0   \n",
       "Isolation & Purification                                  0   \n",
       "Legislation & Jurisprudence                               0   \n",
       "Metabolism                                               15   \n",
       "Methods                                                   1   \n",
       "Microbiology                                             27   \n",
       "Mortality                                                 3   \n",
       "Nursing                                                   0   \n",
       "Organization & Administration                             0   \n",
       "Parasitology                                              5   \n",
       "Pathogenicity                                            10   \n",
       "Pathology                                                68   \n",
       "Pharmacokinetics                                          4   \n",
       "Pharmacology                                             31   \n",
       "Physiology                                               45   \n",
       "Physiopathology                                          92   \n",
       "Poisoning                                                17   \n",
       "Prevention & Control                                     17   \n",
       "Psychology                                               20   \n",
       "Radiation Effects                                         1   \n",
       "Radiotherapy                                              0   \n",
       "Rehabilitation                                            0   \n",
       "Secondary                                                 2   \n",
       "Standards                                                 2   \n",
       "Statistics & Numerical Data                               5   \n",
       "Supply & Distribution                                     0   \n",
       "Surgery                                                   5   \n",
       "Therapeutic Use                                           5   \n",
       "Therapy                                                  45   \n",
       "Toxicity                                                  2   \n",
       "Transmission                                              1   \n",
       "Transplantation                                           0   \n",
       "Trends                                                    0   \n",
       "Ultrastructure                                            0   \n",
       "Urine                                                    11   \n",
       "Veterinary                                                0   \n",
       "Virology                                                 12   \n",
       "\n",
       "                               qiaojin/PubMedQA:pqa_labeled  \\\n",
       "Abnormalities                                             1   \n",
       "Administration & Dosage                                   0   \n",
       "Adverse Effects                                           2   \n",
       "Agonists                                                  0   \n",
       "Analogs & Derivatives                                     0   \n",
       "Analysis                                                  2   \n",
       "Anatomy & Histology                                       0   \n",
       "Antagonists & Inhibitors                                  0   \n",
       "Biosynthesis                                              0   \n",
       "Blood                                                     1   \n",
       "Blood Supply                                              0   \n",
       "Cerebrospinal Fluid                                       0   \n",
       "Chemical Synthesis                                        0   \n",
       "Chemically Induced                                        0   \n",
       "Chemistry                                                 0   \n",
       "Classification                                            0   \n",
       "Complications                                             1   \n",
       "Congenital                                                0   \n",
       "Cytology                                                  0   \n",
       "Deficiency                                                1   \n",
       "Diagnosis                                                 1   \n",
       "Diagnostic Imaging                                        4   \n",
       "Diet Therapy                                              1   \n",
       "Drug Effects                                              0   \n",
       "Drug Therapy                                              0   \n",
       "Economics                                                 1   \n",
       "Education                                                 1   \n",
       "Embryology                                                0   \n",
       "Enzymology                                                0   \n",
       "Epidemiology                                              0   \n",
       "Ethics                                                    0   \n",
       "Ethnology                                                 0   \n",
       "Etiology                                                  1   \n",
       "Genetics                                                  0   \n",
       "Growth & Development                                      1   \n",
       "History                                                   0   \n",
       "Immunology                                                0   \n",
       "Injuries                                                  0   \n",
       "Innervation                                               0   \n",
       "Instrumentation                                           0   \n",
       "Isolation & Purification                                  0   \n",
       "Legislation & Jurisprudence                               0   \n",
       "Metabolism                                                0   \n",
       "Methods                                                   1   \n",
       "Microbiology                                              0   \n",
       "Mortality                                                 0   \n",
       "Nursing                                                   0   \n",
       "Organization & Administration                             1   \n",
       "Parasitology                                              0   \n",
       "Pathogenicity                                             0   \n",
       "Pathology                                                 0   \n",
       "Pharmacokinetics                                          0   \n",
       "Pharmacology                                              0   \n",
       "Physiology                                                1   \n",
       "Physiopathology                                           1   \n",
       "Poisoning                                                 0   \n",
       "Prevention & Control                                      0   \n",
       "Psychology                                                1   \n",
       "Radiation Effects                                         0   \n",
       "Radiotherapy                                              0   \n",
       "Rehabilitation                                            0   \n",
       "Secondary                                                 0   \n",
       "Standards                                                 0   \n",
       "Statistics & Numerical Data                               1   \n",
       "Supply & Distribution                                     0   \n",
       "Surgery                                                   2   \n",
       "Therapeutic Use                                           0   \n",
       "Therapy                                                   2   \n",
       "Toxicity                                                  0   \n",
       "Transmission                                              0   \n",
       "Transplantation                                           0   \n",
       "Trends                                                    0   \n",
       "Ultrastructure                                            0   \n",
       "Urine                                                     0   \n",
       "Veterinary                                                0   \n",
       "Virology                                                  0   \n",
       "\n",
       "                               openlifescienceai/headqa  \\\n",
       "Abnormalities                                         0   \n",
       "Administration & Dosage                               3   \n",
       "Adverse Effects                                       2   \n",
       "Agonists                                              1   \n",
       "Analogs & Derivatives                                 0   \n",
       "Analysis                                             14   \n",
       "Anatomy & Histology                                   3   \n",
       "Antagonists & Inhibitors                              2   \n",
       "Biosynthesis                                          3   \n",
       "Blood                                                 0   \n",
       "Blood Supply                                          0   \n",
       "Cerebrospinal Fluid                                   0   \n",
       "Chemical Synthesis                                    5   \n",
       "Chemically Induced                                    1   \n",
       "Chemistry                                            21   \n",
       "Classification                                        5   \n",
       "Complications                                         2   \n",
       "Congenital                                            0   \n",
       "Cytology                                              1   \n",
       "Deficiency                                            2   \n",
       "Diagnosis                                            17   \n",
       "Diagnostic Imaging                                    2   \n",
       "Diet Therapy                                          2   \n",
       "Drug Effects                                          0   \n",
       "Drug Therapy                                          6   \n",
       "Economics                                             1   \n",
       "Education                                             1   \n",
       "Embryology                                            1   \n",
       "Enzymology                                            3   \n",
       "Epidemiology                                          3   \n",
       "Ethics                                                2   \n",
       "Ethnology                                             0   \n",
       "Etiology                                              5   \n",
       "Genetics                                              5   \n",
       "Growth & Development                                  3   \n",
       "History                                               2   \n",
       "Immunology                                            2   \n",
       "Injuries                                              1   \n",
       "Innervation                                           1   \n",
       "Instrumentation                                       0   \n",
       "Isolation & Purification                              0   \n",
       "Legislation & Jurisprudence                           1   \n",
       "Metabolism                                            5   \n",
       "Methods                                               3   \n",
       "Microbiology                                          0   \n",
       "Mortality                                             0   \n",
       "Nursing                                               5   \n",
       "Organization & Administration                         3   \n",
       "Parasitology                                          1   \n",
       "Pathogenicity                                         0   \n",
       "Pathology                                             2   \n",
       "Pharmacokinetics                                      6   \n",
       "Pharmacology                                          1   \n",
       "Physiology                                           17   \n",
       "Physiopathology                                       3   \n",
       "Poisoning                                             0   \n",
       "Prevention & Control                                  1   \n",
       "Psychology                                           21   \n",
       "Radiation Effects                                     0   \n",
       "Radiotherapy                                          0   \n",
       "Rehabilitation                                        1   \n",
       "Secondary                                             0   \n",
       "Standards                                             0   \n",
       "Statistics & Numerical Data                           2   \n",
       "Supply & Distribution                                 0   \n",
       "Surgery                                               1   \n",
       "Therapeutic Use                                       1   \n",
       "Therapy                                               7   \n",
       "Toxicity                                              0   \n",
       "Transmission                                          1   \n",
       "Transplantation                                       0   \n",
       "Trends                                                0   \n",
       "Ultrastructure                                        0   \n",
       "Urine                                                 0   \n",
       "Veterinary                                            0   \n",
       "Virology                                              5   \n",
       "\n",
       "                               openlifescienceai/medmcqa  \n",
       "Abnormalities                                        646  \n",
       "Administration & Dosage                              590  \n",
       "Adverse Effects                                      936  \n",
       "Agonists                                              20  \n",
       "Analogs & Derivatives                                 10  \n",
       "Analysis                                             222  \n",
       "Anatomy & Histology                                 2328  \n",
       "Antagonists & Inhibitors                             113  \n",
       "Biosynthesis                                          73  \n",
       "Blood                                                360  \n",
       "Blood Supply                                         228  \n",
       "Cerebrospinal Fluid                                   34  \n",
       "Chemical Synthesis                                     1  \n",
       "Chemically Induced                                    82  \n",
       "Chemistry                                            342  \n",
       "Classification                                       341  \n",
       "Complications                                        625  \n",
       "Congenital                                           382  \n",
       "Cytology                                              90  \n",
       "Deficiency                                           245  \n",
       "Diagnosis                                           1681  \n",
       "Diagnostic Imaging                                   720  \n",
       "Diet Therapy                                          41  \n",
       "Drug Effects                                         107  \n",
       "Drug Therapy                                         509  \n",
       "Economics                                             17  \n",
       "Education                                             20  \n",
       "Embryology                                           245  \n",
       "Enzymology                                            95  \n",
       "Epidemiology                                         265  \n",
       "Ethics                                                20  \n",
       "Ethnology                                              5  \n",
       "Etiology                                             657  \n",
       "Genetics                                             293  \n",
       "Growth & Development                                 245  \n",
       "History                                               72  \n",
       "Immunology                                           199  \n",
       "Injuries                                             430  \n",
       "Innervation                                          157  \n",
       "Instrumentation                                      151  \n",
       "Isolation & Purification                               7  \n",
       "Legislation & Jurisprudence                          204  \n",
       "Metabolism                                           158  \n",
       "Methods                                              260  \n",
       "Microbiology                                         358  \n",
       "Mortality                                             30  \n",
       "Nursing                                                6  \n",
       "Organization & Administration                        111  \n",
       "Parasitology                                         209  \n",
       "Pathogenicity                                         71  \n",
       "Pathology                                           1532  \n",
       "Pharmacokinetics                                      99  \n",
       "Pharmacology                                         315  \n",
       "Physiology                                          1168  \n",
       "Physiopathology                                     1093  \n",
       "Poisoning                                            171  \n",
       "Prevention & Control                                 131  \n",
       "Psychology                                           165  \n",
       "Radiation Effects                                     56  \n",
       "Radiotherapy                                          28  \n",
       "Rehabilitation                                        10  \n",
       "Secondary                                             44  \n",
       "Standards                                             67  \n",
       "Statistics & Numerical Data                           53  \n",
       "Supply & Distribution                                 10  \n",
       "Surgery                                              749  \n",
       "Therapeutic Use                                      230  \n",
       "Therapy                                              373  \n",
       "Toxicity                                              29  \n",
       "Transmission                                          86  \n",
       "Transplantation                                       31  \n",
       "Trends                                                 1  \n",
       "Ultrastructure                                         7  \n",
       "Urine                                                 77  \n",
       "Veterinary                                             2  \n",
       "Virology                                              90  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Dataset id and split that stats_domain loads as the main dataset.\n",
    "repo_id = \"anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval-tokenized-120325\"\n",
    "repo_id_split = \"train\"\n",
    "\n",
    "# Companion dataset id and split that stats_domain loads for the prompt-domain data;\n",
    "# the id is the main repo id followed by the `-extract-domain-code` suffix.\n",
    "domain_repo_id = (\n",
    "    \"anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval-tokenized-120325-extract-domain-code\"\n",
    ")\n",
    "domain_repo_id_split = \"train\"\n",
    "\n",
    "# Keyword arguments keep each repo id paired with its own split.\n",
    "stats_domain(\n",
    "    repo_id=repo_id,\n",
    "    domain_repo_id=domain_repo_id,\n",
    "    repo_id_split=repo_id_split,\n",
    "    domain_repo_id_split=domain_repo_id_split,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>openlifescienceai/medmcqa</th>\n",
       "      <th>qiaojin/PubMedQA:pqa_labeled</th>\n",
       "      <th>openlifescienceai/headqa</th>\n",
       "      <th>GBaker/MedQA-USMLE-4-options</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Abnormalities</th>\n",
       "      <td>10</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Administration &amp; Dosage</th>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Adverse Effects</th>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Agonists</th>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Analogs &amp; Derivatives</th>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Analysis</th>\n",
       "      <td>6</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Anatomy &amp; Histology</th>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Antagonists &amp; Inhibitors</th>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Biosynthesis</th>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Blood</th>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Blood Supply</th>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cerebrospinal Fluid</th>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chemical Synthesis</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chemically Induced</th>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chemistry</th>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Classification</th>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Complications</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Congenital</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cytology</th>\n",
       "      <td>18</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Deficiency</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Diagnosis</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Diagnostic Imaging</th>\n",
       "      <td>11</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Diet Therapy</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Drug Effects</th>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Drug Therapy</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Economics</th>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Education</th>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Embryology</th>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Enzymology</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Epidemiology</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ethics</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ethnology</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Etiology</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Genetics</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Growth &amp; Development</th>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>History</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Immunology</th>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Injuries</th>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Innervation</th>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Instrumentation</th>\n",
       "      <td>17</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Isolation &amp; Purification</th>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Legislation &amp; Jurisprudence</th>\n",
       "      <td>15</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Metabolism</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Methods</th>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Microbiology</th>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Mortality</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Nursing</th>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Organization &amp; Administration</th>\n",
       "      <td>10</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Parasitology</th>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pathogenicity</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pathology</th>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pharmacokinetics</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pharmacology</th>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Physiology</th>\n",
       "      <td>13</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Physiopathology</th>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Poisoning</th>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Prevention &amp; Control</th>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Psychology</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Radiation Effects</th>\n",
       "      <td>23</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Radiotherapy</th>\n",
       "      <td>17</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Rehabilitation</th>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Secondary</th>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Standards</th>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Statistics &amp; Numerical Data</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Supply &amp; Distribution</th>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Surgery</th>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Therapeutic Use</th>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Therapy</th>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Toxicity</th>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Transmission</th>\n",
       "      <td>15</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Transplantation</th>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Trends</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ultrastructure</th>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Urine</th>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Veterinary</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Virology</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               openlifescienceai/medmcqa  \\\n",
       "Abnormalities                                         10   \n",
       "Administration & Dosage                                9   \n",
       "Adverse Effects                                        7   \n",
       "Agonists                                              13   \n",
       "Analogs & Derivatives                                 10   \n",
       "Analysis                                               6   \n",
       "Anatomy & Histology                                   11   \n",
       "Antagonists & Inhibitors                               6   \n",
       "Biosynthesis                                           6   \n",
       "Blood                                                 12   \n",
       "Blood Supply                                           8   \n",
       "Cerebrospinal Fluid                                   10   \n",
       "Chemical Synthesis                                     1   \n",
       "Chemically Induced                                     8   \n",
       "Chemistry                                              8   \n",
       "Classification                                         8   \n",
       "Complications                                          3   \n",
       "Congenital                                             4   \n",
       "Cytology                                              18   \n",
       "Deficiency                                             3   \n",
       "Diagnosis                                              4   \n",
       "Diagnostic Imaging                                    11   \n",
       "Diet Therapy                                           3   \n",
       "Drug Effects                                           9   \n",
       "Drug Therapy                                           5   \n",
       "Economics                                             11   \n",
       "Education                                             12   \n",
       "Embryology                                             6   \n",
       "Enzymology                                             2   \n",
       "Epidemiology                                           4   \n",
       "Ethics                                                 2   \n",
       "Ethnology                                              5   \n",
       "Etiology                                               3   \n",
       "Genetics                                               3   \n",
       "Growth & Development                                   5   \n",
       "History                                                5   \n",
       "Immunology                                             7   \n",
       "Injuries                                              10   \n",
       "Innervation                                            9   \n",
       "Instrumentation                                       17   \n",
       "Isolation & Purification                               7   \n",
       "Legislation & Jurisprudence                           15   \n",
       "Metabolism                                             5   \n",
       "Methods                                                9   \n",
       "Microbiology                                           8   \n",
       "Mortality                                              5   \n",
       "Nursing                                                6   \n",
       "Organization & Administration                         10   \n",
       "Parasitology                                           9   \n",
       "Pathogenicity                                          4   \n",
       "Pathology                                              7   \n",
       "Pharmacokinetics                                       1   \n",
       "Pharmacology                                           6   \n",
       "Physiology                                            13   \n",
       "Physiopathology                                        6   \n",
       "Poisoning                                              7   \n",
       "Prevention & Control                                  10   \n",
       "Psychology                                             4   \n",
       "Radiation Effects                                     23   \n",
       "Radiotherapy                                          17   \n",
       "Rehabilitation                                        10   \n",
       "Secondary                                              6   \n",
       "Standards                                             10   \n",
       "Statistics & Numerical Data                            4   \n",
       "Supply & Distribution                                 10   \n",
       "Surgery                                                7   \n",
       "Therapeutic Use                                        6   \n",
       "Therapy                                                3   \n",
       "Toxicity                                               7   \n",
       "Transmission                                          15   \n",
       "Transplantation                                       11   \n",
       "Trends                                                 1   \n",
       "Ultrastructure                                         7   \n",
       "Urine                                                  6   \n",
       "Veterinary                                             2   \n",
       "Virology                                               4   \n",
       "\n",
       "                               qiaojin/PubMedQA:pqa_labeled  \\\n",
       "Abnormalities                                             1   \n",
       "Administration & Dosage                                   0   \n",
       "Adverse Effects                                           2   \n",
       "Agonists                                                  0   \n",
       "Analogs & Derivatives                                     0   \n",
       "Analysis                                                  2   \n",
       "Anatomy & Histology                                       0   \n",
       "Antagonists & Inhibitors                                  0   \n",
       "Biosynthesis                                              0   \n",
       "Blood                                                     1   \n",
       "Blood Supply                                              0   \n",
       "Cerebrospinal Fluid                                       0   \n",
       "Chemical Synthesis                                        0   \n",
       "Chemically Induced                                        0   \n",
       "Chemistry                                                 0   \n",
       "Classification                                            0   \n",
       "Complications                                             1   \n",
       "Congenital                                                0   \n",
       "Cytology                                                  0   \n",
       "Deficiency                                                1   \n",
       "Diagnosis                                                 1   \n",
       "Diagnostic Imaging                                        4   \n",
       "Diet Therapy                                              1   \n",
       "Drug Effects                                              0   \n",
       "Drug Therapy                                              0   \n",
       "Economics                                                 1   \n",
       "Education                                                 1   \n",
       "Embryology                                                0   \n",
       "Enzymology                                                0   \n",
       "Epidemiology                                              0   \n",
       "Ethics                                                    0   \n",
       "Ethnology                                                 0   \n",
       "Etiology                                                  1   \n",
       "Genetics                                                  0   \n",
       "Growth & Development                                      1   \n",
       "History                                                   0   \n",
       "Immunology                                                0   \n",
       "Injuries                                                  0   \n",
       "Innervation                                               0   \n",
       "Instrumentation                                           0   \n",
       "Isolation & Purification                                  0   \n",
       "Legislation & Jurisprudence                               0   \n",
       "Metabolism                                                0   \n",
       "Methods                                                   1   \n",
       "Microbiology                                              0   \n",
       "Mortality                                                 0   \n",
       "Nursing                                                   0   \n",
       "Organization & Administration                             1   \n",
       "Parasitology                                              0   \n",
       "Pathogenicity                                             0   \n",
       "Pathology                                                 0   \n",
       "Pharmacokinetics                                          0   \n",
       "Pharmacology                                              0   \n",
       "Physiology                                                1   \n",
       "Physiopathology                                           1   \n",
       "Poisoning                                                 0   \n",
       "Prevention & Control                                      0   \n",
       "Psychology                                                1   \n",
       "Radiation Effects                                         0   \n",
       "Radiotherapy                                              0   \n",
       "Rehabilitation                                            0   \n",
       "Secondary                                                 0   \n",
       "Standards                                                 0   \n",
       "Statistics & Numerical Data                               1   \n",
       "Supply & Distribution                                     0   \n",
       "Surgery                                                   2   \n",
       "Therapeutic Use                                           0   \n",
       "Therapy                                                   2   \n",
       "Toxicity                                                  0   \n",
       "Transmission                                              0   \n",
       "Transplantation                                           0   \n",
       "Trends                                                    0   \n",
       "Ultrastructure                                            0   \n",
       "Urine                                                     0   \n",
       "Veterinary                                                0   \n",
       "Virology                                                  0   \n",
       "\n",
       "                               openlifescienceai/headqa  \\\n",
       "Abnormalities                                         0   \n",
       "Administration & Dosage                               3   \n",
       "Adverse Effects                                       2   \n",
       "Agonists                                              1   \n",
       "Analogs & Derivatives                                 0   \n",
       "Analysis                                              2   \n",
       "Anatomy & Histology                                   3   \n",
       "Antagonists & Inhibitors                              2   \n",
       "Biosynthesis                                          3   \n",
       "Blood                                                 0   \n",
       "Blood Supply                                          0   \n",
       "Cerebrospinal Fluid                                   0   \n",
       "Chemical Synthesis                                    5   \n",
       "Chemically Induced                                    1   \n",
       "Chemistry                                             7   \n",
       "Classification                                        5   \n",
       "Complications                                         2   \n",
       "Congenital                                            0   \n",
       "Cytology                                              1   \n",
       "Deficiency                                            2   \n",
       "Diagnosis                                             3   \n",
       "Diagnostic Imaging                                    2   \n",
       "Diet Therapy                                          2   \n",
       "Drug Effects                                          0   \n",
       "Drug Therapy                                          4   \n",
       "Economics                                             1   \n",
       "Education                                             1   \n",
       "Embryology                                            1   \n",
       "Enzymology                                            1   \n",
       "Epidemiology                                          3   \n",
       "Ethics                                                2   \n",
       "Ethnology                                             0   \n",
       "Etiology                                              5   \n",
       "Genetics                                              2   \n",
       "Growth & Development                                  1   \n",
       "History                                               2   \n",
       "Immunology                                            2   \n",
       "Injuries                                              1   \n",
       "Innervation                                           1   \n",
       "Instrumentation                                       0   \n",
       "Isolation & Purification                              0   \n",
       "Legislation & Jurisprudence                           1   \n",
       "Metabolism                                            4   \n",
       "Methods                                               3   \n",
       "Microbiology                                          0   \n",
       "Mortality                                             0   \n",
       "Nursing                                               5   \n",
       "Organization & Administration                         3   \n",
       "Parasitology                                          1   \n",
       "Pathogenicity                                         0   \n",
       "Pathology                                             2   \n",
       "Pharmacokinetics                                      6   \n",
       "Pharmacology                                          1   \n",
       "Physiology                                            5   \n",
       "Physiopathology                                       3   \n",
       "Poisoning                                             0   \n",
       "Prevention & Control                                  1   \n",
       "Psychology                                            1   \n",
       "Radiation Effects                                     0   \n",
       "Radiotherapy                                          0   \n",
       "Rehabilitation                                        1   \n",
       "Secondary                                             0   \n",
       "Standards                                             0   \n",
       "Statistics & Numerical Data                           2   \n",
       "Supply & Distribution                                 0   \n",
       "Surgery                                               1   \n",
       "Therapeutic Use                                       1   \n",
       "Therapy                                               3   \n",
       "Toxicity                                              0   \n",
       "Transmission                                          1   \n",
       "Transplantation                                       0   \n",
       "Trends                                                0   \n",
       "Ultrastructure                                        0   \n",
       "Urine                                                 0   \n",
       "Veterinary                                            0   \n",
       "Virology                                              5   \n",
       "\n",
       "                               GBaker/MedQA-USMLE-4-options  \n",
       "Abnormalities                                            10  \n",
       "Administration & Dosage                                   6  \n",
       "Adverse Effects                                           9  \n",
       "Agonists                                                  2  \n",
       "Analogs & Derivatives                                     1  \n",
       "Analysis                                                  2  \n",
       "Anatomy & Histology                                      10  \n",
       "Antagonists & Inhibitors                                  7  \n",
       "Biosynthesis                                              4  \n",
       "Blood                                                     7  \n",
       "Blood Supply                                              6  \n",
       "Cerebrospinal Fluid                                       4  \n",
       "Chemical Synthesis                                        0  \n",
       "Chemically Induced                                        5  \n",
       "Chemistry                                                 0  \n",
       "Classification                                            0  \n",
       "Complications                                             6  \n",
       "Congenital                                                7  \n",
       "Cytology                                                  3  \n",
       "Deficiency                                                6  \n",
       "Diagnosis                                                 2  \n",
       "Diagnostic Imaging                                        7  \n",
       "Diet Therapy                                              2  \n",
       "Drug Effects                                             11  \n",
       "Drug Therapy                                              1  \n",
       "Economics                                                 1  \n",
       "Education                                                 0  \n",
       "Embryology                                                7  \n",
       "Enzymology                                                4  \n",
       "Epidemiology                                              4  \n",
       "Ethics                                                    5  \n",
       "Ethnology                                                 0  \n",
       "Etiology                                                  4  \n",
       "Genetics                                                  4  \n",
       "Growth & Development                                      0  \n",
       "History                                                   0  \n",
       "Immunology                                               10  \n",
       "Injuries                                                 12  \n",
       "Innervation                                               3  \n",
       "Instrumentation                                           0  \n",
       "Isolation & Purification                                  0  \n",
       "Legislation & Jurisprudence                               0  \n",
       "Metabolism                                                1  \n",
       "Methods                                                   1  \n",
       "Microbiology                                              5  \n",
       "Mortality                                                 3  \n",
       "Nursing                                                   0  \n",
       "Organization & Administration                             0  \n",
       "Parasitology                                              5  \n",
       "Pathogenicity                                             5  \n",
       "Pathology                                                11  \n",
       "Pharmacokinetics                                          3  \n",
       "Pharmacology                                              5  \n",
       "Physiology                                                5  \n",
       "Physiopathology                                           5  \n",
       "Poisoning                                                 5  \n",
       "Prevention & Control                                      6  \n",
       "Psychology                                                0  \n",
       "Radiation Effects                                         1  \n",
       "Radiotherapy                                              0  \n",
       "Rehabilitation                                            0  \n",
       "Secondary                                                 2  \n",
       "Standards                                                 2  \n",
       "Statistics & Numerical Data                               5  \n",
       "Supply & Distribution                                     0  \n",
       "Surgery                                                   5  \n",
       "Therapeutic Use                                           5  \n",
       "Therapy                                                   5  \n",
       "Toxicity                                                  2  \n",
       "Transmission                                              1  \n",
       "Transplantation                                           0  \n",
       "Trends                                                    0  \n",
       "Ultrastructure                                            0  \n",
       "Urine                                                     8  \n",
       "Veterinary                                                0  \n",
       "Virology                                                  6  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "repo_id = (\n",
    "    \"anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval-domain_source_1k-150325\"\n",
    ")\n",
    "repo_id_split = \"train\"\n",
    "\n",
    "domain_repo_id = \"anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval-tokenized-120325-extract-domain-code\"\n",
    "domain_repo_id_split = \"train\"\n",
    "stats_domain(repo_id, domain_repo_id, repo_id_split, domain_repo_id_split)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "anonymous-med_sipf",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
