{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/data1/anonymous/codes/med-sipf\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "os.chdir(\"../..\")\n",
    "print(os.getcwd())\n",
    "import dotenv\n",
    "\n",
    "dotenv.load_dotenv(override=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data1/anonymous/misc/miniconda3/envs/m1/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import datasets\n",
    "from collections import Counter\n",
    "from pprint import pformat\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "\n",
    "\n",
    "def stat_domain_dataset(dataset, path, is_domain=False):\n",
    "    source_list = dataset[\"source\"]\n",
    "    source_counter = Counter(source_list)\n",
    "    source_counter = {k: v for k, v in source_counter.items()}\n",
    "\n",
    "    try:\n",
    "        domain_counter = Counter(dataset[\"domain_name\"])\n",
    "        domain_counter = {k: v for k, v in domain_counter.items()}\n",
    "    except KeyError:\n",
    "        domain_counter = None\n",
    "\n",
    "    num_samples = len(dataset)\n",
    "    source_counter_perecent = {k: v / num_samples for k, v in source_counter.items()}\n",
    "\n",
    "    print(\n",
    "        f\"{path}\\n{dataset}\\n{pformat(source_counter)}\\n{pformat(source_counter_perecent)}\\n{pformat(domain_counter)}\\n\"\n",
    "    )\n",
    "\n",
    "    source_counter[\"path\"]= path\n",
    "    source_counter = [source_counter]\n",
    "    source_counter_df = pd.DataFrame(source_counter)\n",
    "    # sort the columns\n",
    "    source_counter_df = source_counter_df.reindex(sorted(source_counter_df.columns), axis=1)\n",
    "    \n",
    "    Path(\"misc\").mkdir(parents=True, exist_ok=True)\n",
    "    source_counter_df.to_csv(\"misc/dataset_stats.tsv\", sep=\"\\t\", index=False, mode=\"a\")\n",
    "\n",
    "    if domain_counter is not None:\n",
    "        domain_counter[\"path\"]= path\n",
    "        domain_counter = [domain_counter]\n",
    "        domain_counter_df = pd.DataFrame(domain_counter)\n",
    "        # sort the columns\n",
    "        domain_counter_df = domain_counter_df.reindex(sorted(domain_counter_df.columns), axis=1)\n",
    "        \n",
    "        Path(\"misc\").mkdir(parents=True, exist_ok=True)\n",
    "        domain_counter_df.to_csv(\"misc/domain_stats.tsv\", sep=\"\\t\", index=False, mode=\"a\")\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "anonymous/m196k\n",
      "Dataset({\n",
      "    features: ['answer_idx', 'source', 'metadata', 'prompt', 'answer_letter', 'answer_string'],\n",
      "    num_rows: 196157\n",
      "})\n",
      "{'GBaker/MedQA-USMLE-4-options': 10178,\n",
      " 'openlifescienceai/headqa': 2657,\n",
      " 'openlifescienceai/medmcqa': 182822,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 500}\n",
      "{'GBaker/MedQA-USMLE-4-options': 0.05188700887554357,\n",
      " 'openlifescienceai/headqa': 0.013545272409345575,\n",
      " 'openlifescienceai/medmcqa': 0.9320187400908456,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 0.002548978624265257}\n",
      "None\n",
      "\n",
      "anonymous/m196k-dedup-decon-filter_easy\n",
      "Dataset({\n",
      "    features: ['answer_idx', 'source', 'metadata', 'prompt', 'answer_letter', 'answer_string', 'qwen_7b-extracted_answer_string', 'qwen_7b-model_answer_string', 'qwen_7b-correct', 'qwen_32b-extracted_answer_string', 'qwen_32b-model_answer_string', 'qwen_32b-correct'],\n",
      "    num_rows: 37816\n",
      "})\n",
      "{'GBaker/MedQA-USMLE-4-options': 2099,\n",
      " 'openlifescienceai/headqa': 331,\n",
      " 'openlifescienceai/medmcqa': 35270,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 116}\n",
      "{'GBaker/MedQA-USMLE-4-options': 0.05550560609265919,\n",
      " 'openlifescienceai/headqa': 0.008752908821662788,\n",
      " 'openlifescienceai/medmcqa': 0.9326740004231013,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 0.003067484662576687}\n",
      "None\n",
      "\n",
      "anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong\n",
      "Dataset({\n",
      "    features: ['answer_idx', 'source', 'metadata', 'prompt', 'answer_letter', 'answer_string', 'qwen_7b-extracted_answer_string', 'qwen_7b-model_answer_string', 'qwen_7b-correct', 'qwen_32b-extracted_answer_string', 'qwen_32b-model_answer_string', 'qwen_32b-correct', 'reasoning', 'distilled_answer_string', 'r1-extracted_answer_string', 'r1-model_answer_string', 'r1-correct'],\n",
      "    num_rows: 23504\n",
      "})\n",
      "{'GBaker/MedQA-USMLE-4-options': 1628,\n",
      " 'openlifescienceai/headqa': 209,\n",
      " 'openlifescienceai/medmcqa': 21628,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 39}\n",
      "{'GBaker/MedQA-USMLE-4-options': 0.0692648059904697,\n",
      " 'openlifescienceai/headqa': 0.008892103471749489,\n",
      " 'openlifescienceai/medmcqa': 0.9201837985023825,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 0.00165929203539823}\n",
      "None\n",
      "\n",
      "anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval\n",
      "Dataset({\n",
      "    features: ['answer_idx', 'source', 'metadata', 'prompt', 'answer_letter', 'answer_string', 'qwen_7b-extracted_answer_string', 'qwen_7b-model_answer_string', 'qwen_7b-correct', 'qwen_32b-extracted_answer_string', 'qwen_32b-model_answer_string', 'qwen_32b-correct', 'reasoning', 'distilled_answer_string', 'r1-extracted_answer_string', 'r1-model_answer_string', 'r1-correct'],\n",
      "    num_rows: 23493\n",
      "})\n",
      "{'GBaker/MedQA-USMLE-4-options': 1628,\n",
      " 'openlifescienceai/headqa': 209,\n",
      " 'openlifescienceai/medmcqa': 21628,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 28}\n",
      "{'GBaker/MedQA-USMLE-4-options': 0.06929723747499256,\n",
      " 'openlifescienceai/headqa': 0.008896266973140936,\n",
      " 'openlifescienceai/medmcqa': 0.9206146511726897,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 0.001191844379176776}\n",
      "None\n",
      "\n",
      "anonymous/m1k-random_1k-tokenized\n",
      "Dataset({\n",
      "    features: ['answer_idx', 'source', 'metadata', 'prompt', 'answer_letter', 'answer_string', 'reasoning', 'distilled_answer_string', 'text'],\n",
      "    num_rows: 1000\n",
      "})\n",
      "{'GBaker/MedQA-USMLE-4-options': 61,\n",
      " 'openlifescienceai/headqa': 8,\n",
      " 'openlifescienceai/medmcqa': 929,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 2}\n",
      "{'GBaker/MedQA-USMLE-4-options': 0.061,\n",
      " 'openlifescienceai/headqa': 0.008,\n",
      " 'openlifescienceai/medmcqa': 0.929,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 0.002}\n",
      "None\n",
      "\n",
      "anonymous/m1k-hard_random_1k-tokenized\n",
      "Dataset({\n",
      "    features: ['answer_idx', 'source', 'metadata', 'prompt', 'answer_letter', 'answer_string', 'reasoning', 'distilled_answer_string', 'text'],\n",
      "    num_rows: 1000\n",
      "})\n",
      "{'GBaker/MedQA-USMLE-4-options': 78,\n",
      " 'openlifescienceai/headqa': 10,\n",
      " 'openlifescienceai/medmcqa': 909,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 3}\n",
      "{'GBaker/MedQA-USMLE-4-options': 0.078,\n",
      " 'openlifescienceai/headqa': 0.01,\n",
      " 'openlifescienceai/medmcqa': 0.909,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 0.003}\n",
      "None\n",
      "\n",
      "anonymous/m1k-domain_1k-tokenized\n",
      "Dataset({\n",
      "    features: ['answer_idx', 'source', 'metadata', 'prompt', 'answer_letter', 'answer_string', 'reasoning', 'distilled_answer_string', 'domain_code', 'domain_name', 'text'],\n",
      "    num_rows: 1000\n",
      "})\n",
      "{'GBaker/MedQA-USMLE-4-options': 52,\n",
      " 'openlifescienceai/headqa': 20,\n",
      " 'openlifescienceai/medmcqa': 924,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 4}\n",
      "{'GBaker/MedQA-USMLE-4-options': 0.052,\n",
      " 'openlifescienceai/headqa': 0.02,\n",
      " 'openlifescienceai/medmcqa': 0.924,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 0.004}\n",
      "{'Abnormalities': 14,\n",
      " 'Administration & Dosage': 20,\n",
      " 'Adverse Effects': 15,\n",
      " 'Agonists': 14,\n",
      " 'Analogs & Derivatives': 11,\n",
      " 'Analysis': 10,\n",
      " 'Anatomy & Histology': 20,\n",
      " 'Antagonists & Inhibitors': 11,\n",
      " 'Biosynthesis': 9,\n",
      " 'Blood': 14,\n",
      " 'Blood Supply': 15,\n",
      " 'Cerebrospinal Fluid': 10,\n",
      " 'Chemical Synthesis': 6,\n",
      " 'Chemically Induced': 14,\n",
      " 'Chemistry': 14,\n",
      " 'Classification': 13,\n",
      " 'Complications': 13,\n",
      " 'Congenital': 8,\n",
      " 'Cytology': 13,\n",
      " 'Deficiency': 19,\n",
      " 'Diagnosis': 16,\n",
      " 'Diagnostic Imaging': 14,\n",
      " 'Diet Therapy': 16,\n",
      " 'Drug Effects': 22,\n",
      " 'Drug Therapy': 7,\n",
      " 'Economics': 9,\n",
      " 'Education': 18,\n",
      " 'Embryology': 16,\n",
      " 'Enzymology': 20,\n",
      " 'Epidemiology': 15,\n",
      " 'Ethics': 9,\n",
      " 'Ethnology': 5,\n",
      " 'Etiology': 15,\n",
      " 'Genetics': 12,\n",
      " 'Growth & Development': 13,\n",
      " 'History': 11,\n",
      " 'Immunology': 14,\n",
      " 'Injuries': 20,\n",
      " 'Innervation': 20,\n",
      " 'Instrumentation': 14,\n",
      " 'Isolation & Purification': 7,\n",
      " 'Legislation & Jurisprudence': 14,\n",
      " 'Metabolism': 18,\n",
      " 'Methods': 15,\n",
      " 'Microbiology': 12,\n",
      " 'Mortality': 10,\n",
      " 'Nursing': 11,\n",
      " 'Organization & Administration': 17,\n",
      " 'Parasitology': 18,\n",
      " 'Pathogenicity': 9,\n",
      " 'Pathology': 18,\n",
      " 'Pharmacokinetics': 16,\n",
      " 'Pharmacology': 18,\n",
      " 'Physiology': 20,\n",
      " 'Physiopathology': 14,\n",
      " 'Poisoning': 11,\n",
      " 'Prevention & Control': 15,\n",
      " 'Psychology': 15,\n",
      " 'Radiation Effects': 18,\n",
      " 'Radiotherapy': 10,\n",
      " 'Rehabilitation': 11,\n",
      " 'Secondary': 11,\n",
      " 'Standards': 11,\n",
      " 'Statistics & Numerical Data': 13,\n",
      " 'Supply & Distribution': 10,\n",
      " 'Surgery': 12,\n",
      " 'Therapeutic Use': 15,\n",
      " 'Therapy': 12,\n",
      " 'Toxicity': 12,\n",
      " 'Transmission': 9,\n",
      " 'Transplantation': 16,\n",
      " 'Trends': 1,\n",
      " 'Ultrastructure': 7,\n",
      " 'Urine': 12,\n",
      " 'Veterinary': 2,\n",
      " 'Virology': 11}\n",
      "\n",
      "anonymous/m1k-tokenized\n",
      "Dataset({\n",
      "    features: ['answer_idx', 'source', 'metadata', 'prompt', 'answer_letter', 'answer_string', 'reasoning', 'distilled_answer_string', 'domain_code', 'domain_name', 'text'],\n",
      "    num_rows: 1000\n",
      "})\n",
      "{'GBaker/MedQA-USMLE-4-options': 274,\n",
      " 'openlifescienceai/headqa': 123,\n",
      " 'openlifescienceai/medmcqa': 575,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 28}\n",
      "{'GBaker/MedQA-USMLE-4-options': 0.274,\n",
      " 'openlifescienceai/headqa': 0.123,\n",
      " 'openlifescienceai/medmcqa': 0.575,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 0.028}\n",
      "{'Abnormalities': 21,\n",
      " 'Administration & Dosage': 18,\n",
      " 'Adverse Effects': 20,\n",
      " 'Agonists': 16,\n",
      " 'Analogs & Derivatives': 11,\n",
      " 'Analysis': 12,\n",
      " 'Anatomy & Histology': 24,\n",
      " 'Antagonists & Inhibitors': 15,\n",
      " 'Biosynthesis': 13,\n",
      " 'Blood': 20,\n",
      " 'Blood Supply': 14,\n",
      " 'Cerebrospinal Fluid': 14,\n",
      " 'Chemical Synthesis': 6,\n",
      " 'Chemically Induced': 14,\n",
      " 'Chemistry': 15,\n",
      " 'Classification': 13,\n",
      " 'Complications': 12,\n",
      " 'Congenital': 11,\n",
      " 'Cytology': 22,\n",
      " 'Deficiency': 12,\n",
      " 'Diagnosis': 10,\n",
      " 'Diagnostic Imaging': 24,\n",
      " 'Diet Therapy': 8,\n",
      " 'Drug Effects': 20,\n",
      " 'Drug Therapy': 10,\n",
      " 'Economics': 14,\n",
      " 'Education': 14,\n",
      " 'Embryology': 14,\n",
      " 'Enzymology': 7,\n",
      " 'Epidemiology': 11,\n",
      " 'Ethics': 9,\n",
      " 'Ethnology': 5,\n",
      " 'Etiology': 13,\n",
      " 'Genetics': 9,\n",
      " 'Growth & Development': 7,\n",
      " 'History': 7,\n",
      " 'Immunology': 19,\n",
      " 'Injuries': 23,\n",
      " 'Innervation': 13,\n",
      " 'Instrumentation': 17,\n",
      " 'Isolation & Purification': 7,\n",
      " 'Legislation & Jurisprudence': 16,\n",
      " 'Metabolism': 10,\n",
      " 'Methods': 14,\n",
      " 'Microbiology': 13,\n",
      " 'Mortality': 8,\n",
      " 'Nursing': 11,\n",
      " 'Organization & Administration': 14,\n",
      " 'Parasitology': 15,\n",
      " 'Pathogenicity': 9,\n",
      " 'Pathology': 20,\n",
      " 'Pharmacokinetics': 10,\n",
      " 'Pharmacology': 12,\n",
      " 'Physiology': 24,\n",
      " 'Physiopathology': 15,\n",
      " 'Poisoning': 12,\n",
      " 'Prevention & Control': 17,\n",
      " 'Psychology': 6,\n",
      " 'Radiation Effects': 24,\n",
      " 'Radiotherapy': 17,\n",
      " 'Rehabilitation': 11,\n",
      " 'Secondary': 8,\n",
      " 'Standards': 12,\n",
      " 'Statistics & Numerical Data': 12,\n",
      " 'Supply & Distribution': 10,\n",
      " 'Surgery': 15,\n",
      " 'Therapeutic Use': 12,\n",
      " 'Therapy': 13,\n",
      " 'Toxicity': 9,\n",
      " 'Transmission': 17,\n",
      " 'Transplantation': 11,\n",
      " 'Trends': 1,\n",
      " 'Ultrastructure': 7,\n",
      " 'Urine': 14,\n",
      " 'Veterinary': 2,\n",
      " 'Virology': 15}\n",
      "\n",
      "anonymous/m1k-random_23k-tokenized\n",
      "Dataset({\n",
      "    features: ['answer_idx', 'source', 'metadata', 'prompt', 'answer_letter', 'answer_string', 'reasoning', 'distilled_answer_string', 'text'],\n",
      "    num_rows: 23493\n",
      "})\n",
      "{'GBaker/MedQA-USMLE-4-options': 1316,\n",
      " 'openlifescienceai/headqa': 317,\n",
      " 'openlifescienceai/medmcqa': 21831,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 29}\n",
      "{'GBaker/MedQA-USMLE-4-options': 0.056016685821308476,\n",
      " 'openlifescienceai/headqa': 0.0134933810071085,\n",
      " 'openlifescienceai/medmcqa': 0.9292555229217213,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 0.001234410249861661}\n",
      "None\n",
      "\n",
      "anonymous/m23k-tokenized\n",
      "Dataset({\n",
      "    features: ['answer_idx', 'source', 'metadata', 'prompt', 'answer_letter', 'answer_string', 'reasoning', 'distilled_answer_string', 'text'],\n",
      "    num_rows: 23493\n",
      "})\n",
      "{'GBaker/MedQA-USMLE-4-options': 1628,\n",
      " 'openlifescienceai/headqa': 209,\n",
      " 'openlifescienceai/medmcqa': 21628,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 28}\n",
      "{'GBaker/MedQA-USMLE-4-options': 0.06929723747499256,\n",
      " 'openlifescienceai/headqa': 0.008896266973140936,\n",
      " 'openlifescienceai/medmcqa': 0.9206146511726897,\n",
      " 'qiaojin/PubMedQA:pqa_labeled': 0.001191844379176776}\n",
      "None\n",
      "\n"
     ]
    }
   ],
   "source": [
    "dataset_path_list = [\n",
    "    # source\n",
    "    \"anonymous/m196k\",\n",
    "    \"anonymous/m196k-dedup-decon-filter_easy\",\n",
    "    \"anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong\",\n",
    "    \"anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval\",\n",
    "    # 1k\n",
    "    \"anonymous/m1k-random_1k-tokenized\",\n",
    "    \"anonymous/m1k-hard_random_1k-tokenized\",\n",
    "    \"anonymous/m1k-domain_1k-tokenized\",\n",
    "    \"anonymous/m1k-tokenized\",\n",
    "    # 23k\n",
    "    \"anonymous/m1k-random_23k-tokenized\",\n",
    "    \"anonymous/m23k-tokenized\",\n",
    "]\n",
    "split = \"train\"\n",
    "\n",
    "for dataset_path in dataset_path_list:\n",
    "    dataset = datasets.load_dataset(dataset_path, split=split)\n",
    "    stat_domain_dataset(dataset, dataset_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "m1",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
