{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/opt/dlami/nvme/anonymous/codes/med-sipf\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# Move the working directory two levels up from where the kernel started.\n",
    "# The starting directory is remembered on the first run so that re-running this\n",
    "# cell in the same kernel lands in the same place instead of climbing two more\n",
    "# levels each time (a bare relative chdir is not idempotent).\n",
    "if \"_NOTEBOOK_START_DIR\" not in globals():\n",
    "    _NOTEBOOK_START_DIR = os.getcwd()\n",
    "os.chdir(os.path.join(_NOTEBOOK_START_DIR, \"..\", \"..\"))\n",
    "print(os.getcwd())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import dotenv\n",
    "dotenv.load_dotenv(override=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/dlami/nvme/anonymous/misc/miniconda3/envs/anonymous-med_sipf/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "import json\n",
    "from typing import List, Union\n",
    "from pprint import pprint\n",
    "import seaborn as sns\n",
    "import datasets\n",
    "import tqdm\n",
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from hashlib import sha256\n",
    "\n",
    "\n",
    "def get_str_hash(input_string: str) -> str:\n",
    "    \"\"\"Return a short, deterministic fingerprint of ``input_string``.\n",
    "\n",
    "    The string is encoded with ``str.encode()`` (UTF-8 by default), hashed with\n",
    "    SHA-256, and the first 16 hexadecimal characters of the digest are returned.\n",
    "    \"\"\"\n",
    "    full_hex_digest = sha256(input_string.encode()).hexdigest()\n",
    "    return full_hex_digest[:16]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map: 100%|██████████| 23493/23493 [00:02<00:00, 7859.71 examples/s]\n"
     ]
    }
   ],
   "source": [
    "repo_id = \"anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval\"\n",
    "split = \"train\"\n",
    "\n",
    "\n",
    "def add_prompt_hash(sample):\n",
    "    \"\"\"Build the ``prompt_hash`` column: the short hash of the sample's ``prompt`` text.\"\"\"\n",
    "    prompt_text = sample[\"prompt\"]\n",
    "    return {\"prompt_hash\": get_str_hash(prompt_text)}\n",
    "\n",
    "\n",
    "# Load the requested split, then attach a ``prompt_hash`` column to every row.\n",
    "dataset = datasets.load_dataset(repo_id, split=split)\n",
    "dataset = dataset.map(add_prompt_hash)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map: 100%|██████████| 23493/23493 [00:02<00:00, 8068.72 examples/s]\n"
     ]
    }
   ],
   "source": [
    "domain_repo_id = \"anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval-tokenized-120325-extract-domain-code\"\n",
    "split = \"train\"\n",
    "\n",
    "# Load the second dataset and give it the same ``prompt_hash`` key as ``dataset``.\n",
    "prompt_domain_dataset = datasets.load_dataset(domain_repo_id, split=split)\n",
    "prompt_domain_dataset = prompt_domain_dataset.map(add_prompt_hash)\n",
    "\n",
    "# {prompt_hash: {column: value, ...}} built from the rows of the domain dataset.\n",
    "prompt_domain_mapping = (\n",
    "    prompt_domain_dataset.to_pandas()\n",
    "    .set_index(\"prompt_hash\")\n",
    "    .to_dict(orient=\"index\")\n",
    ")\n",
    "\n",
    "\n",
    "def add_domain(sample):\n",
    "    \"\"\"Look up the domain-dataset row that shares this sample's ``prompt_hash``.\n",
    "\n",
    "    Returns the dict stored in ``prompt_domain_mapping`` for that hash, i.e. every\n",
    "    column of ``prompt_domain_dataset`` except ``prompt_hash`` itself.\n",
    "    Raises ``KeyError`` when the hash has no entry in the mapping.\n",
    "    \"\"\"\n",
    "    prompt_hash = sample[\"prompt_hash\"]\n",
    "    return prompt_domain_mapping[prompt_hash]\n",
    "\n",
    "\n",
    "dataset = dataset.map(add_domain)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'answer_idx': 2,\n",
       " 'source': 'GBaker/MedQA-USMLE-4-options',\n",
       " 'metadata': '{\\'question\\': \"A mother brings her 3-week-old infant to the pediatrician\\'s office because she is concerned about his feeding habits. He was born without complications and has not had any medical problems up until this time. However, for the past 4 days, he has been fussy, is regurgitating all of his feeds, and his vomit is yellow in color. On physical exam, the child\\'s abdomen is minimally distended but no other abnormalities are appreciated. Which of the following embryologic errors could account for this presentation?\", \\'answer\\': \\'Abnormal migration of ventral pancreatic bud\\', \\'options\\': {\\'A\\': \\'Abnormal migration of ventral pancreatic bud\\', \\'B\\': \\'Complete failure of proximal duodenum to recanalize\\', \\'C\\': \\'Abnormal hypertrophy of the pylorus\\', \\'D\\': \\'Failure of lateral body folds to move ventrally and fuse in the midline\\'}, \\'meta_info\\': \\'step1\\', \\'answer_idx\\': \\'A\\', \\'metamap_phrases\\': [\\'mother\\', \\'week old infant\\', \"pediatrician\\'s office\", \\'concerned\\', \\'feeding habits\\', \\'born\\', \\'complications\\', \\'not\\', \\'medical problems\\', \\'time\\', \\'past\\', \\'days\\', \\'fussy\\', \\'regurgitating\\', \\'feeds\\', \\'vomit\\', \\'yellow\\', \\'color\\', \\'physical exam\\', \"child\\'s abdomen\", \\'distended\\', \\'abnormalities\\', \\'following embryologic errors\\', \\'account\\', \\'presentation\\']}',\n",
       " 'prompt': \"A mother brings her 3-week-old infant to the pediatrician's office because she is concerned about his feeding habits. He was born without complications and has not had any medical problems up until this time. However, for the past 4 days, he has been fussy, is regurgitating all of his feeds, and his vomit is yellow in color. On physical exam, the child's abdomen is minimally distended but no other abnormalities are appreciated. Which of the following embryologic errors could account for this presentation?\\nA. Abnormal hypertrophy of the pylorus\\nB. Failure of lateral body folds to move ventrally and fuse in the midline\\nC. Abnormal migration of ventral pancreatic bud\\nD. Complete failure of proximal duodenum to recanalize\",\n",
       " 'answer_letter': 'C',\n",
       " 'answer_string': 'Abnormal migration of ventral pancreatic bud',\n",
       " 'qwen_7b-extracted_answer_string': 'A. Abnormal hypertrophy of the pylorus',\n",
       " 'qwen_7b-model_answer_string': '\\\\boxed{A. Abnormal hypertrophy of the pylorus}\\n\\nThe symptoms described—fussiness, regurgitation of feeds, and yellow vomit—are consistent with pyloric stenosis, which is an embryologic error involving the failure of the pylorus to develop properly during fetal development. The pylorus is the opening between the stomach and the small intestine. An abnormally thickened pylorus can lead to obstruction, making it difficult for food to pass from the stomach into the small intestine, causing symptoms like regurgitation and possible jaundice if bile flows back into the stomach and is then vomited.',\n",
       " 'qwen_7b-correct': False,\n",
       " 'qwen_32b-extracted_answer_string': 'D',\n",
       " 'qwen_32b-model_answer_string': 'The clinical presentation described, which includes fussiness, regurgitation of all feeds, and yellow-colored vomit in a 3-week-old infant, along with a minimally distended abdomen, points towards a mechanical obstruction affecting the upper gastrointestinal tract. The regurgitation of feeds and the yellowness of vomit can be interpreted as bile present in the vomitus, suggesting that the obstruction is downstream of the ampulla of Vater, which is where the common bile duct enters the duodenum.\\n\\nGiven the options provided:\\n\\nA. Abnormal hypertrophy of the pylorus - This condition is known as hypertrophic pyloric stenosis and typically does not present with bile-stained (yellow) vomit, as the bile would be blocked by the narrowed pylorus itself.\\n\\nB. Failure of lateral body folds to move ventrally and fuse in the midline - This embryologic issue results in the condition called diaphragmatic hernia, where abdominal organs herniate into the chest. It usually causes more severe respiratory issues in newborns due to the displacement of the lungs.\\n\\nC. Abnormal migration of the ventral pancreatic bud - While defects involving the pancreatic migration can affect the gastrointestinal tract, it is not a common cause for the specific symptoms described in this newborn.\\n\\nD. Complete failure of proximal duodenum to recanalize - This is known as duodenal atresia and typically results in bilious vomiting due to obstruction at or above the duodenum, which matches the clinical presentation of the infant in the question.\\n\\nThus, based on the symptoms and embryologic errors, the correct answer is D. Complete failure of proximal duodenum to recanalize.\\n\\n\\\\boxed{D}',\n",
       " 'qwen_32b-correct': False,\n",
       " 'reasoning': 'Okay, let me try to work through this question. Let\\'s start by reading it carefully.\\n\\nSo, the scenario is a 3-week-old infant with fussiness, regurgitating all feeds, and yellow vomit for the past 4 days. The baby was born without complications and no prior medical issues. On physical exam, the abdomen is minimally distended but otherwise normal. The question is asking which embryologic error could account for this presentation. The options are A through D: hypertrophic pylorus, failure of lateral body folds to ventrally fuse, abnormal ventral pancreatic bud migration, or complete failure of proximal duodenum to recanalize.\\n\\nFirst, the symptoms. Yellow vomit—bilious vomiting, right? So that\\'s a sign of possible intestinal obstruction distal to the ampulla of Vater, where bile is introduced into the intestine. So in a newborn, bilious vomiting makes me think of intestinal obstruction.\\n\\nThe baby is 3 weeks old. The fact that this started around 3 weeks makes me think about common conditions that present in the first few weeks. The abrupt onset over 4 days—so maybe 3-day-old baby started to have issues at age about 19 days. Let me think: conditions like pyloric stenosis usually present between 2-8 weeks, with non-bilious vomiting. But in this case, it\\'s bilious. So pyloric stenosis (option A) usually causes projectile, non-bilious vomiting. The vomit here being yellow suggests bile, so maybe obstruction lower down, like in the duodenum or beyond.\\n\\nExam: abdomen is minimally distended. Not severely distended. So perhaps an upper GI obstruction. Let\\'s think about possible embryological issues causing obstruction.\\n\\nThe options given:\\n\\nA. Abnormal hypertrophy of the pylorus: that\\'s pyloric stenosis. But the vomiting in pyloric stenosis would typically be non-bilious because the obstruction is before the ampulla. So if the vomit is yellow (bile), pyloric stenosis might not be the answer here.\\n\\nB. 
Failure of lateral body folds to move ventrally and fuse in the midline. Wait, lateral body folds in embryology... The lateral body folds contribute to forming the abdominal wall. Failure to fuse would lead to defects like gastroschisis, where the abdominal organs are outside the body without a covering sac. But gastroschisis is usually apparent at birth, not presenting with vomiting at 3 weeks. So maybe B is not correct.\\n\\nC. Abnormal migration of ventral pancreatic bud. The ventral pancreatic bud forms part of the pancreas and the duct. If it migrates abnormally, could lead to annular pancreas. Annular pancreas is a ring of pancreatic tissue around the duodenum, causing obstruction. So annular pancreas could present with duodenal obstruction. That sounds possible. But then when does annular pancreas present? It might present in infancy with vomiting. The vomiting could be bilious if the obstruction is distal to the ampulla. Hmm. So that\\'s option C.\\n\\nD. Complete failure of proximal duodenum to recanalize. The duodenum during development goes through a solid stage and then recanalizes. If that doesn\\'t happen, stenosis or atresia can occur. Duodenal atresia usually presents with polyhydramnios in pregnancy and vomiting in the first days of life. The vomiting might be bilious or non-bilious depending on the location relative to the ampulla. But classically, duodenal atresia is considered as a double bubble sign on X-ray, with distension of the stomach and proximal duodenum. However, since this baby is 3 weeks old, if there was complete failure to recanalize (the most severe form), they would likely present earlier, not at 3 weeks. But maybe with a partial obstruction (stenosis), which could present a bit later. But option D says \\'complete failure of proximal duodenum to recanalize\\', which would probably cause symptoms right after birth. So in this case, the baby started having issues at 3 weeks, maybe not immediately? 
Or maybe with a partial recanalization... Wait, but the answer options are probably for the obstruction described. Let\\'s think about presentations.\\n\\nAnother thought: hypertrophic pyloric stenosis (A) causes nonbilious vomiting, typically not yet present at 3 weeks, but some may develop later. However, in pyloric stenosis, the onset is usually around 3-5 weeks, so this age is right. But the coloring—yellow. Maybe possible if the obstruction is beyond the pylorus but something else. Wait, another possibility: malrotation with volvulus. But malrotation can cause bilious vomiting due to midgut volvulus, but usually acute onset, and more concerning for a surgical emergency. The baby would be really sick. But in this scenario, maybe not? The question is about embryologic errors. Malrotation is a failure of normal rotation of the gut, but does that fit any of the options?\\n\\nLooking back at the options: the given options are A to D as provided. So the answer has to be one of these four.\\n\\nSo, D is duodenal atresia due to failed recanalization. The classic time of presentation is in the first days of life. But perhaps incomplete obstruction could take longer. But the answer option D mentions \"complete failure,\" which suggests a complete atresia. So maybe not, since the baby was okay initially.\\n\\nC is annular pancreas. The annular pancreas can cause obstruction which can present at various times. If the obstruction is partial, maybe vomiting would present later. However, congenital duodenal obstruction due to annular pancreas usually presents in the first days of life. But perhaps with some variability. Maybe in some cases, partial obstruction may present later.\\n\\nOption B—failure of lateral body folds. Hmm. Lateral folds in embryology... The lateral body folds are involved in forming the anterior abdominal wall. Failure to fuse would lead to defects like gastroschisis or omphalocele. 
Gastroschisis is a defect typically to the right of the umbilicus, with intestines herniated, not covered. Omphalocele is failure to return the midgut to the abdomen, covered by a sac. Those would usually be evident at birth. Since those are structural defects, not related to vomiting at 3 weeks. So B is less likely.\\n\\nNow, D—complete duodenal atresia. I think that typically presents very early, often as soon as feeds start. Because the obstruction is complete, you can\\'t keep anything down. We\\'re told the baby is 3 weeks old and symptoms for 4 days, so starting around day 17. With full feeds possibly, maybe if the obstruction is partial, they might present later. Wait, but Option D says \\'complete failure\\' leading to atresia, which would not allow passage. But perhaps in some cases, even with atresia, if the mother is breast-feeding, maybe some passage? Not sure. Classic bilious vomiting in duodenal atresia would be if the atresia is distal to the ampulla of Vater. In 50% of cases it\\'s distal, causing bilious vomiting, and 50% proximal to the ampulla, causing nonbilious.\\n\\nSo, perhaps the presentation in this case is compatible with complete duodenal atresia (D), but why present at 3 weeks? Hmm.\\n\\nAlternatively, another possibility. The \\'yellow vomit\\' could be due to nonbilious emesis that is yellow from curdled milk. However, in pediatric cases, bilious vomiting is green or yellow-green and more concerning. Well, the given answer says \\'yellow in color\\' which could be interpreted as bile versus milk. But assuming the question is indicating bilious then, that would suggest obstruction beyond the ampulla. So, possible causes include midgut volvulus (from malrotation), jejunal atresia, ileal atresia, etc. But those aren\\'t options here. The options are A-D as given.\\n\\nSo between options C and D. Let\\'s see. Complete failure of proximal duodenum to recanalize (D) is duodenal atresia. 
Annular pancreas (C) is associated with failure of the ventral pancreatic bud\\'s migration. So the ventral bud should migrate and rotate around the duodenum to fuse with the dorsal bud. If it ends up surrounding the duodenum, causing annular pancreas, that\\'s option C.\\n\\nAssuming that\\'s correct, annular pancreas can cause duodenal obstruction. The age of presentation can vary, but often in the neonatal period. So in this case, perhaps presenting at three weeks. Now, since the question is about an embryologic error, annular pancreatic is due to abnormal migration of the ventral pancreatic bud (option C). Alternatively, if the duodenum itself never recanalizes (so the proximal part remains atretic), that\\'s D. What\\'s the timing of presentation? Duodenal atresia is usually immediately symptomatic, but annular pancreas can sometimes present later. But not sure. Alternatively, considering a web or stenosis may present slightly later.\\n\\nWait, other consideration: pyloric stenosis (A). The question says \\'abnormal hypertrophy of the pylorus,\\' which is correct. However, pyloric stenosis is typically non-bilious, projectile vomiting. So if a child is regurgitating all feeds with yellow vomit, would you have pyloric stenosis? Unlikely. The answer must be either C or D.\\n\\nAnother thought: the question states \\'complete failure of the proximal duodenum to recanalize.\\' Duodenal atresia is more common in the distal duodenum. But not necessarily.\\n\\nIn any case: duodenal atresia is a consideration here, especially with bilious emesis. Now, why the baby is presenting at three weeks. So maybe some kids with duodenal stenosis (as opposed to complete atresia) would present later? If option D is complete failure, it\\'s atresia and presents early. But maybe the child has a partially obstructing duodenal web (stenosis) which could cause gradual symptoms. 
However, the answer D is \\'complete failure,\\' so it would be atresia.\\n\\nAlternatively, perhaps annular pancreas. Annular pancreas compresses the duodenum and results in obstruction. How soon would that present? If the obstruction is complete, would present soon after birth. If partial, may present later. So at three weeks, maybe.\\n\\nBut the baby\\'s vomiting is yellow, as in bile. Then, depending on the location. If the annular pancreas compresses the duodenum distal to the ampulla, which would be the second part of duodenum, then the baby would have bilious vomiting.\\n\\nIf the annular pancreas is associated with duodenal atresia, both could be present. Hmm, but in that case, if the annulus is not the only problem, but with the atresia, the baby might present earlier.\\n\\nSo in this case, the baby is three weeks old and presented with emesis yellow. Maybe annular pancreas (option C) causing partial obstruction, leading to vomiting starting now. Or maybe duodenal stenosis from incomplete recanalization (option D, but option D is \\'complete failure\\' so perhaps that\\'s more severe).\\n\\nThe options:\\n\\nC. Abnormal migration of ventral pancreatic bud → annular pancreas.\\nD. Complete failure of proximal duodenum to recanalize → duodenal atresia.\\n\\nIf a three-week-old presented with bilious vomiting and signs of upper obstruction, is annular pancreas in the differential? Yes. But how likely versus duodenal atresia.\\n\\nDuodenal atresia, if complete, would likely present in first day or two. Since this baby is older, maybe partial obstruction. But option D\\'s description is complete failure, which would be a complete obstruction. So option D seems less likely considering the age of presentation.\\n\\nOn the other hand, annular pancreas causing an obstruction that might not be complete, so the baby can still tolerate feeds for a little while before developing symptoms.\\n\\nAnother clue in the physical exam: the abdomen is minimally distended. 
If it\\'s a proximal obstruction (like duodenal), there\\'s minimal abdominal distension, whereas a distal obstruction would present with more distension. But the duodenal atresia usually presents with upper abdomen distension (double bubble). But if they have not fed enough, maybe... but in this scenario, maybe. However, annular pancreas causing obstruction in the second part of the duodenum would cause similar results. So X-ray would show a double bubble (gastric and duodenal dilation) in both duodenal atresia and annular pancreas obstruction. Therefore, clinically they might look similar.\\n\\nBut the question here is which embryologic error, so the answer is the underlying defect.\\n\\nAssuming the answer options, C or D, perhaps based on timing.\\n\\nGiven that complete duodenal atresia is usually detected much earlier, it\\'s possible that the issue here is annular pancreas (option C), which may present partially and later, but I\\'m getting conflicting thoughts.\\n\\nAlternatively, perhaps D\\'s answer in the case of incomplete stenosis. But no, D is complete failure, leading to atresia. So unless there\\'s a missed diagnosis, it\\'s possible to present at three weeks if there\\'s misdiagnose, but it isn\\'t common. Maybe the answer is D.\\n\\nWait, but the baby\\'s symptoms started 4 days ago, meaning that in the third week. If complete duodenal atresia, perhaps why did the baby did well for the first few weeks? That seems contradictory. Unless the mother\\'s feeding was minimal? Maybe not. So if it\\'s duodenal atresia, the symptoms would start once feeds increase. The baby may have been able to handle very small amounts but unable as feeding volume increases. Hmm.\\n\\nAlternatively, could it be malrotation? But the options don\\'t include that. 
The question is about choosing between the four given answers.\\n\\nIf I have to choose between C and D: the fact that the baby was born without complications and was okay for several weeks might make D (complete failure to recanalize the duodenum) less likely because complete atresia would typically present earlier. Hence, annular pancreas (option C) might present with partial obstruction, possible in later weeks. However, I\\'m not sure how common that is.\\n\\nAnother angle: think embryology:\\n\\nFailure of recanalization (D) occurs when the duodenal lumen, which is initially a solid cord, fails to recanalize properly, leading to atresia. The solid core forms during the 5th to 6th week and recanalizes by the 8th week. If that fails, atresia.\\n\\nAnnular pancreas (C) is due to the ventral pancreatic bud not migrating properly, leading to a ring of pancreatic tissue around the duodenum.\\n\\nSo in a baby presenting at 3 weeks with bilious vomiting and possible upper obstruction, which of these developmental errors is the cause?\\n\\nIf both can cause similar obstruction, then maybe recanalization. But the timing doesn\\'t fit unless partial.\\n\\nAlternatively, could there be another possibility? Hypertrophy of pylorus (A), but then non-billious. Vomitus is yellow. Unless the bile is coming through a patent pylorus and then regurgitating. Not likely. Classic symptoms of pyloric stenosis are non-bilious.\\n\\nIf it\\'s a question of yellow vomit being interpreted as containing bile, which implies obstruction below the ampulla. Ampullary is located at the second part of duodenum. So annular pancreas wraps around distal to the ampulla. So annular pancreas would compress the second part of the duodenum. If so, yes, the vomiting would be bilious. In that case, answer C.\\n\\nBut if option D were complete failure to recanalize proximal duodenum, would that be above or below the ampulla?\\n\\nWait, the options specify \\'proximal duodenum\\'. 
Proximal duodenum refers to the first part. If the recanalization failed in the proximal duodenum which is before the ampulla (duodenum parts: first, second (has the ampulla), third, and fourth). So failure of recanalization in the proximal duodenum (D) would be before bile duct entry. Hence, vomiting is NON-bilious. Complete failure (atresia) of the first part of duodenum would present with non-bilious vomiting. However, if D is the problem, and since this baby has bilious vomiting (yellow), that can\\'t be D. So if the problem is distal to the ampulla, like in the second part (due to annular pancreas), then Option C which causes obstruction of the second part and hence bilious vomiting.\\n\\nBut in that scenario, the answer would be option C because failing migration of the ventral pancreatic bud causes annular pancreas.\\n\\nHmm, wait a sec. Annular pancreas is associated with duodenal atresia, but in other cases, it\\'s a standalone entity. The actual level of obstruction would dictate the vomiting\\'s nature.\\n\\nNow, in the scenario there\\'s bilious vomiting, which means obstruction distal to the ampulla. So if the atresia is in the second part of duodenum, post-ampulla, then vomiting is bilious. But the answer option D states \\'complete failure of proximal duodenum to recanalize\\'. Proximal would likely be first part, hence pre-ampulla. So, vomiting would be non-bilious. So if the baby has bilious vomiting, which they do (yellow), the obstruction must be post-ampullary. Hence, the embryologic error causing post-ampullary obstruction would be either annular pancreas (if in second part) or another distal etiology.\\n\\nThus, C (abnormal migration of ventral pancreatic bud leading to annular pancreas causing post-ampullary obstruction) would result in bilious vomiting (yellow). 
While failure of recanalization of the proximal duodenum (D) would result in non-bilious.\\n\\nSince the vomit here is yellow, bilious, implies post-ampullary.\\n\\nTherefore, option C. \\n\\nAlternatively, a midgut volvulus (which isn\\'t one of the options). But given the answer choices, maybe it\\'s option C.',\n",
       " 'distilled_answer_string': 'The infant’s presentation of bilious vomiting (yellow vomit) and mild abdominal distension suggests an intestinal obstruction **distal to the ampulla of Vater**, as bile enters the duodenum at this point. The key embryologic errors to consider are:\\n\\n1. **Option C: Abnormal migration of ventral pancreatic bud**  \\n   This error leads to **annular pancreas**, where pancreatic tissue encircles the duodenum, often causing obstruction in the second part of the duodenum (post-ampullary). Annular pancreas can present with bilious vomiting, which aligns with the described symptoms. While it typically manifests neonatally, partial obstruction may lead to delayed presentation (e.g., 3 weeks of age).\\n\\n2. **Option D: Complete failure of proximal duodenum to recanalize**  \\n   This results in **duodenal atresia**, which classically presents with vomiting immediately after birth. \"Proximal duodenum\" refers to the first or second part pre-ampullary, and complete failure would lead to **non-bilious** vomiting. However, the infant’s symptoms began at 3 weeks, making this timing inconsistent with complete atresia. Additionally, bilious vomiting indicates obstruction **distal to the ampulla**, which is less likely in \"proximal duodenal\" atresia.\\n\\n**Option A** (pyloric stenosis) causes **non-bilious** vomiting and does not fit the presentation. **Option B** (failure of lateral folds to fuse) leads to abdominal wall defects (e.g., gastroschisis), which are evident at birth. Thus, **Option C** is the most plausible embryologic error, as annular pancreas can obstruct the post-ampullary duodenum, resulting in bilious vomiting. \\n\\n**Answer:** \\\\boxed{C}',\n",
       " 'r1-extracted_answer_string': 'C',\n",
       " 'r1-model_answer_string': 'The infant’s presentation of bilious vomiting (yellow vomit) and mild abdominal distension suggests an intestinal obstruction **distal to the ampulla of Vater**, as bile enters the duodenum at this point. The key embryologic errors to consider are:\\n\\n1. **Option C: Abnormal migration of ventral pancreatic bud**  \\n   This error leads to **annular pancreas**, where pancreatic tissue encircles the duodenum, often causing obstruction in the second part of the duodenum (post-ampullary). Annular pancreas can present with bilious vomiting, which aligns with the described symptoms. While it typically manifests neonatally, partial obstruction may lead to delayed presentation (e.g., 3 weeks of age).\\n\\n2. **Option D: Complete failure of proximal duodenum to recanalize**  \\n   This results in **duodenal atresia**, which classically presents with vomiting immediately after birth. \"Proximal duodenum\" refers to the first or second part pre-ampullary, and complete failure would lead to **non-bilious** vomiting. However, the infant’s symptoms began at 3 weeks, making this timing inconsistent with complete atresia. Additionally, bilious vomiting indicates obstruction **distal to the ampulla**, which is less likely in \"proximal duodenal\" atresia.\\n\\n**Option A** (pyloric stenosis) causes **non-bilious** vomiting and does not fit the presentation. **Option B** (failure of lateral folds to fuse) leads to abdominal wall defects (e.g., gastroschisis), which are evident at birth. Thus, **Option C** is the most plausible embryologic error, as annular pancreas can obstruct the post-ampullary duodenum, resulting in bilious vomiting. \\n\\n**Answer:** \\\\boxed{C}',\n",
       " 'r1-correct': True,\n",
       " 'prompt_hash': 'cbabb28052dc1fd1',\n",
       " 'domain': 'The question is about a pediatric patient presenting with a possible congenital abnormality leading to vomiting and regurgitation. The focus is on the embryologic errors that could account for the symptoms (e.g., pyloric hypertrophy, duodenal recanalization failure, etc.). This aligns with the subject of \"Embryology\" as it primarily concerns embryologic factors contributing to postnatal disorders.\\n\\n**27**',\n",
       " 'domain_code': '27',\n",
       " 'domain_name': 'Embryology',\n",
       " 'is_domain_code_extracted': True}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 23493/23493 [00:03<00:00, 7565.42it/s]\n"
     ]
    }
   ],
   "source": [
    "# Domain-source sampling\n",
    "# Group samples as domain_source_samples[domain_code][source_name] -> list of samples.\n",
    "domain_source_samples = {}\n",
    "for sample in tqdm.tqdm(dataset):\n",
    "    domain_code = sample[\"domain_code\"]\n",
    "    source_name = sample[\"source\"]\n",
    "    # setdefault creates the nested dict / list the first time a key is seen.\n",
    "    current_domain = domain_source_samples.setdefault(domain_code, {})\n",
    "    current_domain.setdefault(source_name, []).append(sample)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "27\n",
      "  GBaker/MedQA-USMLE-4-options: 12\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 245\n",
      "54\n",
      "  GBaker/MedQA-USMLE-4-options: 92\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/headqa: 3\n",
      "  openlifescienceai/medmcqa: 1093\n",
      "20\n",
      "  GBaker/MedQA-USMLE-4-options: 229\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/headqa: 17\n",
      "  openlifescienceai/medmcqa: 1681\n",
      "50\n",
      "  GBaker/MedQA-USMLE-4-options: 68\n",
      "  openlifescienceai/headqa: 2\n",
      "  openlifescienceai/medmcqa: 1532\n",
      "32\n",
      "  GBaker/MedQA-USMLE-4-options: 133\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/headqa: 5\n",
      "  openlifescienceai/medmcqa: 657\n",
      "65\n",
      "  GBaker/MedQA-USMLE-4-options: 5\n",
      "  qiaojin/PubMedQA:pqa_labeled: 2\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 749\n",
      "53\n",
      "  GBaker/MedQA-USMLE-4-options: 45\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/headqa: 17\n",
      "  openlifescienceai/medmcqa: 1168\n",
      "38\n",
      "  GBaker/MedQA-USMLE-4-options: 10\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 157\n",
      "42\n",
      "  GBaker/MedQA-USMLE-4-options: 15\n",
      "  openlifescienceai/headqa: 5\n",
      "  openlifescienceai/medmcqa: 158\n",
      "02\n",
      "  GBaker/MedQA-USMLE-4-options: 116\n",
      "  qiaojin/PubMedQA:pqa_labeled: 2\n",
      "  openlifescienceai/headqa: 2\n",
      "  openlifescienceai/medmcqa: 936\n",
      "01\n",
      "  GBaker/MedQA-USMLE-4-options: 9\n",
      "  openlifescienceai/headqa: 3\n",
      "  openlifescienceai/medmcqa: 590\n",
      "08\n",
      "  GBaker/MedQA-USMLE-4-options: 4\n",
      "  openlifescienceai/headqa: 3\n",
      "  openlifescienceai/medmcqa: 73\n",
      "19\n",
      "  GBaker/MedQA-USMLE-4-options: 58\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/headqa: 2\n",
      "  openlifescienceai/medmcqa: 245\n",
      "44\n",
      "  GBaker/MedQA-USMLE-4-options: 27\n",
      "  openlifescienceai/medmcqa: 358\n",
      "49\n",
      "  GBaker/MedQA-USMLE-4-options: 10\n",
      "  openlifescienceai/medmcqa: 71\n",
      "24\n",
      "  GBaker/MedQA-USMLE-4-options: 164\n",
      "  openlifescienceai/headqa: 6\n",
      "  openlifescienceai/medmcqa: 509\n",
      "45\n",
      "  GBaker/MedQA-USMLE-4-options: 3\n",
      "  openlifescienceai/medmcqa: 30\n",
      "37\n",
      "  GBaker/MedQA-USMLE-4-options: 12\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 430\n",
      "16\n",
      "  GBaker/MedQA-USMLE-4-options: 73\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/headqa: 2\n",
      "  openlifescienceai/medmcqa: 625\n",
      "17\n",
      "  GBaker/MedQA-USMLE-4-options: 59\n",
      "  openlifescienceai/medmcqa: 382\n",
      "33\n",
      "  GBaker/MedQA-USMLE-4-options: 66\n",
      "  openlifescienceai/headqa: 5\n",
      "  openlifescienceai/medmcqa: 293\n",
      "06\n",
      "  GBaker/MedQA-USMLE-4-options: 17\n",
      "  openlifescienceai/headqa: 3\n",
      "  openlifescienceai/medmcqa: 2328\n",
      "00\n",
      "  GBaker/MedQA-USMLE-4-options: 24\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/medmcqa: 646\n",
      "23\n",
      "  GBaker/MedQA-USMLE-4-options: 12\n",
      "  openlifescienceai/medmcqa: 107\n",
      "07\n",
      "  GBaker/MedQA-USMLE-4-options: 14\n",
      "  openlifescienceai/headqa: 2\n",
      "  openlifescienceai/medmcqa: 113\n",
      "55\n",
      "  GBaker/MedQA-USMLE-4-options: 17\n",
      "  openlifescienceai/medmcqa: 171\n",
      "30\n",
      "  GBaker/MedQA-USMLE-4-options: 8\n",
      "  openlifescienceai/headqa: 2\n",
      "  openlifescienceai/medmcqa: 20\n",
      "09\n",
      "  GBaker/MedQA-USMLE-4-options: 45\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/medmcqa: 360\n",
      "36\n",
      "  GBaker/MedQA-USMLE-4-options: 17\n",
      "  openlifescienceai/headqa: 2\n",
      "  openlifescienceai/medmcqa: 199\n",
      "34\n",
      "  GBaker/MedQA-USMLE-4-options: 8\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/headqa: 3\n",
      "  openlifescienceai/medmcqa: 245\n",
      "52\n",
      "  GBaker/MedQA-USMLE-4-options: 31\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 315\n",
      "56\n",
      "  GBaker/MedQA-USMLE-4-options: 17\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 131\n",
      "18\n",
      "  GBaker/MedQA-USMLE-4-options: 3\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 90\n",
      "67\n",
      "  GBaker/MedQA-USMLE-4-options: 45\n",
      "  qiaojin/PubMedQA:pqa_labeled: 2\n",
      "  openlifescienceai/headqa: 7\n",
      "  openlifescienceai/medmcqa: 373\n",
      "29\n",
      "  GBaker/MedQA-USMLE-4-options: 8\n",
      "  openlifescienceai/headqa: 3\n",
      "  openlifescienceai/medmcqa: 265\n",
      "66\n",
      "  GBaker/MedQA-USMLE-4-options: 5\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 230\n",
      "63\n",
      "  GBaker/MedQA-USMLE-4-options: 5\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/headqa: 2\n",
      "  openlifescienceai/medmcqa: 53\n",
      "21\n",
      "  GBaker/MedQA-USMLE-4-options: 25\n",
      "  qiaojin/PubMedQA:pqa_labeled: 4\n",
      "  openlifescienceai/headqa: 2\n",
      "  openlifescienceai/medmcqa: 720\n",
      "48\n",
      "  GBaker/MedQA-USMLE-4-options: 5\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 209\n",
      "13\n",
      "  GBaker/MedQA-USMLE-4-options: 16\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 82\n",
      "75\n",
      "  GBaker/MedQA-USMLE-4-options: 12\n",
      "  openlifescienceai/headqa: 5\n",
      "  openlifescienceai/medmcqa: 90\n",
      "10\n",
      "  GBaker/MedQA-USMLE-4-options: 19\n",
      "  openlifescienceai/medmcqa: 228\n",
      "57\n",
      "  GBaker/MedQA-USMLE-4-options: 20\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/headqa: 21\n",
      "  openlifescienceai/medmcqa: 165\n",
      "51\n",
      "  GBaker/MedQA-USMLE-4-options: 4\n",
      "  openlifescienceai/headqa: 6\n",
      "  openlifescienceai/medmcqa: 99\n",
      "73\n",
      "  GBaker/MedQA-USMLE-4-options: 11\n",
      "  openlifescienceai/medmcqa: 77\n",
      "11\n",
      "  GBaker/MedQA-USMLE-4-options: 7\n",
      "  openlifescienceai/medmcqa: 34\n",
      "61\n",
      "  GBaker/MedQA-USMLE-4-options: 2\n",
      "  openlifescienceai/medmcqa: 44\n",
      "43\n",
      "  GBaker/MedQA-USMLE-4-options: 1\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/headqa: 3\n",
      "  openlifescienceai/medmcqa: 260\n",
      "68\n",
      "  GBaker/MedQA-USMLE-4-options: 2\n",
      "  openlifescienceai/medmcqa: 29\n",
      "28\n",
      "  GBaker/MedQA-USMLE-4-options: 5\n",
      "  openlifescienceai/headqa: 3\n",
      "  openlifescienceai/medmcqa: 95\n",
      "05\n",
      "  GBaker/MedQA-USMLE-4-options: 2\n",
      "  qiaojin/PubMedQA:pqa_labeled: 2\n",
      "  openlifescienceai/headqa: 14\n",
      "  openlifescienceai/medmcqa: 222\n",
      "69\n",
      "  GBaker/MedQA-USMLE-4-options: 1\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 86\n",
      "03\n",
      "  GBaker/MedQA-USMLE-4-options: 2\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 20\n",
      "22\n",
      "  GBaker/MedQA-USMLE-4-options: 3\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/headqa: 2\n",
      "  openlifescienceai/medmcqa: 41\n",
      "62\n",
      "  GBaker/MedQA-USMLE-4-options: 2\n",
      "  openlifescienceai/medmcqa: 67\n",
      "58\n",
      "  GBaker/MedQA-USMLE-4-options: 1\n",
      "  openlifescienceai/medmcqa: 56\n",
      "04\n",
      "  GBaker/MedQA-USMLE-4-options: 1\n",
      "  openlifescienceai/medmcqa: 10\n",
      "25\n",
      "  GBaker/MedQA-USMLE-4-options: 1\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 17\n",
      "26\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 20\n",
      "47\n",
      "  qiaojin/PubMedQA:pqa_labeled: 1\n",
      "  openlifescienceai/headqa: 3\n",
      "  openlifescienceai/medmcqa: 111\n",
      "46\n",
      "  openlifescienceai/headqa: 5\n",
      "  openlifescienceai/medmcqa: 6\n",
      "35\n",
      "  openlifescienceai/headqa: 2\n",
      "  openlifescienceai/medmcqa: 72\n",
      "15\n",
      "  openlifescienceai/headqa: 5\n",
      "  openlifescienceai/medmcqa: 341\n",
      "14\n",
      "  openlifescienceai/headqa: 21\n",
      "  openlifescienceai/medmcqa: 342\n",
      "41\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 204\n",
      "12\n",
      "  openlifescienceai/headqa: 5\n",
      "  openlifescienceai/medmcqa: 1\n",
      "60\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 10\n",
      "59\n",
      "  openlifescienceai/medmcqa: 28\n",
      "39\n",
      "  openlifescienceai/medmcqa: 151\n",
      "70\n",
      "  openlifescienceai/medmcqa: 31\n",
      "31\n",
      "  openlifescienceai/medmcqa: 5\n",
      "40\n",
      "  openlifescienceai/medmcqa: 7\n",
      "64\n",
      "  openlifescienceai/medmcqa: 10\n",
      "72\n",
      "  openlifescienceai/medmcqa: 7\n",
      "71\n",
      "  openlifescienceai/medmcqa: 1\n",
      "74\n",
      "  openlifescienceai/medmcqa: 2\n"
     ]
    }
   ],
   "source": [
    "def get_num_samples_from_domain(domain_source_samples, verbose=True):\n",
    "    \"\"\"Count the samples held in a nested domain -> source -> samples mapping.\n",
    "\n",
    "    Args:\n",
    "        domain_source_samples: Mapping of domain code to a mapping of source\n",
    "            name to the list of samples for that (domain, source) pair.\n",
    "        verbose: If True (default), print each domain code followed by an\n",
    "            indented ``source_name: count`` line for each of its sources.\n",
    "\n",
    "    Returns:\n",
    "        Total number of samples across all domains and sources.\n",
    "    \"\"\"\n",
    "    num_samples = 0\n",
    "    for domain_code, domain in domain_source_samples.items():\n",
    "        if verbose:\n",
    "            print(domain_code)\n",
    "        for source_name, source_samples in domain.items():\n",
    "            if verbose:\n",
    "                print(f\"  {source_name}: {len(source_samples)}\")\n",
    "            num_samples += len(source_samples)\n",
    "    return num_samples\n",
    "\n",
    "# Sanity check: the grouped structure must account for every dataset row\n",
    "expected_num_samples = len(dataset)\n",
    "num_samples = get_num_samples_from_domain(domain_source_samples)\n",
    "if num_samples != expected_num_samples:\n",
    "    raise ValueError(f\"Number of samples do not match, {num_samples} != {expected_num_samples}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Sampling:  75%|███████▌  | 753/1000 [00:00<00:00, 3403.71it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Removing source openlifescienceai/headqa from domain 38\n",
      "Removing source openlifescienceai/medmcqa from domain 71\n",
      "Removing domain 71\n",
      "Removing source openlifescienceai/headqa from domain 26\n",
      "Removing source openlifescienceai/headqa from domain 60\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 43\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 09\n",
      "Removing source openlifescienceai/headqa from domain 69\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 00\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 20\n",
      "Removing source openlifescienceai/headqa from domain 41\n",
      "Removing source openlifescienceai/headqa from domain 18\n",
      "Removing source openlifescienceai/headqa from domain 52\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 19\n",
      "Removing source openlifescienceai/headqa from domain 25\n",
      "Removing source openlifescienceai/headqa from domain 03\n",
      "Removing source openlifescienceai/headqa from domain 21\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 03\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 58\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 04\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 43\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 47\n",
      "Removing source openlifescienceai/headqa from domain 56\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 53\n",
      "Removing source openlifescienceai/headqa from domain 07\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 25\n",
      "Removing source openlifescienceai/headqa from domain 48\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 26\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 02\n",
      "Removing source openlifescienceai/headqa from domain 66\n",
      "Removing source openlifescienceai/headqa from domain 35\n",
      "Removing source openlifescienceai/headqa from domain 06\n",
      "Removing source openlifescienceai/headqa from domain 50\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 18\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 65\n",
      "Removing source openlifescienceai/medmcqa from domain 74\n",
      "Removing domain 74\n",
      "Removing source openlifescienceai/headqa from domain 02\n",
      "Removing source openlifescienceai/headqa from domain 27\n",
      "Removing source openlifescienceai/headqa from domain 63\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 22\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 67\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 61\n",
      "Removing source openlifescienceai/medmcqa from domain 46\n",
      "Removing source openlifescienceai/headqa from domain 65\n",
      "Removing source openlifescienceai/headqa from domain 16\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 68\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 69\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 62\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 54\n",
      "Removing source openlifescienceai/headqa from domain 37\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 25\n",
      "Removing source openlifescienceai/medmcqa from domain 31\n",
      "Removing domain 31\n",
      "Removing source openlifescienceai/headqa from domain 43\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 63\n",
      "Removing source openlifescienceai/headqa from domain 15\n",
      "Removing source openlifescienceai/headqa from domain 54\n",
      "Removing source openlifescienceai/headqa from domain 46\n",
      "Removing domain 46\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 21\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 57\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 32\n",
      "Removing source openlifescienceai/medmcqa from domain 40\n",
      "Removing domain 40\n",
      "Removing source openlifescienceai/headqa from domain 22\n",
      "Removing source openlifescienceai/headqa from domain 32\n",
      "Removing source openlifescienceai/headqa from domain 12\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 16\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 05\n",
      "Removing source openlifescienceai/headqa from domain 01\n",
      "Removing source openlifescienceai/headqa from domain 47\n",
      "Removing source openlifescienceai/headqa from domain 51\n",
      "Removing source openlifescienceai/medmcqa from domain 72\n",
      "Removing domain 72\n",
      "Removing source openlifescienceai/headqa from domain 13\n",
      "Removing source openlifescienceai/headqa from domain 30\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 45\n",
      "Removing source openlifescienceai/headqa from domain 36\n",
      "Removing source openlifescienceai/headqa from domain 19\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 05\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 63\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 65\n",
      "Removing source openlifescienceai/headqa from domain 29\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Sampling:  80%|███████▉  | 796/1000 [00:00<00:00, 4050.77it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Removing source openlifescienceai/medmcqa from domain 64\n",
      "Removing domain 64\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 48\n",
      "Removing source openlifescienceai/medmcqa from domain 60\n",
      "Removing domain 60\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Sampling:  96%|█████████▌| 962/1000 [00:00<00:00, 4050.77it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Removing source openlifescienceai/headqa from domain 08\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 66\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 37\n",
      "Removing source GBaker/MedQA-USMLE-4-options from domain 08\n",
      "Removing source openlifescienceai/medmcqa from domain 12\n",
      "Removing domain 12\n",
      "Removing source openlifescienceai/medmcqa from domain 04\n",
      "Removing domain 04\n",
      "Removing source openlifescienceai/headqa from domain 75\n",
      "Removing source qiaojin/PubMedQA:pqa_labeled from domain 34\n"
     ]
    }
   ],
   "source": [
    "seed = 42\n",
    "# Seed the generator from `seed` so that changing the constant changes the draw\n",
    "rng = np.random.default_rng(seed)\n",
    "\n",
    "target_num_samples = 1000\n",
    "selected_samples = []\n",
    "\n",
    "# Two-level uniform sampling without replacement: pick a domain uniformly,\n",
    "# then one of its sources uniformly, then one sample from that source.\n",
    "# NOTE: this mutates `domain_source_samples` in place (drawn samples and any\n",
    "# emptied sources/domains are removed).\n",
    "with tqdm.tqdm(total=target_num_samples, desc=\"Sampling\") as pbar:\n",
    "    # Also stop once every domain is exhausted; rng.choice on an empty list raises\n",
    "    while len(selected_samples) < target_num_samples and domain_source_samples:\n",
    "        # sample a domain\n",
    "        domain_code = rng.choice(list(domain_source_samples.keys()))\n",
    "\n",
    "        chosen_domain = domain_source_samples[domain_code]\n",
    "        # remove empty domain\n",
    "        if len(chosen_domain) == 0:\n",
    "            pbar.write(f\"Before sampling sources, removing domain {domain_code}\")\n",
    "            domain_source_samples.pop(domain_code)\n",
    "            continue\n",
    "        # sample a source\n",
    "        source_name = rng.choice(list(chosen_domain.keys()))\n",
    "        chosen_domain_source = chosen_domain[source_name]\n",
    "\n",
    "        if not isinstance(chosen_domain_source, list):\n",
    "            raise ValueError(f\"Expected list, got {type(chosen_domain_source)}\")\n",
    "\n",
    "        # remove empty domain source\n",
    "        if len(chosen_domain_source) == 0:\n",
    "            pbar.write(f\"Before sampling samples, removing domain {domain_code} source {source_name}\")\n",
    "            chosen_domain.pop(source_name)\n",
    "            continue\n",
    "\n",
    "        # choose a sample, and remove the sample from the chosen_domain_source\n",
    "        before_rm_length = len(chosen_domain_source)\n",
    "        sample = rng.choice(chosen_domain_source)\n",
    "        chosen_domain_source.remove(sample)\n",
    "        after_rm_length = len(chosen_domain_source)\n",
    "        if before_rm_length != after_rm_length + 1:\n",
    "            raise ValueError(f\"Expected {before_rm_length} == {after_rm_length} + 1\")\n",
    "\n",
    "        # double check remove empty domain source\n",
    "        if len(chosen_domain_source) == 0:\n",
    "            pbar.write(f\"Removing source {source_name} from domain {domain_code}\")\n",
    "            chosen_domain.pop(source_name)\n",
    "\n",
    "        # double check remove empty domain\n",
    "        if len(chosen_domain) == 0:\n",
    "            pbar.write(f\"Removing domain {domain_code}\")\n",
    "            domain_source_samples.pop(domain_code)\n",
    "\n",
    "        selected_samples.append(sample)\n",
    "        pbar.update(1)\n",
    "\n",
    "# Fail loudly if the pool ran dry before the target was reached\n",
    "if len(selected_samples) < target_num_samples:\n",
    "    raise ValueError(\n",
    "        f\"Only {len(selected_samples)} samples available, fewer than the requested {target_num_samples}\"\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total samples: 23493\n",
      "Selected 1000 samples\n",
      "27\n",
      "  GBaker/MedQA-USMLE-4-options: 5\n",
      "  openlifescienceai/medmcqa: 239\n",
      "54\n",
      "  GBaker/MedQA-USMLE-4-options: 87\n",
      "  openlifescienceai/medmcqa: 1087\n",
      "20\n",
      "  GBaker/MedQA-USMLE-4-options: 227\n",
      "  openlifescienceai/headqa: 14\n",
      "  openlifescienceai/medmcqa: 1677\n",
      "50\n",
      "  GBaker/MedQA-USMLE-4-options: 57\n",
      "  openlifescienceai/medmcqa: 1525\n",
      "32\n",
      "  GBaker/MedQA-USMLE-4-options: 129\n",
      "  openlifescienceai/medmcqa: 654\n",
      "65\n",
      "  openlifescienceai/medmcqa: 742\n",
      "53\n",
      "  GBaker/MedQA-USMLE-4-options: 40\n",
      "  openlifescienceai/headqa: 12\n",
      "  openlifescienceai/medmcqa: 1155\n",
      "38\n",
      "  GBaker/MedQA-USMLE-4-options: 7\n",
      "  openlifescienceai/medmcqa: 148\n",
      "42\n",
      "  GBaker/MedQA-USMLE-4-options: 14\n",
      "  openlifescienceai/headqa: 1\n",
      "  openlifescienceai/medmcqa: 153\n",
      "02\n",
      "  GBaker/MedQA-USMLE-4-options: 107\n",
      "  openlifescienceai/medmcqa: 929\n",
      "01\n",
      "  GBaker/MedQA-USMLE-4-options: 3\n",
      "  openlifescienceai/medmcqa: 581\n",
      "08\n",
      "  openlifescienceai/medmcqa: 67\n",
      "19\n",
      "  GBaker/MedQA-USMLE-4-options: 52\n",
      "  openlifescienceai/medmcqa: 242\n",
      "44\n",
      "  GBaker/MedQA-USMLE-4-options: 22\n",
      "  openlifescienceai/medmcqa: 350\n",
      "49\n",
      "  GBaker/MedQA-USMLE-4-options: 5\n",
      "  openlifescienceai/medmcqa: 67\n",
      "24\n",
      "  GBaker/MedQA-USMLE-4-options: 163\n",
      "  openlifescienceai/headqa: 2\n",
      "  openlifescienceai/medmcqa: 504\n",
      "45\n",
      "  openlifescienceai/medmcqa: 25\n",
      "37\n",
      "  openlifescienceai/medmcqa: 420\n",
      "16\n",
      "  GBaker/MedQA-USMLE-4-options: 67\n",
      "  openlifescienceai/medmcqa: 622\n",
      "17\n",
      "  GBaker/MedQA-USMLE-4-options: 52\n",
      "  openlifescienceai/medmcqa: 378\n",
      "33\n",
      "  GBaker/MedQA-USMLE-4-options: 62\n",
      "  openlifescienceai/headqa: 3\n",
      "  openlifescienceai/medmcqa: 290\n",
      "06\n",
      "  GBaker/MedQA-USMLE-4-options: 7\n",
      "  openlifescienceai/medmcqa: 2317\n",
      "00\n",
      "  GBaker/MedQA-USMLE-4-options: 14\n",
      "  openlifescienceai/medmcqa: 636\n",
      "23\n",
      "  GBaker/MedQA-USMLE-4-options: 1\n",
      "  openlifescienceai/medmcqa: 98\n",
      "07\n",
      "  GBaker/MedQA-USMLE-4-options: 7\n",
      "  openlifescienceai/medmcqa: 107\n",
      "55\n",
      "  GBaker/MedQA-USMLE-4-options: 12\n",
      "  openlifescienceai/medmcqa: 164\n",
      "30\n",
      "  GBaker/MedQA-USMLE-4-options: 3\n",
      "  openlifescienceai/medmcqa: 18\n",
      "09\n",
      "  GBaker/MedQA-USMLE-4-options: 38\n",
      "  openlifescienceai/medmcqa: 348\n",
      "36\n",
      "  GBaker/MedQA-USMLE-4-options: 7\n",
      "  openlifescienceai/medmcqa: 192\n",
      "34\n",
      "  GBaker/MedQA-USMLE-4-options: 8\n",
      "  openlifescienceai/headqa: 2\n",
      "  openlifescienceai/medmcqa: 240\n",
      "52\n",
      "  GBaker/MedQA-USMLE-4-options: 26\n",
      "  openlifescienceai/medmcqa: 309\n",
      "56\n",
      "  GBaker/MedQA-USMLE-4-options: 11\n",
      "  openlifescienceai/medmcqa: 121\n",
      "18\n",
      "  openlifescienceai/medmcqa: 72\n",
      "67\n",
      "  GBaker/MedQA-USMLE-4-options: 40\n",
      "  openlifescienceai/headqa: 4\n",
      "  openlifescienceai/medmcqa: 370\n",
      "29\n",
      "  GBaker/MedQA-USMLE-4-options: 4\n",
      "  openlifescienceai/medmcqa: 261\n",
      "66\n",
      "  openlifescienceai/medmcqa: 224\n",
      "63\n",
      "  openlifescienceai/medmcqa: 49\n",
      "21\n",
      "  GBaker/MedQA-USMLE-4-options: 18\n",
      "  openlifescienceai/medmcqa: 709\n",
      "48\n",
      "  openlifescienceai/medmcqa: 200\n",
      "13\n",
      "  GBaker/MedQA-USMLE-4-options: 11\n",
      "  openlifescienceai/medmcqa: 74\n",
      "75\n",
      "  GBaker/MedQA-USMLE-4-options: 6\n",
      "  openlifescienceai/medmcqa: 86\n",
      "10\n",
      "  GBaker/MedQA-USMLE-4-options: 13\n",
      "  openlifescienceai/medmcqa: 220\n",
      "57\n",
      "  GBaker/MedQA-USMLE-4-options: 20\n",
      "  openlifescienceai/headqa: 20\n",
      "  openlifescienceai/medmcqa: 161\n",
      "51\n",
      "  GBaker/MedQA-USMLE-4-options: 1\n",
      "  openlifescienceai/medmcqa: 98\n",
      "73\n",
      "  GBaker/MedQA-USMLE-4-options: 3\n",
      "  openlifescienceai/medmcqa: 71\n",
      "11\n",
      "  GBaker/MedQA-USMLE-4-options: 3\n",
      "  openlifescienceai/medmcqa: 24\n",
      "61\n",
      "  openlifescienceai/medmcqa: 38\n",
      "43\n",
      "  openlifescienceai/medmcqa: 251\n",
      "68\n",
      "  openlifescienceai/medmcqa: 22\n",
      "28\n",
      "  GBaker/MedQA-USMLE-4-options: 1\n",
      "  openlifescienceai/headqa: 2\n",
      "  openlifescienceai/medmcqa: 93\n",
      "05\n",
      "  openlifescienceai/headqa: 12\n",
      "  openlifescienceai/medmcqa: 216\n",
      "69\n",
      "  openlifescienceai/medmcqa: 71\n",
      "03\n",
      "  openlifescienceai/medmcqa: 7\n",
      "22\n",
      "  GBaker/MedQA-USMLE-4-options: 1\n",
      "  openlifescienceai/medmcqa: 38\n",
      "62\n",
      "  openlifescienceai/medmcqa: 57\n",
      "58\n",
      "  openlifescienceai/medmcqa: 33\n",
      "25\n",
      "  openlifescienceai/medmcqa: 6\n",
      "26\n",
      "  openlifescienceai/medmcqa: 8\n",
      "47\n",
      "  openlifescienceai/medmcqa: 101\n",
      "35\n",
      "  openlifescienceai/medmcqa: 67\n",
      "15\n",
      "  openlifescienceai/medmcqa: 333\n",
      "14\n",
      "  openlifescienceai/headqa: 14\n",
      "  openlifescienceai/medmcqa: 334\n",
      "41\n",
      "  openlifescienceai/medmcqa: 189\n",
      "59\n",
      "  openlifescienceai/medmcqa: 11\n",
      "39\n",
      "  openlifescienceai/medmcqa: 134\n",
      "70\n",
      "  openlifescienceai/medmcqa: 20\n",
      "Remaining samples: 22493\n"
     ]
    }
   ],
   "source": [
    "# Report the totals; the helper prints its per-domain breakdown before the\n",
    "# final \"Remaining samples\" line, so it is called between the two prints.\n",
    "total_count = len(dataset)\n",
    "selected_count = len(selected_samples)\n",
    "print(f\"Total samples: {total_count}\")\n",
    "print(f\"Selected {selected_count} samples\")\n",
    "remaining_count = get_num_samples_from_domain(domain_source_samples)\n",
    "print(f\"Remaining samples: {remaining_count}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a `datasets.Dataset` from the list of sampled records\n",
    "selected_dataset = datasets.Dataset.from_list(selected_samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 19.23ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.18it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval-domain_source_1k-150325/commit/e84da43ae3c0a7d490c60c726645d19976858a12', commit_message='Upload dataset', commit_description='', oid='e84da43ae3c0a7d490c60c726645d19976858a12', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval-domain_source_1k-150325', endpoint='https://huggingface.co', repo_type='dataset', repo_id='anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval-domain_source_1k-150325'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Target repo id: the source `repo_id` with a \"-domain_source_1k\" suffix.\n",
    "# push_to_hub uploads the sampled dataset to that repo (network side effect).\n",
    "push_repo_id = f\"{repo_id}-domain_source_1k\"\n",
    "selected_dataset.push_to_hub(push_repo_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval-domain_source_1k-150325'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "push_repo_id"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "anonymous-med_sipf",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
