{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "import re, numpy as np, pandas as pd\n",
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "REPO_ROOT = Path.cwd().parent   \n",
    "RAW_XLS   = REPO_ROOT / \"data\" / \"raw\" / \"web_fdaaa_clean.xlsx\"\n",
    "OUT_DIR  = REPO_ROOT / \"outputs\"\n",
    "OUT_CSV  = OUT_DIR / \"label_features.csv\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "########################################################################\n",
    "# 0.  LOAD\n",
    "########################################################################\n",
    "raw = pd.read_excel(RAW_XLS, dtype=str).rename(\n",
    "        columns=lambda c: c.strip())        # strip accidental spaces\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['FDA Application Number(s)', 'Pediatric Labeling Approval Date',\n",
       "       'Trade Name', 'Generic Name', 'Type of Legislation', 'Indication',\n",
       "       'Indication(s) Studied', 'Labeling Change Summary',\n",
       "       'Therapeutic Category', 'Dosage Form(s)', 'Route(s) of Administration',\n",
       "       'Pharmacological Class', 'Studied in Neonates', 'Indicated in Neonates',\n",
       "       'Product Labeling Link', 'Study Number', 'Ages Studied', 'Study Type',\n",
       "       'Study Design', 'Patients Enrolled', 'Patients Analyzed',\n",
       "       'Number of Centers', 'Number of Countries',\n",
       "       'Total # of Hispanic/Latino', 'Total # of Non-Hispanic/Non-Latino',\n",
       "       'Total #  of Unknown Ethnicity', 'Total #  of Asian',\n",
       "       'Total #  of Black', 'Total #  of White',\n",
       "       'Total #  of Native Hawaiian or Pacific Islander',\n",
       "       'Total #  of American Indian/Alaska Native', 'Total #  of Other Race',\n",
       "       'Total #  of Unknown Race', 'Country Names', 'Notes', 'kind', 'root',\n",
       "       'supplement', 'canon_id'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "########################################################################\n",
    "# 1.  -------- total_studies ------------------------------------------\n",
    "########################################################################\n",
    "def max_study_num(cell) -> int:\n",
    "    if pd.isna(cell):                   # NaN, None, etc.\n",
    "        return 0\n",
    "    txt  = \" \".join(cell) if isinstance(cell, list) else str(cell)\n",
    "    nums = re.findall(r\"\\d+(?:\\.\\d+)?\", txt)\n",
    "    return int(max(map(float, nums))) if nums else 0\n",
    "\n",
    "raw[\"total_studies\"] = raw[\"Study Number\"].apply(max_study_num)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3758391/1318999687.py:42: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
      "  by_canon[col] = by_canon[\"Study Design\"].str.contains(pat, flags=re.I, regex=True).astype(int)\n",
      "/tmp/ipykernel_3758391/1318999687.py:42: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
      "  by_canon[col] = by_canon[\"Study Design\"].str.contains(pat, flags=re.I, regex=True).astype(int)\n"
     ]
    }
   ],
   "source": [
    "########################################################################\n",
    "# 2.  -------- Study-Type / Study-Design one-hots ----------------------\n",
    "########################################################################\n",
    "def join(series):                         # helper for groupby\n",
    "    return \"\\n\".join(str(v) for v in series if pd.notna(v) and str(v).strip())\n",
    "\n",
    "by_canon = (raw\n",
    "    .groupby(\"canon_id\", as_index=False)\n",
    "    .agg({\n",
    "        \"total_studies\":        \"max\",\n",
    "        \"Study Type\":           join,\n",
    "        \"Study Design\":         join,\n",
    "        \"Labeling Change Summary\": \"first\",\n",
    "        \"Indication\":           \"first\",\n",
    "    })\n",
    ")\n",
    "\n",
    "type_buckets = {\n",
    "    \"Efficacy\":        r\"\\befficacy\\b\",\n",
    "    \"Safety\":          r\"\\bsafety\\b\",\n",
    "    \"Pharmacokinetic\": r\"\\bpk\\b|\\bpharmacokinetic\",\n",
    "    \"Pharmacodynamic\": r\"\\bpd\\b|\\bpharmacodynamic\",\n",
    "    \"Tolerability\":    r\"\\btolerab\",\n",
    "}\n",
    "\n",
    "for col, pat in type_buckets.items():\n",
    "    by_canon[col] = by_canon[\"Study Type\"].str.contains(pat, flags=re.I, regex=True).astype(int)\n",
    "\n",
    "by_canon[\"Other_Type\"] = (by_canon[list(type_buckets)] .sum(axis=1) == 0).astype(int)\n",
    "\n",
    "design_buckets = {\n",
    "    \"Randomized_DoubleBlind\": r\"randomized.*double[- ]blind|double[- ]blind.*randomized\",\n",
    "    \"Randomized_SingleBlind\": r\"randomized.*single[- ]blind|single[- ]blind.*randomized\",\n",
    "    \"Open_Label\":             r\"open[- ]label\",\n",
    "    \"Placebo_Control\":        r\"placebo(?!,? *run[- ]in)\",\n",
    "    \"Active_Comparator\":      r\"active +(control|comparator)\",\n",
    "    \"Dose_Escalation\":        r\"dose +(escalation|ranging|finding)\",\n",
    "    \"Population_PK\":          r\"population +pk|pop +pk|traditional +pk|trad +pk\",\n",
    "}\n",
    "\n",
    "for col, pat in design_buckets.items():\n",
    "    by_canon[col] = by_canon[\"Study Design\"].str.contains(pat, flags=re.I, regex=True).astype(int)\n",
    "\n",
    "by_canon[\"Other_Design\"] = (by_canon[list(design_buckets)].sum(axis=1) == 0).astype(int)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "########################################################################\n",
    "# 3.  -------- Ages: lowest / highest per canon -----------------------\n",
    "########################################################################\n",
    "_unit = {\"YEAR\":1, \"YEARS\":1, \"MONTH\":1/12, \"MONTHS\":1/12, \"DAY\":1/365,\n",
    "         \"DAYS\":1/365, \"HOUR\":1/8760, \"HOURS\":1/8760}\n",
    "rng  = re.compile(r\"(\\d+(?:\\.\\d+)?)\\s*([A-Z]+)\\s*(?:-|TO)\\s*(\\d+(?:\\.\\d+)?)\\s*([A-Z]+)\")\n",
    "older= re.compile(r\"(\\d+(?:\\.\\d+)?)\\s*([A-Z]+).*AND\\s+OLDER\")\n",
    "\n",
    "def ages_to_years(cell:str)->list[float]:\n",
    "    if not isinstance(cell,str): return []\n",
    "    cell = cell.upper().replace(\"–\",\"-\")\n",
    "    out=[]\n",
    "    for n1,u1,n2,u2 in rng.findall(cell):\n",
    "        out += [float(n1)*_unit.get(u1,1), float(n2)*_unit.get(u2,1)]\n",
    "    for n,u in older.findall(cell):\n",
    "        out.append(float(n)*_unit.get(u,1))\n",
    "    return out\n",
    "\n",
    "raw[\"__ages\"] = raw[\"Ages Studied\"].apply(ages_to_years)\n",
    "\n",
    "age_agg = (raw.groupby(\"canon_id\")[\"__ages\"]\n",
    "              .sum()                            # flatten lists\n",
    "              .apply(lambda lst: pd.Series({\n",
    "                  \"age_min\": min(lst) if lst else np.nan,\n",
    "                  \"age_max\": max(lst) if lst else np.nan}))\n",
    "              .reset_index())\n",
    "\n",
    "by_canon = by_canon.merge(age_agg, on=\"canon_id\", how=\"left\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "########################################################################\n",
    "# 4.  -------- Boolean neonate flags -----------------------------------\n",
    "########################################################################\n",
    "for fld in [\"Studied in Neonates\", \"Indicated in Neonates\"]:\n",
    "    by_canon[fld] = (raw\n",
    "        .groupby(\"canon_id\")[fld]\n",
    "        .first()\n",
    "        .fillna(\"\").str.strip().str.upper().eq(\"X\")\n",
    "        .astype(int)\n",
    "        .values)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# ------------------------------------------------------------------\n",
    "# 0.  helper to convert messy cell → list[int]\n",
    "# ------------------------------------------------------------------\n",
    "def to_int_list(cell):\n",
    "    \"\"\"\n",
    "    \"90\\n219\"  -> [90, 219]\n",
    "    \"NNPS\"     -> []\n",
    "    \"\" / NaN   -> []\n",
    "    686        -> [686]\n",
    "    \"\"\"\n",
    "    if cell is None or (isinstance(cell, float) and np.isnan(cell)):\n",
    "        return []\n",
    "    txt = str(cell).strip().lower()\n",
    "    if txt in {\"nnps\", \"n/a\", \"\"}:\n",
    "        return []\n",
    "    return list(map(int, re.findall(r\"\\d+\", txt)))\n",
    "\n",
    "# ------------------------------------------------------------------\n",
    "# 1.  turn *every* numeric column into lists of ints\n",
    "# ------------------------------------------------------------------\n",
    "numeric_cols = [\n",
    "    \"Patients Enrolled\", \"Patients Analyzed\",\n",
    "    \"Number of Centers\", \"Number of Countries\",\n",
    "    \"Total # of Hispanic/Latino\", \"Total # of Non-Hispanic/Non-Latino\",\n",
    "    \"Total #  of Unknown Ethnicity\", \"Total #  of Asian\",\n",
    "    \"Total #  of Black\", \"Total #  of White\",\n",
    "    \"Total #  of Native Hawaiian or Pacific Islander\",\n",
    "    \"Total #  of American Indian/Alaska Native\",\n",
    "    \"Total #  of Other Race\", \"Total #  of Unknown Race\",\n",
    "]\n",
    "\n",
    "for c in numeric_cols:\n",
    "    raw[c] = raw[c].apply(to_int_list)\n",
    "\n",
    "# ------------------------------------------------------------------\n",
    "# 2.  long-format all numeric columns separately, then stack\n",
    "# ------------------------------------------------------------------\n",
    "long_numeric = []\n",
    "\n",
    "for col in numeric_cols:\n",
    "    tmp = (raw[[\"canon_id\", col]]\n",
    "           .explode(col)                      # explode *one* column\n",
    "           .assign(metric=col)               # keep the column name\n",
    "           .rename(columns={col: \"value\"}))  # values → \"value\"\n",
    "    long_numeric.append(tmp)\n",
    "\n",
    "long_numeric = pd.concat(long_numeric, ignore_index=True)\n",
    "long_numeric[\"value\"] = pd.to_numeric(long_numeric[\"value\"], errors=\"coerce\")\n",
    "\n",
    "# ------------------------------------------------------------------\n",
    "# 3.  aggregate per canon_id / metric\n",
    "# ------------------------------------------------------------------\n",
    "# default: sum everything\n",
    "aggfunc = \"sum\"\n",
    "\n",
    "pivot = (long_numeric\n",
    "         .dropna(subset=[\"value\"])\n",
    "         .pivot_table(index=\"canon_id\",\n",
    "                      columns=\"metric\",\n",
    "                      values=\"value\",\n",
    "                      aggfunc=aggfunc))\n",
    "\n",
    "# fix the two “max” metrics\n",
    "for max_col in [\"Number of Centers\", \"Number of Countries\"]:\n",
    "    pivot[max_col] = (long_numeric\n",
    "                      .query(\"metric == @max_col\")\n",
    "                      .groupby(\"canon_id\")[\"value\"]\n",
    "                      .max())\n",
    "\n",
    "num_clean = (pivot\n",
    "             .fillna(0)          # empty → 0\n",
    "             .astype(int)\n",
    "             .reset_index())\n",
    "\n",
    "# ------------------------------------------------------------------\n",
    "# 4.  merge back into your feature table\n",
    "# ------------------------------------------------------------------\n",
    "by_canon = by_canon.merge(num_clean, on=\"canon_id\", how=\"left\")\n",
    "by_canon[numeric_cols] = by_canon[numeric_cols].fillna(0).astype(int)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['canon_id', 'total_studies', 'Study Type', 'Study Design',\n",
       "       'Labeling Change Summary', 'Indication', 'Efficacy', 'Safety',\n",
       "       'Pharmacokinetic', 'Pharmacodynamic', 'Tolerability', 'Other_Type',\n",
       "       'Randomized_DoubleBlind', 'Randomized_SingleBlind', 'Open_Label',\n",
       "       'Placebo_Control', 'Active_Comparator', 'Dose_Escalation',\n",
       "       'Population_PK', 'Other_Design', 'age_min', 'age_max',\n",
       "       'Studied in Neonates', 'Indicated in Neonates', 'Number of Centers',\n",
       "       'Number of Countries', 'Patients Analyzed', 'Patients Enrolled',\n",
       "       'Total #  of American Indian/Alaska Native', 'Total #  of Asian',\n",
       "       'Total #  of Black', 'Total #  of Native Hawaiian or Pacific Islander',\n",
       "       'Total #  of Other Race', 'Total #  of Unknown Ethnicity',\n",
       "       'Total #  of Unknown Race', 'Total #  of White',\n",
       "       'Total # of Hispanic/Latino', 'Total # of Non-Hispanic/Non-Latino'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "by_canon.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "#######################################################################\n",
    "# 6.  -------- metadata carry-over -----------------------------\n",
    "########################################################################\n",
    "meta_cols = [\n",
    "    'canon_id','Pediatric Labeling Approval Date','Trade Name','Generic Name',\n",
    "    'Type of Legislation','Indication','Indication(s) Studied', \n",
    "    'Therapeutic Category','Dosage Form(s)','Route(s) of Administration',\n",
    "    'Pharmacological Class','Product Labeling Link', 'kind','root','supplement'\n",
    "]\n",
    "\n",
    "\n",
    "meta_one = (raw[meta_cols]\n",
    "            .replace({'':np.nan})\n",
    "            .groupby('canon_id',as_index=False).first())\n",
    "\n",
    "by_canon = by_canon.merge(meta_one,on='canon_id',how='left')\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>canon_id</th>\n",
       "      <th>total_studies</th>\n",
       "      <th>Study Type</th>\n",
       "      <th>Study Design</th>\n",
       "      <th>Labeling Change Summary</th>\n",
       "      <th>Indication_x</th>\n",
       "      <th>Efficacy</th>\n",
       "      <th>Safety</th>\n",
       "      <th>Pharmacokinetic</th>\n",
       "      <th>Pharmacodynamic</th>\n",
       "      <th>...</th>\n",
       "      <th>Indication_y</th>\n",
       "      <th>Indication(s) Studied</th>\n",
       "      <th>Therapeutic Category</th>\n",
       "      <th>Dosage Form(s)</th>\n",
       "      <th>Route(s) of Administration</th>\n",
       "      <th>Pharmacological Class</th>\n",
       "      <th>Product Labeling Link</th>\n",
       "      <th>kind</th>\n",
       "      <th>root</th>\n",
       "      <th>supplement</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ANDA_071961_ORIG1</td>\n",
       "      <td>2</td>\n",
       "      <td>Efficacy,Safety,Pharmacokinetic_x000D_\\nEffica...</td>\n",
       "      <td>Double-Blind,Parallel Group,Dose Ranging_x000D...</td>\n",
       "      <td>-\\tEfficacy in the pediatric population was es...</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>Immediate reduction of blood pressure in hyper...</td>\n",
       "      <td>Cardiac Drugs</td>\n",
       "      <td>INJECTABLE</td>\n",
       "      <td>INTRAVENOUS</td>\n",
       "      <td>None</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>ANDA</td>\n",
       "      <td>071961</td>\n",
       "      <td>ORIG1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ANDA_072370</td>\n",
       "      <td>0</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>- Population PK analysis in 87 pediatric patie...</td>\n",
       "      <td>Partial-onset seizures</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>Partial-onset seizures</td>\n",
       "      <td>Status epilepticus in pediatric patients 3 mon...</td>\n",
       "      <td>Seizures</td>\n",
       "      <td>INJECTABLE</td>\n",
       "      <td>INTRAVENOUS</td>\n",
       "      <td>Benzodiazepine</td>\n",
       "      <td>https://cderoneanalytics.fda.gov/search360/?or...</td>\n",
       "      <td>ANDA</td>\n",
       "      <td>072370</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ANDA_214745</td>\n",
       "      <td>0</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>- Population PK analysis in 87 pediatric patie...</td>\n",
       "      <td>Partial-onset seizures</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>Partial-onset seizures</td>\n",
       "      <td>Status epilepticus in pediatric patients 3 mon...</td>\n",
       "      <td>Seizures</td>\n",
       "      <td>INJECTABLE</td>\n",
       "      <td>INTRAVENOUS</td>\n",
       "      <td>Benzodiazepine</td>\n",
       "      <td>https://www.fda.gov/media/167713/download?atta...</td>\n",
       "      <td>ANDA</td>\n",
       "      <td>214745</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ANDA_999907_ORIG1</td>\n",
       "      <td>1</td>\n",
       "      <td>Safety,Pharmacokinetic</td>\n",
       "      <td>Multicenter,Open-Label</td>\n",
       "      <td>-\\tLabeling revised to include neonatal dosing...</td>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>Neonatal dosing for meningitis and septicemia</td>\n",
       "      <td>Antibacterials</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>https://www.fda.gov/media/127633/download</td>\n",
       "      <td>ANDA</td>\n",
       "      <td>999907</td>\n",
       "      <td>ORIG1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>BLA_101069_5846</td>\n",
       "      <td>1</td>\n",
       "      <td>Safety,Immunogenicity</td>\n",
       "      <td>Phase 3,Non-Inferiority</td>\n",
       "      <td>See Package Insert for new information on biol...</td>\n",
       "      <td>Other</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>Other</td>\n",
       "      <td>Active immunization for the prevention of meas...</td>\n",
       "      <td>Preventive Vaccine</td>\n",
       "      <td>INJECTABLE_x000D_\\nINJECTABLE</td>\n",
       "      <td>SUBCUTANEOUS_x000D_\\nINTRAMUSCULAR</td>\n",
       "      <td>Vaccine</td>\n",
       "      <td>https://www.fda.gov/media/75191/download</td>\n",
       "      <td>BLA</td>\n",
       "      <td>101069</td>\n",
       "      <td>5846</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1053</th>\n",
       "      <td>NDA_999847_ORIG1</td>\n",
       "      <td>0</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>-\\tExpanded the indication from adults to pedi...</td>\n",
       "      <td>Other</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>Other</td>\n",
       "      <td>Control of serum phosphorus in children 6 year...</td>\n",
       "      <td>Chronic kidney disease on dialysis</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>http://www.accessdata.fda.gov/drugsatfda_docs/...</td>\n",
       "      <td>NDA</td>\n",
       "      <td>999847</td>\n",
       "      <td>ORIG1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1054</th>\n",
       "      <td>NDA_999901_ORIG1</td>\n",
       "      <td>2</td>\n",
       "      <td>Efficacy,Safety</td>\n",
       "      <td>Placebo,Single-Blind</td>\n",
       "      <td>-\\tSafety and effectiveness for migraine preve...</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>Migraine Prophylaxis</td>\n",
       "      <td>Migraine</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>http://www.accessdata.fda.gov/drugsatfda_docs/...</td>\n",
       "      <td>NDA</td>\n",
       "      <td>999901</td>\n",
       "      <td>ORIG1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1055</th>\n",
       "      <td>NDA_999902_ORIG1</td>\n",
       "      <td>2</td>\n",
       "      <td>Efficacy,Safety</td>\n",
       "      <td>Placebo,Single-Blind</td>\n",
       "      <td>-\\tSafety and effectiveness for migraine preve...</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>Migraine Prophylaxis</td>\n",
       "      <td>Migraine</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>http://www.accessdata.fda.gov/drugsatfda_docs/...</td>\n",
       "      <td>NDA</td>\n",
       "      <td>999902</td>\n",
       "      <td>ORIG1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1056</th>\n",
       "      <td>NDA_999903_0049</td>\n",
       "      <td>1</td>\n",
       "      <td>Efficacy,Safety</td>\n",
       "      <td>Open-Label</td>\n",
       "      <td>-\\tInformation on postmarketing clinical study...</td>\n",
       "      <td>Influenza</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>Influenza</td>\n",
       "      <td>Prophylaxis of influenza</td>\n",
       "      <td>Antivirals</td>\n",
       "      <td>CAPSULE_x000D_\\nFOR SUSPENSION</td>\n",
       "      <td>ORAL_x000D_\\nORAL</td>\n",
       "      <td>Neuraminidase Inhibitor</td>\n",
       "      <td>http://www.accessdata.fda.gov/drugsatfda_docs/...</td>\n",
       "      <td>NDA</td>\n",
       "      <td>999903</td>\n",
       "      <td>0049</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1057</th>\n",
       "      <td>NDA_999905_ORIG1</td>\n",
       "      <td>1</td>\n",
       "      <td>Efficacy,Safety,Pharmacokinetic,Tolerability</td>\n",
       "      <td>Placebo,Parallel Group,Single-Blind,Population...</td>\n",
       "      <td>-\\tExtended release tablets are indicated as a...</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>Adjunctive therapy for partial onset seizures ...</td>\n",
       "      <td>Seizures</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>http://www.accessdata.fda.gov/drugsatfda_docs/...</td>\n",
       "      <td>NDA</td>\n",
       "      <td>999905</td>\n",
       "      <td>ORIG1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1058 rows × 52 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               canon_id  total_studies  \\\n",
       "0     ANDA_071961_ORIG1              2   \n",
       "1           ANDA_072370              0   \n",
       "2           ANDA_214745              0   \n",
       "3     ANDA_999907_ORIG1              1   \n",
       "4       BLA_101069_5846              1   \n",
       "...                 ...            ...   \n",
       "1053   NDA_999847_ORIG1              0   \n",
       "1054   NDA_999901_ORIG1              2   \n",
       "1055   NDA_999902_ORIG1              2   \n",
       "1056    NDA_999903_0049              1   \n",
       "1057   NDA_999905_ORIG1              1   \n",
       "\n",
       "                                             Study Type  \\\n",
       "0     Efficacy,Safety,Pharmacokinetic_x000D_\\nEffica...   \n",
       "1                                                         \n",
       "2                                                         \n",
       "3                                Safety,Pharmacokinetic   \n",
       "4                                 Safety,Immunogenicity   \n",
       "...                                                 ...   \n",
       "1053                                                      \n",
       "1054                                    Efficacy,Safety   \n",
       "1055                                    Efficacy,Safety   \n",
       "1056                                    Efficacy,Safety   \n",
       "1057       Efficacy,Safety,Pharmacokinetic,Tolerability   \n",
       "\n",
       "                                           Study Design  \\\n",
       "0     Double-Blind,Parallel Group,Dose Ranging_x000D...   \n",
       "1                                                         \n",
       "2                                                         \n",
       "3                                Multicenter,Open-Label   \n",
       "4                               Phase 3,Non-Inferiority   \n",
       "...                                                 ...   \n",
       "1053                                                      \n",
       "1054                               Placebo,Single-Blind   \n",
       "1055                               Placebo,Single-Blind   \n",
       "1056                                         Open-Label   \n",
       "1057  Placebo,Parallel Group,Single-Blind,Population...   \n",
       "\n",
       "                                Labeling Change Summary  \\\n",
       "0     -\\tEfficacy in the pediatric population was es...   \n",
       "1     - Population PK analysis in 87 pediatric patie...   \n",
       "2     - Population PK analysis in 87 pediatric patie...   \n",
       "3     -\\tLabeling revised to include neonatal dosing...   \n",
       "4     See Package Insert for new information on biol...   \n",
       "...                                                 ...   \n",
       "1053  -\\tExpanded the indication from adults to pedi...   \n",
       "1054  -\\tSafety and effectiveness for migraine preve...   \n",
       "1055  -\\tSafety and effectiveness for migraine preve...   \n",
       "1056  -\\tInformation on postmarketing clinical study...   \n",
       "1057  -\\tExtended release tablets are indicated as a...   \n",
       "\n",
       "                Indication_x  Efficacy  Safety  Pharmacokinetic  \\\n",
       "0                       None         1       1                1   \n",
       "1     Partial-onset seizures         0       0                0   \n",
       "2     Partial-onset seizures         0       0                0   \n",
       "3                       None         0       1                1   \n",
       "4                      Other         0       1                0   \n",
       "...                      ...       ...     ...              ...   \n",
       "1053                   Other         0       0                0   \n",
       "1054                    None         1       1                0   \n",
       "1055                    None         1       1                0   \n",
       "1056               Influenza         1       1                0   \n",
       "1057                    None         1       1                1   \n",
       "\n",
       "      Pharmacodynamic  ...            Indication_y  \\\n",
       "0                   0  ...                    None   \n",
       "1                   0  ...  Partial-onset seizures   \n",
       "2                   0  ...  Partial-onset seizures   \n",
       "3                   0  ...                    None   \n",
       "4                   0  ...                   Other   \n",
       "...               ...  ...                     ...   \n",
       "1053                0  ...                   Other   \n",
       "1054                0  ...                    None   \n",
       "1055                0  ...                    None   \n",
       "1056                0  ...               Influenza   \n",
       "1057                0  ...                    None   \n",
       "\n",
       "                                  Indication(s) Studied  \\\n",
       "0     Immediate reduction of blood pressure in hyper...   \n",
       "1     Status epilepticus in pediatric patients 3 mon...   \n",
       "2     Status epilepticus in pediatric patients 3 mon...   \n",
       "3         Neonatal dosing for meningitis and septicemia   \n",
       "4     Active immunization for the prevention of meas...   \n",
       "...                                                 ...   \n",
       "1053  Control of serum phosphorus in children 6 year...   \n",
       "1054                               Migraine Prophylaxis   \n",
       "1055                               Migraine Prophylaxis   \n",
       "1056                           Prophylaxis of influenza   \n",
       "1057  Adjunctive therapy for partial onset seizures ...   \n",
       "\n",
       "                    Therapeutic Category                  Dosage Form(s)  \\\n",
       "0                          Cardiac Drugs                      INJECTABLE   \n",
       "1                               Seizures                      INJECTABLE   \n",
       "2                               Seizures                      INJECTABLE   \n",
       "3                         Antibacterials                            None   \n",
       "4                     Preventive Vaccine   INJECTABLE_x000D_\\nINJECTABLE   \n",
       "...                                  ...                             ...   \n",
       "1053  Chronic kidney disease on dialysis                            None   \n",
       "1054                            Migraine                            None   \n",
       "1055                            Migraine                            None   \n",
       "1056                          Antivirals  CAPSULE_x000D_\\nFOR SUSPENSION   \n",
       "1057                            Seizures                            None   \n",
       "\n",
       "              Route(s) of Administration    Pharmacological Class  \\\n",
       "0                            INTRAVENOUS                     None   \n",
       "1                            INTRAVENOUS           Benzodiazepine   \n",
       "2                            INTRAVENOUS           Benzodiazepine   \n",
       "3                                   None                     None   \n",
       "4     SUBCUTANEOUS_x000D_\\nINTRAMUSCULAR                  Vaccine   \n",
       "...                                  ...                      ...   \n",
       "1053                                None                     None   \n",
       "1054                                None                     None   \n",
       "1055                                None                     None   \n",
       "1056                   ORAL_x000D_\\nORAL  Neuraminidase Inhibitor   \n",
       "1057                                None                     None   \n",
       "\n",
       "                                  Product Labeling Link  kind    root  \\\n",
       "0     https://www.accessdata.fda.gov/drugsatfda_docs...  ANDA  071961   \n",
       "1     https://cderoneanalytics.fda.gov/search360/?or...  ANDA  072370   \n",
       "2     https://www.fda.gov/media/167713/download?atta...  ANDA  214745   \n",
       "3             https://www.fda.gov/media/127633/download  ANDA  999907   \n",
       "4              https://www.fda.gov/media/75191/download   BLA  101069   \n",
       "...                                                 ...   ...     ...   \n",
       "1053  http://www.accessdata.fda.gov/drugsatfda_docs/...   NDA  999847   \n",
       "1054  http://www.accessdata.fda.gov/drugsatfda_docs/...   NDA  999901   \n",
       "1055  http://www.accessdata.fda.gov/drugsatfda_docs/...   NDA  999902   \n",
       "1056  http://www.accessdata.fda.gov/drugsatfda_docs/...   NDA  999903   \n",
       "1057  http://www.accessdata.fda.gov/drugsatfda_docs/...   NDA  999905   \n",
       "\n",
       "      supplement  \n",
       "0          ORIG1  \n",
       "1           None  \n",
       "2           None  \n",
       "3          ORIG1  \n",
       "4           5846  \n",
       "...          ...  \n",
       "1053       ORIG1  \n",
       "1054       ORIG1  \n",
       "1055       ORIG1  \n",
       "1056        0049  \n",
       "1057       ORIG1  \n",
       "\n",
       "[1058 rows x 52 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "by_canon"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['canon_id', 'total_studies', 'Study Type', 'Study Design',\n",
       "       'Labeling Change Summary', 'Indication_x', 'Efficacy', 'Safety',\n",
       "       'Pharmacokinetic', 'Pharmacodynamic', 'Tolerability', 'Other_Type',\n",
       "       'Randomized_DoubleBlind', 'Randomized_SingleBlind', 'Open_Label',\n",
       "       'Placebo_Control', 'Active_Comparator', 'Dose_Escalation',\n",
       "       'Population_PK', 'Other_Design', 'age_min', 'age_max',\n",
       "       'Studied in Neonates', 'Indicated in Neonates', 'Number of Centers',\n",
       "       'Number of Countries', 'Patients Analyzed', 'Patients Enrolled',\n",
       "       'Total #  of American Indian/Alaska Native', 'Total #  of Asian',\n",
       "       'Total #  of Black', 'Total #  of Native Hawaiian or Pacific Islander',\n",
       "       'Total #  of Other Race', 'Total #  of Unknown Ethnicity',\n",
       "       'Total #  of Unknown Race', 'Total #  of White',\n",
       "       'Total # of Hispanic/Latino', 'Total # of Non-Hispanic/Non-Latino',\n",
       "       'Pediatric Labeling Approval Date', 'Trade Name', 'Generic Name',\n",
       "       'Type of Legislation', 'Indication_y', 'Indication(s) Studied',\n",
       "       'Therapeutic Category', 'Dosage Form(s)', 'Route(s) of Administration',\n",
       "       'Pharmacological Class', 'Product Labeling Link', 'kind', 'root',\n",
       "       'supplement'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "by_canon.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1058, 52)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "by_canon.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Merging with LLM- and manually annotated datasets for the final dataset release"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Locations: the original splits are read from data/processed/splits/{train,dev,test}.csv;\n",
    "# the feature-augmented copies are written under outputs/splits_with_feats/.\n",
    "DATA_DIR   = REPO_ROOT / \"data\"\n",
    "SPLIT_DIR  = DATA_DIR / \"processed\" / \"splits\"\n",
    "OUTPUT_DIR = REPO_ROOT / \"outputs\"\n",
    "AUG_DIR    = OUTPUT_DIR / \"splits_with_feats\"\n",
    "AUG_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# Load the original splits. Every column is read as text, and empty cells stay\n",
    "# empty strings (na_filter=False) instead of being converted to NaN.\n",
    "read_opts = {\"dtype\": str, \"na_filter\": False}\n",
    "df_train, df_dev, df_test = (\n",
    "    pd.read_csv(SPLIT_DIR / f\"{split_name}.csv\", **read_opts)\n",
    "    for split_name in (\"train\", \"dev\", \"test\")\n",
    ")\n",
    "\n",
    "# canon_id is the join key for the merge with by_canon: trim stray whitespace.\n",
    "for d in (df_train, df_dev, df_test):\n",
    "    d[\"canon_id\"] = d[\"canon_id\"].str.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ➋ augment with engineered features\n",
    "def _attach_features(split, split_name):\n",
    "    \"\"\"Inner-join one split with the per-canon_id feature table `by_canon`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    split : pd.DataFrame\n",
    "        One of the train/dev/test frames; must contain a `canon_id` column.\n",
    "    split_name : str\n",
    "        Label used only in the dropped-row message.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.DataFrame\n",
    "        The rows of `split` that have a match in `by_canon`, with the\n",
    "        `by_canon` columns appended.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    pandas.errors.MergeError\n",
    "        If `by_canon` contains a duplicated canon_id. Without this check the\n",
    "        merge would silently duplicate split rows.\n",
    "    \"\"\"\n",
    "    merged = split.merge(by_canon, on=\"canon_id\", how=\"inner\", validate=\"many_to_one\")\n",
    "    # With unique right-hand keys every split row matches at most once, so the\n",
    "    # length difference is exactly the number of rows the inner join discarded.\n",
    "    n_dropped = len(split) - len(merged)\n",
    "    if n_dropped:\n",
    "        print(f\"{split_name}: {n_dropped} row(s) without a by_canon match were dropped\")\n",
    "    return merged\n",
    "\n",
    "\n",
    "df_train = _attach_features(df_train, \"train\")\n",
    "df_dev   = _attach_features(df_dev,   \"dev\")\n",
    "df_test  = _attach_features(df_test,  \"test\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ➌ write each feature-augmented split into AUG_DIR as <split>_full.csv (no index column)\n",
    "for split_name, split_df in ((\"train\", df_train), (\"dev\", df_dev), (\"test\", df_test)):\n",
    "    split_df.to_csv(AUG_DIR / f\"{split_name}_full.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ➍ stack train, dev and test (in that order) into one frame with a fresh 0..n-1 index,\n",
    "#   intended for the combined NeurIPS camera-ready release; nothing is written to disk here\n",
    "combined = pd.concat((df_train, df_dev, df_test), axis=0, ignore_index=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['canon_id', 'resolved_label', 'peds_study_type', 'efficacy_summary',\n",
       "       'pk_summary', 'lowest_age_band', 'highest_age_band', 'rationale',\n",
       "       'confidence', 'summary_json', 'txt_file', 'manual_label_resolved',\n",
       "       'label', 'is_gold', 'total_studies', 'Study Type', 'Study Design',\n",
       "       'Labeling Change Summary', 'Indication_x', 'Efficacy', 'Safety',\n",
       "       'Pharmacokinetic', 'Pharmacodynamic', 'Tolerability', 'Other_Type',\n",
       "       'Randomized_DoubleBlind', 'Randomized_SingleBlind', 'Open_Label',\n",
       "       'Placebo_Control', 'Active_Comparator', 'Dose_Escalation',\n",
       "       'Population_PK', 'Other_Design', 'age_min', 'age_max',\n",
       "       'Studied in Neonates', 'Indicated in Neonates', 'Number of Centers',\n",
       "       'Number of Countries', 'Patients Analyzed', 'Patients Enrolled',\n",
       "       'Total #  of American Indian/Alaska Native', 'Total #  of Asian',\n",
       "       'Total #  of Black', 'Total #  of Native Hawaiian or Pacific Islander',\n",
       "       'Total #  of Other Race', 'Total #  of Unknown Ethnicity',\n",
       "       'Total #  of Unknown Race', 'Total #  of White',\n",
       "       'Total # of Hispanic/Latino', 'Total # of Non-Hispanic/Non-Latino',\n",
       "       'Pediatric Labeling Approval Date', 'Trade Name', 'Generic Name',\n",
       "       'Type of Legislation', 'Indication_y', 'Indication(s) Studied',\n",
       "       'Therapeutic Category', 'Dosage Form(s)', 'Route(s) of Administration',\n",
       "       'Pharmacological Class', 'Product Labeling Link', 'kind', 'root',\n",
       "       'supplement'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>canon_id</th>\n",
       "      <th>resolved_label</th>\n",
       "      <th>peds_study_type</th>\n",
       "      <th>efficacy_summary</th>\n",
       "      <th>pk_summary</th>\n",
       "      <th>lowest_age_band</th>\n",
       "      <th>highest_age_band</th>\n",
       "      <th>rationale</th>\n",
       "      <th>confidence</th>\n",
       "      <th>summary_json</th>\n",
       "      <th>...</th>\n",
       "      <th>Indication_y</th>\n",
       "      <th>Indication(s) Studied</th>\n",
       "      <th>Therapeutic Category</th>\n",
       "      <th>Dosage Form(s)</th>\n",
       "      <th>Route(s) of Administration</th>\n",
       "      <th>Pharmacological Class</th>\n",
       "      <th>Product Labeling Link</th>\n",
       "      <th>kind</th>\n",
       "      <th>root</th>\n",
       "      <th>supplement</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>BLA_103976_5231</td>\n",
       "      <td>NotExtrapolated</td>\n",
       "      <td>RCT</td>\n",
       "      <td>Asthma: Two controlled RCT trials in patients ...</td>\n",
       "      <td>Safety profile assessed in the asthma studies.</td>\n",
       "      <td>6</td>\n",
       "      <td>&lt;12</td>\n",
       "      <td>Pediatric evidence for asthma includes two con...</td>\n",
       "      <td>high</td>\n",
       "      <td>{\"PediatricSummary\":[{\"section\":\"Asthma\",\"summ...</td>\n",
       "      <td>...</td>\n",
       "      <td>Asthma</td>\n",
       "      <td>Postmarketing study</td>\n",
       "      <td>Antiasthmatic</td>\n",
       "      <td>INJECTABLE</td>\n",
       "      <td>SUBCUTANEOUS</td>\n",
       "      <td>None</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>BLA</td>\n",
       "      <td>103976</td>\n",
       "      <td>5231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BLA_125294_0045</td>\n",
       "      <td>Partial</td>\n",
       "      <td>PK+Safety</td>\n",
       "      <td>There is no pediatric efficacy RCT; rather, sa...</td>\n",
       "      <td>Pediatric pharmacokinetic data (geometric mean...</td>\n",
       "      <td>1 month</td>\n",
       "      <td>&lt;17 years</td>\n",
       "      <td>The extrapolation is based on PK and safety ev...</td>\n",
       "      <td>high</td>\n",
       "      <td>{\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...</td>\n",
       "      <td>...</td>\n",
       "      <td>Other</td>\n",
       "      <td>Reduce the duration of severe neutropenia in  ...</td>\n",
       "      <td>Hematologic Malignancies</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>Leukocyte Growth Factor</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>BLA</td>\n",
       "      <td>125294</td>\n",
       "      <td>0045</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>BLA_125477_0039</td>\n",
       "      <td>Partial</td>\n",
       "      <td>PK+Safety</td>\n",
       "      <td>A single-arm, open-label study in 23 pediatric...</td>\n",
       "      <td>Pharmacokinetic data in pediatric patients wer...</td>\n",
       "      <td>1</td>\n",
       "      <td>16</td>\n",
       "      <td>While there is available pediatric evidence de...</td>\n",
       "      <td>medium</td>\n",
       "      <td>{\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...</td>\n",
       "      <td>...</td>\n",
       "      <td>Solid tumor</td>\n",
       "      <td>Relapsed or refractory solid tumors</td>\n",
       "      <td>Solid Tumor</td>\n",
       "      <td>INJECTABLE</td>\n",
       "      <td>INTRAVENOUS</td>\n",
       "      <td>Other</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>BLA</td>\n",
       "      <td>125477</td>\n",
       "      <td>0039</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>BLA_125526_ORIG1</td>\n",
       "      <td>NotExtrapolated</td>\n",
       "      <td>RCT</td>\n",
       "      <td>The clinical program included adolescents aged...</td>\n",
       "      <td>Adolescents showed a mean apparent clearance a...</td>\n",
       "      <td>12</td>\n",
       "      <td>17</td>\n",
       "      <td>Pediatric efficacy evidence exists from an RCT...</td>\n",
       "      <td>medium</td>\n",
       "      <td>{\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>Treatment in severe asthma in patients 12 year...</td>\n",
       "      <td>Antiasthmatic</td>\n",
       "      <td>INJECTABLE</td>\n",
       "      <td>INTRAVENOUS</td>\n",
       "      <td>Interleukin-5 Antagonist</td>\n",
       "      <td>http://www.accessdata.fda.gov/drugsatfda_docs/...</td>\n",
       "      <td>BLA</td>\n",
       "      <td>125526</td>\n",
       "      <td>ORIG1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>BLA_761039_0015</td>\n",
       "      <td>Partial</td>\n",
       "      <td>PK+Safety</td>\n",
       "      <td>Effectiveness in pediatric patients is extrapo...</td>\n",
       "      <td>Pediatric pharmacokinetic data and exposure mo...</td>\n",
       "      <td>Not specified</td>\n",
       "      <td>Not specified</td>\n",
       "      <td>No pediatric efficacy RCT was conducted; inste...</td>\n",
       "      <td>high</td>\n",
       "      <td>{\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...</td>\n",
       "      <td>...</td>\n",
       "      <td>Chemotherapy induced neutropenia</td>\n",
       "      <td>Decrease the incidence of infection as manifes...</td>\n",
       "      <td>Hematopoietic Growth Factors</td>\n",
       "      <td>INJECTABLE</td>\n",
       "      <td>SUBCUTANEOUS</td>\n",
       "      <td>Leukocyte Growth Factor</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>BLA</td>\n",
       "      <td>761039</td>\n",
       "      <td>0015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>732</th>\n",
       "      <td>NDA_212477_ORIG1</td>\n",
       "      <td>NotExtrapolated</td>\n",
       "      <td>RCT</td>\n",
       "      <td>Study 1116, an open‐label multicenter clinical...</td>\n",
       "      <td>Pharmacokinetic data supports weight‐based dos...</td>\n",
       "      <td>3 years</td>\n",
       "      <td>18 years</td>\n",
       "      <td>The availability of a pediatric efficacy trial...</td>\n",
       "      <td>high</td>\n",
       "      <td>{\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...</td>\n",
       "      <td>...</td>\n",
       "      <td>Hepatitis C</td>\n",
       "      <td>Treatment of chronic hepatitis C virus</td>\n",
       "      <td>Antivirals</td>\n",
       "      <td>PELLETS</td>\n",
       "      <td>ORAL</td>\n",
       "      <td>Hepatitis C Virus NS5A Inhibitor; Hepatitis C ...</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>NDA</td>\n",
       "      <td>212477</td>\n",
       "      <td>ORIG1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>733</th>\n",
       "      <td>NDA_212887_0006</td>\n",
       "      <td>Partial</td>\n",
       "      <td>PK+Safety</td>\n",
       "      <td>Safety and efficacy in adolescents have been e...</td>\n",
       "      <td>The MOCHA trial provided pharmacokinetic data ...</td>\n",
       "      <td>12 years</td>\n",
       "      <td>18 years</td>\n",
       "      <td>Pediatric evidence is limited to adolescents w...</td>\n",
       "      <td>high</td>\n",
       "      <td>{\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...</td>\n",
       "      <td>...</td>\n",
       "      <td>HIV – 1 infection (prevention of transmission ...</td>\n",
       "      <td>HIV-1 treatment and pre-exposure prophylaxis</td>\n",
       "      <td>Antivirals</td>\n",
       "      <td>TABLET</td>\n",
       "      <td>ORAL</td>\n",
       "      <td>Human Immunodeficiency Virus Integrase Strand ...</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>NDA</td>\n",
       "      <td>212887</td>\n",
       "      <td>0006</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>734</th>\n",
       "      <td>NDA_213871_0004</td>\n",
       "      <td>NotExtrapolated</td>\n",
       "      <td>RCT</td>\n",
       "      <td>Clinical studies, including Trial-AD04, evalua...</td>\n",
       "      <td>Although specific PK details were not provided...</td>\n",
       "      <td>12</td>\n",
       "      <td>18</td>\n",
       "      <td>The pediatric label is supported by at least o...</td>\n",
       "      <td>high</td>\n",
       "      <td>{\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...</td>\n",
       "      <td>...</td>\n",
       "      <td>Atopic dermatitis</td>\n",
       "      <td>Moderate-to-severe atopic dermatitis in pediat...</td>\n",
       "      <td>Dermatitis Agents</td>\n",
       "      <td>TABLET</td>\n",
       "      <td>ORAL</td>\n",
       "      <td>Janus Kinase Inhibitor</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>NDA</td>\n",
       "      <td>213871</td>\n",
       "      <td>0004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>735</th>\n",
       "      <td>NDA_761055_0012</td>\n",
       "      <td>NotExtrapolated</td>\n",
       "      <td>RCT</td>\n",
       "      <td>Atopic Dermatitis: Dupixent has been establish...</td>\n",
       "      <td>In the asthma study, higher Dupilumab exposure...</td>\n",
       "      <td>12 years</td>\n",
       "      <td>17 years</td>\n",
       "      <td>The available pediatric data include randomize...</td>\n",
       "      <td>high</td>\n",
       "      <td>{\"PediatricSummary\":[{\"section\":\"Atopic Dermat...</td>\n",
       "      <td>...</td>\n",
       "      <td>Atopic dermatitis</td>\n",
       "      <td>Moderate-to-severe atopic dermatitis in pediat...</td>\n",
       "      <td>Dermatitis Agents</td>\n",
       "      <td>INJECTABLE</td>\n",
       "      <td>SUBCUTANEOUS</td>\n",
       "      <td>Interleukin-4 Receptor alpha Antagonist</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>NDA</td>\n",
       "      <td>761055</td>\n",
       "      <td>0012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>736</th>\n",
       "      <td>NDA_215650</td>\n",
       "      <td>Unlabeled</td>\n",
       "      <td></td>\n",
       "      <td>The approval for pediatric use in patients age...</td>\n",
       "      <td>There is no pediatric PK or safety study data ...</td>\n",
       "      <td>12 years</td>\n",
       "      <td>Not specified</td>\n",
       "      <td>Since the indication for patients aged 12 year...</td>\n",
       "      <td>high</td>\n",
       "      <td>{\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...</td>\n",
       "      <td>...</td>\n",
       "      <td>Bacterial vaginosis</td>\n",
       "      <td>Bacterial vaginosis in female patients 12 year...</td>\n",
       "      <td>Antibacterials</td>\n",
       "      <td>GEL</td>\n",
       "      <td>VAGINAL</td>\n",
       "      <td>Lincosamide Antibacterial</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>NDA</td>\n",
       "      <td>215650</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>737 rows × 65 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             canon_id   resolved_label peds_study_type  \\\n",
       "0     BLA_103976_5231  NotExtrapolated             RCT   \n",
       "1     BLA_125294_0045          Partial       PK+Safety   \n",
       "2     BLA_125477_0039          Partial       PK+Safety   \n",
       "3    BLA_125526_ORIG1  NotExtrapolated             RCT   \n",
       "4     BLA_761039_0015          Partial       PK+Safety   \n",
       "..                ...              ...             ...   \n",
       "732  NDA_212477_ORIG1  NotExtrapolated             RCT   \n",
       "733   NDA_212887_0006          Partial       PK+Safety   \n",
       "734   NDA_213871_0004  NotExtrapolated             RCT   \n",
       "735   NDA_761055_0012  NotExtrapolated             RCT   \n",
       "736        NDA_215650        Unlabeled                   \n",
       "\n",
       "                                      efficacy_summary  \\\n",
       "0    Asthma: Two controlled RCT trials in patients ...   \n",
       "1    There is no pediatric efficacy RCT; rather, sa...   \n",
       "2    A single-arm, open-label study in 23 pediatric...   \n",
       "3    The clinical program included adolescents aged...   \n",
       "4    Effectiveness in pediatric patients is extrapo...   \n",
       "..                                                 ...   \n",
       "732  Study 1116, an open‐label multicenter clinical...   \n",
       "733  Safety and efficacy in adolescents have been e...   \n",
       "734  Clinical studies, including Trial-AD04, evalua...   \n",
       "735  Atopic Dermatitis: Dupixent has been establish...   \n",
       "736  The approval for pediatric use in patients age...   \n",
       "\n",
       "                                            pk_summary lowest_age_band  \\\n",
       "0       Safety profile assessed in the asthma studies.               6   \n",
       "1    Pediatric pharmacokinetic data (geometric mean...         1 month   \n",
       "2    Pharmacokinetic data in pediatric patients wer...               1   \n",
       "3    Adolescents showed a mean apparent clearance a...              12   \n",
       "4    Pediatric pharmacokinetic data and exposure mo...   Not specified   \n",
       "..                                                 ...             ...   \n",
       "732  Pharmacokinetic data supports weight‐based dos...         3 years   \n",
       "733  The MOCHA trial provided pharmacokinetic data ...        12 years   \n",
       "734  Although specific PK details were not provided...              12   \n",
       "735  In the asthma study, higher Dupilumab exposure...        12 years   \n",
       "736  There is no pediatric PK or safety study data ...        12 years   \n",
       "\n",
       "    highest_age_band                                          rationale  \\\n",
       "0                <12  Pediatric evidence for asthma includes two con...   \n",
       "1          <17 years  The extrapolation is based on PK and safety ev...   \n",
       "2                 16  While there is available pediatric evidence de...   \n",
       "3                 17  Pediatric efficacy evidence exists from an RCT...   \n",
       "4      Not specified  No pediatric efficacy RCT was conducted; inste...   \n",
       "..               ...                                                ...   \n",
       "732         18 years  The availability of a pediatric efficacy trial...   \n",
       "733         18 years  Pediatric evidence is limited to adolescents w...   \n",
       "734               18  The pediatric label is supported by at least o...   \n",
       "735         17 years  The available pediatric data include randomize...   \n",
       "736    Not specified  Since the indication for patients aged 12 year...   \n",
       "\n",
       "    confidence                                       summary_json  ...  \\\n",
       "0         high  {\"PediatricSummary\":[{\"section\":\"Asthma\",\"summ...  ...   \n",
       "1         high  {\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...  ...   \n",
       "2       medium  {\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...  ...   \n",
       "3       medium  {\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...  ...   \n",
       "4         high  {\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...  ...   \n",
       "..         ...                                                ...  ...   \n",
       "732       high  {\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...  ...   \n",
       "733       high  {\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...  ...   \n",
       "734       high  {\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...  ...   \n",
       "735       high  {\"PediatricSummary\":[{\"section\":\"Atopic Dermat...  ...   \n",
       "736       high  {\"PediatricSummary\":[{\"section\":\"8.4 Pediatric...  ...   \n",
       "\n",
       "                                          Indication_y  \\\n",
       "0                                               Asthma   \n",
       "1                                                Other   \n",
       "2                                          Solid tumor   \n",
       "3                                                 None   \n",
       "4                     Chemotherapy induced neutropenia   \n",
       "..                                                 ...   \n",
       "732                                        Hepatitis C   \n",
       "733  HIV – 1 infection (prevention of transmission ...   \n",
       "734                                  Atopic dermatitis   \n",
       "735                                  Atopic dermatitis   \n",
       "736                                Bacterial vaginosis   \n",
       "\n",
       "                                 Indication(s) Studied  \\\n",
       "0                                  Postmarketing study   \n",
       "1    Reduce the duration of severe neutropenia in  ...   \n",
       "2                  Relapsed or refractory solid tumors   \n",
       "3    Treatment in severe asthma in patients 12 year...   \n",
       "4    Decrease the incidence of infection as manifes...   \n",
       "..                                                 ...   \n",
       "732             Treatment of chronic hepatitis C virus   \n",
       "733       HIV-1 treatment and pre-exposure prophylaxis   \n",
       "734  Moderate-to-severe atopic dermatitis in pediat...   \n",
       "735  Moderate-to-severe atopic dermatitis in pediat...   \n",
       "736  Bacterial vaginosis in female patients 12 year...   \n",
       "\n",
       "             Therapeutic Category Dosage Form(s)  Route(s) of Administration  \\\n",
       "0                   Antiasthmatic     INJECTABLE                SUBCUTANEOUS   \n",
       "1        Hematologic Malignancies           None                        None   \n",
       "2                     Solid Tumor     INJECTABLE                 INTRAVENOUS   \n",
       "3                   Antiasthmatic     INJECTABLE                 INTRAVENOUS   \n",
       "4    Hematopoietic Growth Factors     INJECTABLE                SUBCUTANEOUS   \n",
       "..                            ...            ...                         ...   \n",
       "732                    Antivirals        PELLETS                        ORAL   \n",
       "733                    Antivirals         TABLET                        ORAL   \n",
       "734             Dermatitis Agents         TABLET                        ORAL   \n",
       "735             Dermatitis Agents     INJECTABLE                SUBCUTANEOUS   \n",
       "736                Antibacterials            GEL                     VAGINAL   \n",
       "\n",
       "                                 Pharmacological Class  \\\n",
       "0                                                 None   \n",
       "1                              Leukocyte Growth Factor   \n",
       "2                                                Other   \n",
       "3                             Interleukin-5 Antagonist   \n",
       "4                              Leukocyte Growth Factor   \n",
       "..                                                 ...   \n",
       "732  Hepatitis C Virus NS5A Inhibitor; Hepatitis C ...   \n",
       "733  Human Immunodeficiency Virus Integrase Strand ...   \n",
       "734                             Janus Kinase Inhibitor   \n",
       "735            Interleukin-4 Receptor alpha Antagonist   \n",
       "736                          Lincosamide Antibacterial   \n",
       "\n",
       "                                 Product Labeling Link kind    root  \\\n",
       "0    https://www.accessdata.fda.gov/drugsatfda_docs...  BLA  103976   \n",
       "1    https://www.accessdata.fda.gov/drugsatfda_docs...  BLA  125294   \n",
       "2    https://www.accessdata.fda.gov/drugsatfda_docs...  BLA  125477   \n",
       "3    http://www.accessdata.fda.gov/drugsatfda_docs/...  BLA  125526   \n",
       "4    https://www.accessdata.fda.gov/drugsatfda_docs...  BLA  761039   \n",
       "..                                                 ...  ...     ...   \n",
       "732  https://www.accessdata.fda.gov/drugsatfda_docs...  NDA  212477   \n",
       "733  https://www.accessdata.fda.gov/drugsatfda_docs...  NDA  212887   \n",
       "734  https://www.accessdata.fda.gov/drugsatfda_docs...  NDA  213871   \n",
       "735  https://www.accessdata.fda.gov/drugsatfda_docs...  NDA  761055   \n",
       "736  https://www.accessdata.fda.gov/drugsatfda_docs...  NDA  215650   \n",
       "\n",
       "     supplement  \n",
       "0          5231  \n",
       "1          0045  \n",
       "2          0039  \n",
       "3         ORIG1  \n",
       "4          0015  \n",
       "..          ...  \n",
       "732       ORIG1  \n",
       "733        0006  \n",
       "734        0004  \n",
       "735        0012  \n",
       "736        None  \n",
       "\n",
       "[737 rows x 65 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final column layout for `combined`; the order of this list is the output order.\n",
    "column_order = [\n",
    "    # application id and its parts, plus the labeling link\n",
    "    'canon_id', 'kind', 'root', 'supplement', 'Product Labeling Link',\n",
    "    # label and summary columns\n",
    "    'label', 'is_gold', 'peds_study_type', 'efficacy_summary', 'pk_summary',\n",
    "    'rationale', 'confidence',\n",
    "    # product descriptors\n",
    "    # NOTE(review): 'Indication_y' is presumably a merge suffix; a later cell renames it to 'Indication'.\n",
    "    'Pediatric Labeling Approval Date', 'Trade Name', 'Generic Name',\n",
    "    'Type of Legislation', 'Indication_y', 'Indication(s) Studied',\n",
    "    'Therapeutic Category', 'Dosage Form(s)', 'Route(s) of Administration',\n",
    "    'Pharmacological Class', 'Labeling Change Summary',\n",
    "    # age range and study count\n",
    "    'age_min', 'age_max', 'total_studies',\n",
    "    # columns named after study types\n",
    "    'Efficacy', 'Safety', 'Pharmacokinetic', 'Pharmacodynamic',\n",
    "    'Tolerability', 'Other_Type',\n",
    "    # columns named after study designs\n",
    "    'Randomized_DoubleBlind', 'Randomized_SingleBlind', 'Open_Label',\n",
    "    'Placebo_Control', 'Active_Comparator', 'Dose_Escalation',\n",
    "    'Population_PK', 'Other_Design',\n",
    "    # neonate, center/country and patient-count columns\n",
    "    'Studied in Neonates', 'Indicated in Neonates',\n",
    "    'Number of Centers', 'Number of Countries',\n",
    "    'Patients Analyzed', 'Patients Enrolled',\n",
    "    # race / ethnicity totals; the double space after '#' is part of the\n",
    "    # source column header and must be kept as-is\n",
    "    'Total #  of American Indian/Alaska Native',\n",
    "    'Total #  of Asian',\n",
    "    'Total #  of Black',\n",
    "    'Total #  of Native Hawaiian or Pacific Islander',\n",
    "    'Total #  of Other Race',\n",
    "    'Total #  of Unknown Ethnicity',\n",
    "    'Total #  of Unknown Race',\n",
    "    'Total #  of White',\n",
    "    'Total # of Hispanic/Latino',\n",
    "    'Total # of Non-Hispanic/Non-Latino',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the columns named in `column_order`, in that order;\n",
    "# a label missing from `combined` raises KeyError.\n",
    "combined = combined.loc[:, column_order]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>canon_id</th>\n",
       "      <th>kind</th>\n",
       "      <th>root</th>\n",
       "      <th>supplement</th>\n",
       "      <th>Product Labeling Link</th>\n",
       "      <th>label</th>\n",
       "      <th>is_gold</th>\n",
       "      <th>peds_study_type</th>\n",
       "      <th>efficacy_summary</th>\n",
       "      <th>pk_summary</th>\n",
       "      <th>...</th>\n",
       "      <th>Total #  of American Indian/Alaska Native</th>\n",
       "      <th>Total #  of Asian</th>\n",
       "      <th>Total #  of Black</th>\n",
       "      <th>Total #  of Native Hawaiian or Pacific Islander</th>\n",
       "      <th>Total #  of Other Race</th>\n",
       "      <th>Total #  of Unknown Ethnicity</th>\n",
       "      <th>Total #  of Unknown Race</th>\n",
       "      <th>Total #  of White</th>\n",
       "      <th>Total # of Hispanic/Latino</th>\n",
       "      <th>Total # of Non-Hispanic/Non-Latino</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>BLA_103976_5231</td>\n",
       "      <td>BLA</td>\n",
       "      <td>103976</td>\n",
       "      <td>5231</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>NotExtrapolated</td>\n",
       "      <td>1</td>\n",
       "      <td>RCT</td>\n",
       "      <td>Asthma: Two controlled RCT trials in patients ...</td>\n",
       "      <td>Safety profile assessed in the asthma studies.</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BLA_125294_0045</td>\n",
       "      <td>BLA</td>\n",
       "      <td>125294</td>\n",
       "      <td>0045</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>Partial</td>\n",
       "      <td>1</td>\n",
       "      <td>PK+Safety</td>\n",
       "      <td>There is no pediatric efficacy RCT; rather, sa...</td>\n",
       "      <td>Pediatric pharmacokinetic data (geometric mean...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>50</td>\n",
       "      <td>0</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>BLA_125477_0039</td>\n",
       "      <td>BLA</td>\n",
       "      <td>125477</td>\n",
       "      <td>0039</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>Partial</td>\n",
       "      <td>1</td>\n",
       "      <td>PK+Safety</td>\n",
       "      <td>A single-arm, open-label study in 23 pediatric...</td>\n",
       "      <td>Pharmacokinetic data in pediatric patients wer...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>23</td>\n",
       "      <td>6</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>BLA_125526_ORIG1</td>\n",
       "      <td>BLA</td>\n",
       "      <td>125526</td>\n",
       "      <td>ORIG1</td>\n",
       "      <td>http://www.accessdata.fda.gov/drugsatfda_docs/...</td>\n",
       "      <td>Partial</td>\n",
       "      <td>1</td>\n",
       "      <td>RCT</td>\n",
       "      <td>The clinical program included adolescents aged...</td>\n",
       "      <td>Adolescents showed a mean apparent clearance a...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>BLA_761039_0015</td>\n",
       "      <td>BLA</td>\n",
       "      <td>761039</td>\n",
       "      <td>0015</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>Partial</td>\n",
       "      <td>1</td>\n",
       "      <td>PK+Safety</td>\n",
       "      <td>Effectiveness in pediatric patients is extrapo...</td>\n",
       "      <td>Pediatric pharmacokinetic data and exposure mo...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>732</th>\n",
       "      <td>NDA_212477_ORIG1</td>\n",
       "      <td>NDA</td>\n",
       "      <td>212477</td>\n",
       "      <td>ORIG1</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>Partial</td>\n",
       "      <td>1</td>\n",
       "      <td>RCT</td>\n",
       "      <td>Study 1116, an open‐label multicenter clinical...</td>\n",
       "      <td>Pharmacokinetic data supports weight‐based dos...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>733</th>\n",
       "      <td>NDA_212887_0006</td>\n",
       "      <td>NDA</td>\n",
       "      <td>212887</td>\n",
       "      <td>0006</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>Partial</td>\n",
       "      <td>1</td>\n",
       "      <td>PK+Safety</td>\n",
       "      <td>Safety and efficacy in adolescents have been e...</td>\n",
       "      <td>The MOCHA trial provided pharmacokinetic data ...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>734</th>\n",
       "      <td>NDA_213871_0004</td>\n",
       "      <td>NDA</td>\n",
       "      <td>213871</td>\n",
       "      <td>0004</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>NotExtrapolated</td>\n",
       "      <td>1</td>\n",
       "      <td>RCT</td>\n",
       "      <td>Clinical studies, including Trial-AD04, evalua...</td>\n",
       "      <td>Although specific PK details were not provided...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>735</th>\n",
       "      <td>NDA_761055_0012</td>\n",
       "      <td>NDA</td>\n",
       "      <td>761055</td>\n",
       "      <td>0012</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>NotExtrapolated</td>\n",
       "      <td>1</td>\n",
       "      <td>RCT</td>\n",
       "      <td>Atopic Dermatitis: Dupixent has been establish...</td>\n",
       "      <td>In the asthma study, higher Dupilumab exposure...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>736</th>\n",
       "      <td>NDA_215650</td>\n",
       "      <td>NDA</td>\n",
       "      <td>215650</td>\n",
       "      <td>None</td>\n",
       "      <td>https://www.accessdata.fda.gov/drugsatfda_docs...</td>\n",
       "      <td>Full</td>\n",
       "      <td>1</td>\n",
       "      <td></td>\n",
       "      <td>The approval for pediatric use in patients age...</td>\n",
       "      <td>There is no pediatric PK or safety study data ...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>172</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>126</td>\n",
       "      <td>78</td>\n",
       "      <td>228</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>737 rows × 56 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             canon_id kind    root supplement  \\\n",
       "0     BLA_103976_5231  BLA  103976       5231   \n",
       "1     BLA_125294_0045  BLA  125294       0045   \n",
       "2     BLA_125477_0039  BLA  125477       0039   \n",
       "3    BLA_125526_ORIG1  BLA  125526      ORIG1   \n",
       "4     BLA_761039_0015  BLA  761039       0015   \n",
       "..                ...  ...     ...        ...   \n",
       "732  NDA_212477_ORIG1  NDA  212477      ORIG1   \n",
       "733   NDA_212887_0006  NDA  212887       0006   \n",
       "734   NDA_213871_0004  NDA  213871       0004   \n",
       "735   NDA_761055_0012  NDA  761055       0012   \n",
       "736        NDA_215650  NDA  215650       None   \n",
       "\n",
       "                                 Product Labeling Link            label  \\\n",
       "0    https://www.accessdata.fda.gov/drugsatfda_docs...  NotExtrapolated   \n",
       "1    https://www.accessdata.fda.gov/drugsatfda_docs...          Partial   \n",
       "2    https://www.accessdata.fda.gov/drugsatfda_docs...          Partial   \n",
       "3    http://www.accessdata.fda.gov/drugsatfda_docs/...          Partial   \n",
       "4    https://www.accessdata.fda.gov/drugsatfda_docs...          Partial   \n",
       "..                                                 ...              ...   \n",
       "732  https://www.accessdata.fda.gov/drugsatfda_docs...          Partial   \n",
       "733  https://www.accessdata.fda.gov/drugsatfda_docs...          Partial   \n",
       "734  https://www.accessdata.fda.gov/drugsatfda_docs...  NotExtrapolated   \n",
       "735  https://www.accessdata.fda.gov/drugsatfda_docs...  NotExtrapolated   \n",
       "736  https://www.accessdata.fda.gov/drugsatfda_docs...             Full   \n",
       "\n",
       "    is_gold peds_study_type  \\\n",
       "0         1             RCT   \n",
       "1         1       PK+Safety   \n",
       "2         1       PK+Safety   \n",
       "3         1             RCT   \n",
       "4         1       PK+Safety   \n",
       "..      ...             ...   \n",
       "732       1             RCT   \n",
       "733       1       PK+Safety   \n",
       "734       1             RCT   \n",
       "735       1             RCT   \n",
       "736       1                   \n",
       "\n",
       "                                      efficacy_summary  \\\n",
       "0    Asthma: Two controlled RCT trials in patients ...   \n",
       "1    There is no pediatric efficacy RCT; rather, sa...   \n",
       "2    A single-arm, open-label study in 23 pediatric...   \n",
       "3    The clinical program included adolescents aged...   \n",
       "4    Effectiveness in pediatric patients is extrapo...   \n",
       "..                                                 ...   \n",
       "732  Study 1116, an open‐label multicenter clinical...   \n",
       "733  Safety and efficacy in adolescents have been e...   \n",
       "734  Clinical studies, including Trial-AD04, evalua...   \n",
       "735  Atopic Dermatitis: Dupixent has been establish...   \n",
       "736  The approval for pediatric use in patients age...   \n",
       "\n",
       "                                            pk_summary  ...  \\\n",
       "0       Safety profile assessed in the asthma studies.  ...   \n",
       "1    Pediatric pharmacokinetic data (geometric mean...  ...   \n",
       "2    Pharmacokinetic data in pediatric patients wer...  ...   \n",
       "3    Adolescents showed a mean apparent clearance a...  ...   \n",
       "4    Pediatric pharmacokinetic data and exposure mo...  ...   \n",
       "..                                                 ...  ...   \n",
       "732  Pharmacokinetic data supports weight‐based dos...  ...   \n",
       "733  The MOCHA trial provided pharmacokinetic data ...  ...   \n",
       "734  Although specific PK details were not provided...  ...   \n",
       "735  In the asthma study, higher Dupilumab exposure...  ...   \n",
       "736  There is no pediatric PK or safety study data ...  ...   \n",
       "\n",
       "    Total #  of American Indian/Alaska Native Total #  of Asian  \\\n",
       "0                                           0                 0   \n",
       "1                                           0                 0   \n",
       "2                                           0                 0   \n",
       "3                                           0                 0   \n",
       "4                                           0                 0   \n",
       "..                                        ...               ...   \n",
       "732                                         0                 0   \n",
       "733                                         0                 0   \n",
       "734                                         0                 0   \n",
       "735                                         0                 0   \n",
       "736                                         0                 0   \n",
       "\n",
       "    Total #  of Black Total #  of Native Hawaiian or Pacific Islander  \\\n",
       "0                   0                                               0   \n",
       "1                   0                                               0   \n",
       "2                   3                                               0   \n",
       "3                   0                                               0   \n",
       "4                   0                                               0   \n",
       "..                ...                                             ...   \n",
       "732                 0                                               0   \n",
       "733                 0                                               0   \n",
       "734                 0                                               0   \n",
       "735                 0                                               0   \n",
       "736               172                                               0   \n",
       "\n",
       "    Total #  of Other Race Total #  of Unknown Ethnicity  \\\n",
       "0                        0                             0   \n",
       "1                        0                             0   \n",
       "2                        0                             1   \n",
       "3                        0                             0   \n",
       "4                        0                             0   \n",
       "..                     ...                           ...   \n",
       "732                      0                             0   \n",
       "733                      0                             0   \n",
       "734                      0                             0   \n",
       "735                      0                             0   \n",
       "736                      9                             0   \n",
       "\n",
       "    Total #  of Unknown Race Total #  of White Total # of Hispanic/Latino  \\\n",
       "0                          0                 0                          0   \n",
       "1                          0                50                          0   \n",
       "2                          3                23                          6   \n",
       "3                          0                 0                          0   \n",
       "4                          0                 0                          0   \n",
       "..                       ...               ...                        ...   \n",
       "732                        0                 0                          0   \n",
       "733                        0                 0                          0   \n",
       "734                        0                 0                          0   \n",
       "735                        0                 0                          0   \n",
       "736                        0               126                         78   \n",
       "\n",
       "    Total # of Non-Hispanic/Non-Latino  \n",
       "0                                    0  \n",
       "1                                   50  \n",
       "2                                   22  \n",
       "3                                    0  \n",
       "4                                    0  \n",
       "..                                 ...  \n",
       "732                                  0  \n",
       "733                                  0  \n",
       "734                                  0  \n",
       "735                                  0  \n",
       "736                                228  \n",
       "\n",
       "[737 rows x 56 columns]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview `combined` after it was restricted to `column_order`.\n",
    "combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip the '_y' suffix so the column is exposed as plain 'Indication'.\n",
    "combined = combined.rename({'Indication_y': 'Indication'}, axis='columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(737, 56)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: (rows, columns) of `combined` at this point.\n",
    "combined.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expose the identifier column under the name 'fda_application_id'.\n",
    "combined = combined.rename({'canon_id': 'fda_application_id'}, axis='columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
