{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"This is a public data source from TableShift.\"\"\"\n",
    "# For more information on datasets and access in TableShift, see:\n",
    "# * https://tableshift.org/datasets.html\n",
    "# * https://github.com/mlfoundations/tableshift\n",
    "import os\n",
    "import zipfile\n",
    "from abc import ABC, abstractmethod\n",
    "from typing import Sequence, Callable\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import utils\n",
    "# diabetes_readmission.py and tabular_dataset.py are from TableShift\n",
    "from diabetes_readmission import \\\n",
    "    DIABETES_READMISSION_RESOURCES, preprocess_diabetes_readmission, DIABETES_READMISSION_FEATURES, get_icd9\n",
    "from tabular_dataset import *\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "def convert_numeric_dtypes(df: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"Run ``convert_dtypes`` over every column of ``df``.\n",
    "\n",
    "    String and boolean conversion are switched off in the call below; the\n",
    "    other conversions of ``convert_dtypes`` keep their defaults. Each column\n",
    "    is overwritten on ``df`` itself, and that same object is returned.\n",
    "    \"\"\"\n",
    "    for name in df.columns:\n",
    "        column = df[name]\n",
    "        df[name] = column.convert_dtypes(convert_string=False, convert_boolean=False)\n",
    "    return df\n",
    "\n",
    "\n",
    "def complete_cases(df: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"Return the rows of ``df`` that have no missing value in any column.\"\"\"\n",
    "    fully_observed = df.dropna(how='any', axis='index')\n",
    "    return fully_observed\n",
    "\n",
    "\n",
    "def apply_column_missingness_threshold(df: pd.DataFrame,\n",
    "                                       missingness_threshold=0.8) -> pd.DataFrame:\n",
    "    \"\"\"Drop every column whose fraction of null entries reaches the threshold.\n",
    "\n",
    "    The comparison is inclusive (``>=``). Columns are removed from ``df`` in\n",
    "    place, and that same object is returned.\n",
    "    \"\"\"\n",
    "    null_fraction = df.isnull().sum() / len(df)\n",
    "    too_sparse = [name for name, fraction in null_fraction.items()\n",
    "                  if fraction >= missingness_threshold]\n",
    "    df.drop(columns=too_sparse, inplace=True)\n",
    "    return df\n",
    "\n",
    "\n",
    "\n",
    "class DataSource(ABC):\n",
    "    \"\"\"Abstract class to represent a generic data source.\n",
    "\n",
    "    Args:\n",
    "        cache_dir: directory that holds the downloaded resources; it is handed\n",
    "            to utils.initialize_dir when the instance is constructed.\n",
    "        preprocess_fn: callable applied by get_data() to the frame returned\n",
    "            by _load_data().\n",
    "        resources: URLs fetched into cache_dir; None is treated as an empty\n",
    "            list.\n",
    "        download: when False, get_data() skips the download step and uses\n",
    "            whatever is already present in cache_dir.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, cache_dir: str,\n",
    "                 preprocess_fn: Callable[[pd.DataFrame], pd.DataFrame],\n",
    "                 resources: Sequence[str] = None,\n",
    "                 download: bool = True,\n",
    "                 ):\n",
    "        self.cache_dir = cache_dir\n",
    "        self.download = download\n",
    "\n",
    "        self.preprocess_fn = preprocess_fn\n",
    "        self.resources = resources\n",
    "        self._initialize_cache_dir()\n",
    "\n",
    "    def _initialize_cache_dir(self):\n",
    "        \"\"\"Create cache_dir if it does not exist.\"\"\"\n",
    "        utils.initialize_dir(self.cache_dir)\n",
    "\n",
    "    def get_data(self) -> pd.DataFrame:\n",
    "        \"\"\"Fetch data from local cache or download if necessary.\n",
    "\n",
    "        Returns:\n",
    "            The result of preprocess_fn applied to the raw frame.\n",
    "        \"\"\"\n",
    "        self._download_if_not_cached()\n",
    "        raw_data = self._load_data()\n",
    "        return self.preprocess_fn(raw_data)\n",
    "\n",
    "    def _download_if_not_cached(self):\n",
    "        \"\"\"Download files if they are not already cached.\n",
    "\n",
    "        Does nothing when download is False or when there are no resources.\n",
    "        \"\"\"\n",
    "        if not self.download:\n",
    "            return\n",
    "        # NOTE(review): utils.download_file presumably skips files that are\n",
    "        # already on disk -- confirm against utils.\n",
    "        for url in self.resources or ():\n",
    "            utils.download_file(url, self.cache_dir)\n",
    "\n",
    "    @abstractmethod\n",
    "    def _load_data(self) -> pd.DataFrame:\n",
    "        \"\"\"Load the raw data from disk and return it.\n",
    "\n",
    "        Any preprocessing should be performed in preprocess_fn, not here.\n",
    "\n",
    "        Raises:\n",
    "            NotImplementedError: always; subclasses must override this method.\n",
    "        \"\"\"\n",
    "        # A bare `raise` here has no active exception to re-raise and would\n",
    "        # surface as an unrelated RuntimeError.\n",
    "        raise NotImplementedError\n",
    "\n",
    "    @property\n",
    "    def is_cached(self) -> bool:\n",
    "        \"\"\"Check whether all resources exist in cache dir.\n",
    "\n",
    "        Returns True when there are no resources to check.\n",
    "        \"\"\"\n",
    "        return all(\n",
    "            os.path.exists(os.path.join(self.cache_dir,\n",
    "                                        utils.basename_from_url(url)))\n",
    "            for url in self.resources or ()\n",
    "        )\n",
    "\n",
    "class DiabetesReadmissionDataSource(DataSource):\n",
    "    \"\"\"Data source that reads ``diabetic_data.csv`` out of ``dataset_diabetes.zip``.\"\"\"\n",
    "\n",
    "    def __init__(self, resources=DIABETES_READMISSION_RESOURCES,\n",
    "                 preprocess_fn=preprocess_diabetes_readmission, **kwargs):\n",
    "        \"\"\"Pass the diabetes-readmission defaults and any extra kwargs to DataSource.\"\"\"\n",
    "        super().__init__(preprocess_fn=preprocess_fn, resources=resources,\n",
    "                         **kwargs)\n",
    "\n",
    "    def _load_data(self) -> pd.DataFrame:\n",
    "        \"\"\"Extract the cached archive into cache_dir and load the CSV it contains.\"\"\"\n",
    "        archive_path = os.path.join(self.cache_dir, \"dataset_diabetes.zip\")\n",
    "        with zipfile.ZipFile(archive_path, 'r') as archive:\n",
    "            archive.extractall(self.cache_dir)\n",
    "        csv_path = os.path.join(self.cache_dir, \"dataset_diabetes\",\n",
    "                                \"diabetic_data.csv\")\n",
    "        # \"?\" cells in the file are parsed as missing values.\n",
    "        return pd.read_csv(csv_path, na_values=\"?\", low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "num_lab_procedures    int64\n",
       "num_procedures        int64\n",
       "num_medications       int64\n",
       "number_outpatient     int64\n",
       "number_emergency      int64\n",
       "number_inpatient      int64\n",
       "number_diagnoses      int64\n",
       "dtype: object"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cache location: set the TABLESHIFT_CACHE_DIR environment variable to override it;\n",
    "# the fallback is the directory this notebook was originally run with.\n",
    "DIABETES_CACHE_DIR = os.environ.get(\n",
    "    'TABLESHIFT_CACHE_DIR',\n",
    "    '/Users/ruiqil/Documents/DistShift/tableshift_fetch/DiabetesReadmission')\n",
    "df_DiabetesReadmission = DiabetesReadmissionDataSource(cache_dir=DIABETES_CACHE_DIR).get_data()\n",
    "# Inspect the dtypes of the count-valued columns after preprocessing.\n",
    "count_columns = ['num_lab_procedures', 'num_procedures', 'num_medications',\n",
    "                 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']\n",
    "df_DiabetesReadmission[count_columns].dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Summary for column: race\n",
      "race\n",
      "Asian                521\n",
      "Hispanic            1419\n",
      "AfricanAmerican    14693\n",
      "Caucasian          56982\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: gender\n",
      "gender\n",
      "1    33773\n",
      "0    39842\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: age\n",
      "age\n",
      "[20-30)      1257\n",
      "[90-100)     1924\n",
      "[30-40)      2828\n",
      "[40-50)      7101\n",
      "[80-90)     12406\n",
      "[50-60)     12815\n",
      "[60-70)     16696\n",
      "[70-80)     18588\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: weight\n",
      "weight\n",
      ">200             2\n",
      "[0-25)          10\n",
      "[175-200)       10\n",
      "[150-175)       27\n",
      "[25-50)         45\n",
      "[125-150)      107\n",
      "[100-125)      479\n",
      "[50-75)        516\n",
      "[75-100)       874\n",
      "NaN          71545\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['gender', 'admission_type_id', 'discharge_disposition_id',\n",
       "       'admission_source_id', 'time_in_hospital', 'num_lab_procedures',\n",
       "       'num_procedures', 'num_medications', 'number_outpatient',\n",
       "       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',\n",
       "       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',\n",
       "       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',\n",
       "       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',\n",
       "       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',\n",
       "       'tolazamide', 'examide', 'citoglipton', 'insulin',\n",
       "       'glyburide-metformin', 'glipizide-metformin',\n",
       "       'glimepiride-pioglitazone', 'metformin-rosiglitazone',\n",
       "       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',\n",
       "       'race_Caucasian', 'race_AfricanAmerican', 'race_Hispanic', 'race_Asian',\n",
       "       'age>=70'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cohort selection. All row filters are combined into one mask and the result is\n",
    "# copied, so the column assignments below write to an independent frame rather\n",
    "# than to a filtered slice (which is what triggers pandas' SettingWithCopyWarning).\n",
    "keep_rows = (\n",
    "    (df_DiabetesReadmission['gender'] != 'Unknown/Invalid')\n",
    "    & (df_DiabetesReadmission['race'] != 'Other')\n",
    "    & df_DiabetesReadmission['admission_source_id'].isin([1, 7])\n",
    "    & df_DiabetesReadmission['admission_type_id'].isin([1, 2, 3])\n",
    "    & df_DiabetesReadmission['discharge_disposition_id'].isin([1, 3, 6, 22, 2, 5, 4, 23])\n",
    "    & ~df_DiabetesReadmission['age'].isin(['[0-10)', '[10-20)'])\n",
    ")\n",
    "df_DiabetesReadmission = df_DiabetesReadmission.loc[keep_rows].copy()\n",
    "\n",
    "# One 0/1 indicator column per remaining race category.\n",
    "for race in ['Caucasian', 'AfricanAmerican', 'Hispanic', 'Asian']:\n",
    "    df_DiabetesReadmission['race_' + race] = np.where(df_DiabetesReadmission['race'] == race, 1, 0)\n",
    "\n",
    "# Binary encodings: gender is 1 for 'Male' and 0 otherwise; age>=70 is 1 for the three oldest brackets.\n",
    "df_DiabetesReadmission['gender'] = np.where(df_DiabetesReadmission['gender'] == 'Male', 1, 0)\n",
    "df_DiabetesReadmission['age>=70'] = np.where(df_DiabetesReadmission['age'].isin(['[70-80)', '[80-90)', '[90-100)']), 1, 0)\n",
    "\n",
    "# Summarise the demographic columns before the raw versions are dropped.\n",
    "for col in ['race', 'gender', 'age', 'weight']:\n",
    "    print(f\"Summary for column: {col}\")\n",
    "    print(df_DiabetesReadmission[col].value_counts(dropna=False).sort_values())\n",
    "    print(\"-\" * 30)\n",
    "\n",
    "# Drop identifiers, the raw columns that were encoded above, and weight,\n",
    "# payer_code and medical_specialty (these last two are not one-hot encoded).\n",
    "df_DiabetesReadmission = df_DiabetesReadmission.drop(\n",
    "    columns=['encounter_id', 'patient_nbr', 'race', 'age', 'weight', \"payer_code\", \"medical_specialty\"]\n",
    ").reset_index(drop=True)\n",
    "df_DiabetesReadmission.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Summary for column: repaglinide\n",
      "repaglinide\n",
      "Down         31\n",
      "Up           86\n",
      "Steady     1199\n",
      "No        72299\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: nateglinide\n",
      "nateglinide\n",
      "Down          9\n",
      "Up           20\n",
      "Steady      579\n",
      "No        73007\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: chlorpropamide\n",
      "chlorpropamide\n",
      "Down          1\n",
      "Up            4\n",
      "Steady       37\n",
      "No        73573\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: glimepiride\n",
      "glimepiride\n",
      "Down        147\n",
      "Up          229\n",
      "Steady     3712\n",
      "No        69527\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: acetohexamide\n",
      "acetohexamide\n",
      "Steady        1\n",
      "No        73614\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: glipizide\n",
      "glipizide\n",
      "Down        406\n",
      "Up          527\n",
      "Steady     8474\n",
      "No        64208\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: glyburide\n",
      "glyburide\n",
      "Down        357\n",
      "Up          548\n",
      "Steady     6425\n",
      "No        66285\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: tolbutamide\n",
      "tolbutamide\n",
      "Steady       13\n",
      "No        73602\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: pioglitazone\n",
      "pioglitazone\n",
      "Down         84\n",
      "Up          169\n",
      "Steady     5398\n",
      "No        67964\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: rosiglitazone\n",
      "rosiglitazone\n",
      "Down         53\n",
      "Up          122\n",
      "Steady     4535\n",
      "No        68905\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: acarbose\n",
      "acarbose\n",
      "Down          3\n",
      "Up            6\n",
      "Steady      220\n",
      "No        73386\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: miglitol\n",
      "miglitol\n",
      "Up            1\n",
      "Down          5\n",
      "Steady       27\n",
      "No        73582\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: troglitazone\n",
      "troglitazone\n",
      "Steady        2\n",
      "No        73613\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: tolazamide\n",
      "tolazamide\n",
      "Steady       15\n",
      "No        73600\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: examide\n",
      "examide\n",
      "No    73615\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: citoglipton\n",
      "citoglipton\n",
      "No    73615\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: glyburide-metformin\n",
      "glyburide-metformin\n",
      "Down          2\n",
      "Up            4\n",
      "Steady      612\n",
      "No        72997\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: glipizide-metformin\n",
      "glipizide-metformin\n",
      "Steady       12\n",
      "No        73603\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: glimepiride-pioglitazone\n",
      "glimepiride-pioglitazone\n",
      "Steady        1\n",
      "No        73614\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: metformin-rosiglitazone\n",
      "metformin-rosiglitazone\n",
      "No    73615\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: metformin-pioglitazone\n",
      "metformin-pioglitazone\n",
      "Steady        1\n",
      "No        73614\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['gender', 'admission_type_id', 'discharge_disposition_id',\n",
       "       'admission_source_id', 'time_in_hospital', 'num_lab_procedures',\n",
       "       'num_procedures', 'num_medications', 'number_outpatient',\n",
       "       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',\n",
       "       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',\n",
       "       'insulin', 'change', 'diabetesMed', 'readmitted', 'race_Caucasian',\n",
       "       'race_AfricanAmerican', 'race_Hispanic', 'race_Asian', 'age>=70'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Medication columns to discard; the summaries printed below show the value\n",
    "# distribution of each one before it is removed.\n",
    "low_quality_col = [\n",
    "    'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',\n",
    "    'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',\n",
    "    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol',\n",
    "    'troglitazone', 'tolazamide', 'examide', 'citoglipton',\n",
    "    'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',\n",
    "    'metformin-rosiglitazone', 'metformin-pioglitazone',\n",
    "]\n",
    "separator = \"-\" * 30\n",
    "for col in low_quality_col:\n",
    "    print(f\"Summary for column: {col}\")\n",
    "    counts = df_DiabetesReadmission[col].value_counts(dropna=False)\n",
    "    print(counts.sort_values())\n",
    "    print(separator)\n",
    "\n",
    "df_DiabetesReadmission = (\n",
    "    df_DiabetesReadmission\n",
    "    .drop(columns=low_quality_col)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "df_DiabetesReadmission.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Summary for column: max_glu_serum\n",
      "max_glu_serum\n",
      ">200        7\n",
      "Norm       44\n",
      ">300      255\n",
      "NaN     73309\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: A1Cresult\n",
      "A1Cresult\n",
      ">7       2887\n",
      "Norm     3912\n",
      ">8       5694\n",
      "NaN     61122\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: metformin\n",
      "metformin\n",
      "Down        425\n",
      "Up          771\n",
      "Steady    13994\n",
      "No        58425\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: insulin\n",
      "insulin\n",
      "Up         8354\n",
      "Down       9390\n",
      "Steady    22139\n",
      "No        33732\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: change\n",
      "change\n",
      "Ch    35103\n",
      "No    38512\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: diabetesMed\n",
      "diabetesMed\n",
      "No     16442\n",
      "Yes    57173\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gender</th>\n",
       "      <th>admission_type_id</th>\n",
       "      <th>discharge_disposition_id</th>\n",
       "      <th>admission_source_id</th>\n",
       "      <th>time_in_hospital</th>\n",
       "      <th>num_lab_procedures</th>\n",
       "      <th>num_procedures</th>\n",
       "      <th>num_medications</th>\n",
       "      <th>number_outpatient</th>\n",
       "      <th>number_emergency</th>\n",
       "      <th>...</th>\n",
       "      <th>max_glu_serum&gt;200</th>\n",
       "      <th>max_glu_serum&gt;300</th>\n",
       "      <th>A1Cresult&gt;7</th>\n",
       "      <th>A1Cresult&gt;8</th>\n",
       "      <th>metformin_Up</th>\n",
       "      <th>metformin_Down</th>\n",
       "      <th>metformin_Steady</th>\n",
       "      <th>insulin_Up</th>\n",
       "      <th>insulin_Down</th>\n",
       "      <th>insulin_Steady</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "      <td>5</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>44</td>\n",
       "      <td>1</td>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>51</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>5</td>\n",
       "      <td>73</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>9</td>\n",
       "      <td>47</td>\n",
       "      <td>2</td>\n",
       "      <td>17</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73610</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>6</td>\n",
       "      <td>45</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73611</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>3</td>\n",
       "      <td>51</td>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73612</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>53</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73613</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>10</td>\n",
       "      <td>45</td>\n",
       "      <td>2</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73614</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>6</td>\n",
       "      <td>13</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>73615 rows × 35 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       gender  admission_type_id  discharge_disposition_id  \\\n",
       "0           0                  1                         1   \n",
       "1           1                  1                         1   \n",
       "2           1                  1                         1   \n",
       "3           1                  1                         1   \n",
       "4           0                  1                         1   \n",
       "...       ...                ...                       ...   \n",
       "73610       0                  1                         1   \n",
       "73611       1                  1                         3   \n",
       "73612       1                  1                         1   \n",
       "73613       0                  2                         3   \n",
       "73614       1                  1                         1   \n",
       "\n",
       "       admission_source_id  time_in_hospital  num_lab_procedures  \\\n",
       "0                        7                 2                  11   \n",
       "1                        7                 2                  44   \n",
       "2                        7                 1                  51   \n",
       "3                        7                 5                  73   \n",
       "4                        7                 9                  47   \n",
       "...                    ...               ...                 ...   \n",
       "73610                    7                 6                  45   \n",
       "73611                    7                 3                  51   \n",
       "73612                    7                 1                  53   \n",
       "73613                    7                10                  45   \n",
       "73614                    7                 6                  13   \n",
       "\n",
       "       num_procedures  num_medications  number_outpatient  number_emergency  \\\n",
       "0                   5               13                  2                 0   \n",
       "1                   1               16                  0                 0   \n",
       "2                   0                8                  0                 0   \n",
       "3                   0               12                  0                 0   \n",
       "4                   2               17                  0                 0   \n",
       "...               ...              ...                ...               ...   \n",
       "73610               1               25                  3                 1   \n",
       "73611               0               16                  0                 0   \n",
       "73612               0                9                  1                 0   \n",
       "73613               2               21                  0                 0   \n",
       "73614               3                3                  0                 0   \n",
       "\n",
       "       ...  max_glu_serum>200 max_glu_serum>300 A1Cresult>7 A1Cresult>8  \\\n",
       "0      ...                NaN               NaN         NaN         NaN   \n",
       "1      ...                NaN               NaN         NaN         NaN   \n",
       "2      ...                NaN               NaN         NaN         NaN   \n",
       "3      ...                NaN               NaN         NaN         NaN   \n",
       "4      ...                NaN               NaN         NaN         NaN   \n",
       "...    ...                ...               ...         ...         ...   \n",
       "73610  ...                NaN               NaN         NaN         NaN   \n",
       "73611  ...                NaN               NaN         1.0         1.0   \n",
       "73612  ...                NaN               NaN         NaN         NaN   \n",
       "73613  ...                NaN               NaN         NaN         NaN   \n",
       "73614  ...                NaN               NaN         NaN         NaN   \n",
       "\n",
       "       metformin_Up  metformin_Down  metformin_Steady  insulin_Up  \\\n",
       "0                 0               0                 0           0   \n",
       "1                 0               0                 0           1   \n",
       "2                 0               0                 0           0   \n",
       "3                 0               0                 0           0   \n",
       "4                 0               0                 0           0   \n",
       "...             ...             ...               ...         ...   \n",
       "73610             0               0                 0           0   \n",
       "73611             0               0                 1           0   \n",
       "73612             0               0                 1           0   \n",
       "73613             0               0                 0           1   \n",
       "73614             0               0                 0           0   \n",
       "\n",
       "       insulin_Down  insulin_Steady  \n",
       "0                 0               0  \n",
       "1                 0               0  \n",
       "2                 0               1  \n",
       "3                 0               0  \n",
       "4                 0               1  \n",
       "...             ...             ...  \n",
       "73610             1               0  \n",
       "73611             1               0  \n",
       "73612             1               0  \n",
       "73613             0               0  \n",
       "73614             0               0  \n",
       "\n",
       "[73615 rows x 35 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the raw categories (missing values included) before recoding them.\n",
    "for col in ['max_glu_serum', 'A1Cresult', 'metformin', 'insulin', 'change', 'diabetesMed']:\n",
    "    print(f\"Summary for column: {col}\")\n",
    "    print(df_DiabetesReadmission[col].value_counts(dropna=False).sort_values())\n",
    "    print(\"-\" * 30)\n",
    "\n",
    "# Unmeasured lab results may reach this cell as NaN rather than the literal string\n",
    "# \"None\" (recent pandas versions parse \"None\" as missing when reading CSVs; confirm\n",
    "# against the loader). Series.map leaves NaN as NaN, so restore the label first to get\n",
    "# the 0 that the mappings below intend for unmeasured rows.\n",
    "glu_serum = df_DiabetesReadmission[\"max_glu_serum\"].fillna(\"None\")\n",
    "a1c_result = df_DiabetesReadmission[\"A1Cresult\"].fillna(\"None\")\n",
    "\n",
    "# Cumulative threshold indicators: \">200\" is also set for \">300\" results, and \">7\" for\n",
    "# \">8\" results. The original column is then reduced to a \"test was performed\" flag.\n",
    "df_DiabetesReadmission[\"max_glu_serum>200\"] = glu_serum.map({\"None\": 0, \"Norm\": 0, \">200\": 1, \">300\": 1})\n",
    "df_DiabetesReadmission[\"max_glu_serum>300\"] = glu_serum.map({\"None\": 0, \"Norm\": 0, \">200\": 0, \">300\": 1})\n",
    "df_DiabetesReadmission[\"max_glu_serum\"] = glu_serum.map({\"None\": 0, \"Norm\": 1, \">200\": 1, \">300\": 1})\n",
    "df_DiabetesReadmission[\"A1Cresult>7\"] = a1c_result.map({\"None\": 0, \"Norm\": 0, \">7\": 1, \">8\": 1})\n",
    "df_DiabetesReadmission[\"A1Cresult>8\"] = a1c_result.map({\"None\": 0, \"Norm\": 0, \">7\": 0, \">8\": 1})\n",
    "df_DiabetesReadmission[\"A1Cresult\"] = a1c_result.map({\"None\": 0, \"Norm\": 1, \">7\": 1, \">8\": 1})\n",
    "\n",
    "# Binary flags: 1 only for the named category, 0 for everything else (missing included).\n",
    "df_DiabetesReadmission['change'] = np.where(df_DiabetesReadmission['change'] == 'Ch', 1, 0)\n",
    "df_DiabetesReadmission['diabetesMed'] = np.where(df_DiabetesReadmission['diabetesMed'] == 'Yes', 1, 0)\n",
    "\n",
    "# One indicator per dosage change for each drug; any other value is the all-zero baseline.\n",
    "for drug in ('metformin', 'insulin'):\n",
    "    for dosage_change in ('Up', 'Down', 'Steady'):\n",
    "        df_DiabetesReadmission[f'{drug}_{dosage_change}'] = np.where(\n",
    "            df_DiabetesReadmission[drug] == dosage_change, 1, 0\n",
    "        )\n",
    "\n",
    "# The raw drug columns are fully represented by the indicators above.\n",
    "# NOTE: this cell overwrites and drops its own inputs, so it cannot be re-run without\n",
    "# reloading df_DiabetesReadmission first.\n",
    "df_DiabetesReadmission = df_DiabetesReadmission.drop(columns=['metformin', 'insulin'])\n",
    "df_DiabetesReadmission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from features import *\n",
    "\n",
    "# Columns to run through the value-map transforms.\n",
    "value_map_columns = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'diag_1', 'diag_2', 'diag_3']\n",
    "\n",
    "# Build the value-map transforms for the feature list, then keep only the entries whose\n",
    "# third element mentions one of the columns above (presumably the transform's target\n",
    "# column(s); confirm against make_value_map_transforms).\n",
    "transforms = make_value_map_transforms(DIABETES_READMISSION_FEATURES)\n",
    "filtered_transforms = [\n",
    "    transform for transform in transforms\n",
    "    if any(column in transform[2] for column in value_map_columns)\n",
    "]\n",
    "\n",
    "# ColumnTransformer restricted to the filtered transforms.\n",
    "ct = ColumnTransformer(filtered_transforms)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>admission_type_id</th>\n",
       "      <th>discharge_disposition_id</th>\n",
       "      <th>admission_source_id</th>\n",
       "      <th>diag_1</th>\n",
       "      <th>diag_2</th>\n",
       "      <th>diag_3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Emergency</td>\n",
       "      <td>Discharged to home</td>\n",
       "      <td>Emergency Room</td>\n",
       "      <td>Other current conditions in the mother classif...</td>\n",
       "      <td>Diabetes mellitus</td>\n",
       "      <td>Outcome of delivery</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Emergency</td>\n",
       "      <td>Discharged to home</td>\n",
       "      <td>Emergency Room</td>\n",
       "      <td>Intestinal infections due to other organisms</td>\n",
       "      <td>Diabetes mellitus type 1, uncontrolled Diabete...</td>\n",
       "      <td>Hypertensive chronic kidney disease</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Emergency</td>\n",
       "      <td>Discharged to home</td>\n",
       "      <td>Emergency Room</td>\n",
       "      <td>Secondary malignant neoplasm of respiratory an...</td>\n",
       "      <td>Malignant neoplasm of pancreas</td>\n",
       "      <td>Diabetes mellitus</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Emergency</td>\n",
       "      <td>Discharged to home</td>\n",
       "      <td>Emergency Room</td>\n",
       "      <td>Heart failure</td>\n",
       "      <td>Emphysema</td>\n",
       "      <td>Diabetes mellitus</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Emergency</td>\n",
       "      <td>Discharged to home</td>\n",
       "      <td>Emergency Room</td>\n",
       "      <td>Diabetes with peripheral circulatory disorders</td>\n",
       "      <td>Hypertensive chronic kidney disease</td>\n",
       "      <td>Complications peculiar to certain specified pr...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73610</th>\n",
       "      <td>Emergency</td>\n",
       "      <td>Discharged to home</td>\n",
       "      <td>Emergency Room</td>\n",
       "      <td>Epilepsy and recurrent seizures</td>\n",
       "      <td>Late effects of cerebrovascular disease</td>\n",
       "      <td>Old myocardial infarction</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73611</th>\n",
       "      <td>Emergency</td>\n",
       "      <td>Discharged/transferred to SNF</td>\n",
       "      <td>Emergency Room</td>\n",
       "      <td>Diabetes mellitus type 1, uncontrolled Diabete...</td>\n",
       "      <td>Alcohol-induced mental disorders</td>\n",
       "      <td>Hypotension</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73612</th>\n",
       "      <td>Emergency</td>\n",
       "      <td>Discharged to home</td>\n",
       "      <td>Emergency Room</td>\n",
       "      <td>Septicemia</td>\n",
       "      <td>Infections of kidney</td>\n",
       "      <td>Episodic mood disorders</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73613</th>\n",
       "      <td>Urgent</td>\n",
       "      <td>Discharged/transferred to SNF</td>\n",
       "      <td>Emergency Room</td>\n",
       "      <td>Complications peculiar to certain specified pr...</td>\n",
       "      <td>Other and unspecified anemias</td>\n",
       "      <td>Other complications of procedures, NEC</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73614</th>\n",
       "      <td>Emergency</td>\n",
       "      <td>Discharged to home</td>\n",
       "      <td>Emergency Room</td>\n",
       "      <td>Diseases of esophagus</td>\n",
       "      <td>Diseases of esophagus</td>\n",
       "      <td>Symptoms involving digestive system</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>73615 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      admission_type_id       discharge_disposition_id admission_source_id  \\\n",
       "0             Emergency             Discharged to home      Emergency Room   \n",
       "1             Emergency             Discharged to home      Emergency Room   \n",
       "2             Emergency             Discharged to home      Emergency Room   \n",
       "3             Emergency             Discharged to home      Emergency Room   \n",
       "4             Emergency             Discharged to home      Emergency Room   \n",
       "...                 ...                            ...                 ...   \n",
       "73610         Emergency             Discharged to home      Emergency Room   \n",
       "73611         Emergency  Discharged/transferred to SNF      Emergency Room   \n",
       "73612         Emergency             Discharged to home      Emergency Room   \n",
       "73613            Urgent  Discharged/transferred to SNF      Emergency Room   \n",
       "73614         Emergency             Discharged to home      Emergency Room   \n",
       "\n",
       "                                                  diag_1  \\\n",
       "0      Other current conditions in the mother classif...   \n",
       "1           Intestinal infections due to other organisms   \n",
       "2      Secondary malignant neoplasm of respiratory an...   \n",
       "3                                          Heart failure   \n",
       "4         Diabetes with peripheral circulatory disorders   \n",
       "...                                                  ...   \n",
       "73610                    Epilepsy and recurrent seizures   \n",
       "73611  Diabetes mellitus type 1, uncontrolled Diabete...   \n",
       "73612                                         Septicemia   \n",
       "73613  Complications peculiar to certain specified pr...   \n",
       "73614                              Diseases of esophagus   \n",
       "\n",
       "                                                  diag_2  \\\n",
       "0                                      Diabetes mellitus   \n",
       "1      Diabetes mellitus type 1, uncontrolled Diabete...   \n",
       "2                         Malignant neoplasm of pancreas   \n",
       "3                                              Emphysema   \n",
       "4                    Hypertensive chronic kidney disease   \n",
       "...                                                  ...   \n",
       "73610            Late effects of cerebrovascular disease   \n",
       "73611                   Alcohol-induced mental disorders   \n",
       "73612                               Infections of kidney   \n",
       "73613                      Other and unspecified anemias   \n",
       "73614                              Diseases of esophagus   \n",
       "\n",
       "                                                  diag_3  \n",
       "0                                    Outcome of delivery  \n",
       "1                    Hypertensive chronic kidney disease  \n",
       "2                                      Diabetes mellitus  \n",
       "3                                      Diabetes mellitus  \n",
       "4      Complications peculiar to certain specified pr...  \n",
       "...                                                  ...  \n",
       "73610                          Old myocardial infarction  \n",
       "73611                                        Hypotension  \n",
       "73612                            Episodic mood disorders  \n",
       "73613             Other complications of procedures, NEC  \n",
       "73614                Symptoms involving digestive system  \n",
       "\n",
       "[73615 rows x 6 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Missing codes are replaced by the string \"nan\" before the coded columns are handed to\n",
    "# the transformer; the result is wrapped in a DataFrame with the transformer's output\n",
    "# names passed through remove_verbose_prefixes.\n",
    "coded_columns = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'diag_1', 'diag_2', 'diag_3']\n",
    "coded_values = df_DiabetesReadmission[coded_columns].fillna(\"nan\")\n",
    "df_transformed = pd.DataFrame(\n",
    "    ct.fit_transform(coded_values),\n",
    "    columns=remove_verbose_prefixes(ct.get_feature_names_out()),\n",
    ")\n",
    "df_transformed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "admission_type_id\n",
       "2    13864\n",
       "3    15565\n",
       "1    44186\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Frequency of each raw admission-type code, rarest first (missing values counted).\n",
    "(\n",
    "    df_DiabetesReadmission['admission_type_id']\n",
    "    .value_counts(dropna=False)\n",
    "    .sort_values()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "admission_type_id\n",
       "Urgent       13864\n",
       "Elective     15565\n",
       "Emergency    44186\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same tally on the transformed labels, to compare against the raw-code counts.\n",
    "(\n",
    "    df_transformed['admission_type_id']\n",
    "    .value_counts(dropna=False)\n",
    "    .sort_values()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "admission_source_id\n",
       "1    24402\n",
       "7    49213\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Frequency of each raw admission-source code, rarest first (missing values counted).\n",
    "(\n",
    "    df_DiabetesReadmission['admission_source_id']\n",
    "    .value_counts(dropna=False)\n",
    "    .sort_values()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "admission_source_id\n",
       "Physician Referral    24402\n",
       "Emergency Room        49213\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same tally on the transformed labels, to compare against the raw-code counts.\n",
    "(\n",
    "    df_transformed['admission_source_id']\n",
    "    .value_counts(dropna=False)\n",
    "    .sort_values()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "discharge_disposition_id\n",
       "23      339\n",
       "4       600\n",
       "5       807\n",
       "2      1588\n",
       "22     1805\n",
       "6     10084\n",
       "3     10231\n",
       "1     48161\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Frequency of each raw discharge-disposition code, rarest first (missing values counted).\n",
    "(\n",
    "    df_DiabetesReadmission['discharge_disposition_id']\n",
    "    .value_counts(dropna=False)\n",
    "    .sort_values()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "discharge_disposition_id\n",
       "Discharged/transferred to a long term care hospital.                                  339\n",
       "Discharged/transferred to ICF                                                         600\n",
       "Discharged/transferred to another type of inpatient care institution                  807\n",
       "Discharged/transferred to another short term hospital                                1588\n",
       "Discharged/transferred to another rehab fac including rehab units of a hospital.     1805\n",
       "Discharged/transferred to home with home health service                             10084\n",
       "Discharged/transferred to SNF                                                       10231\n",
       "Discharged to home                                                                  48161\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same tally on the transformed labels, to compare against the raw-code counts.\n",
    "(\n",
    "    df_transformed['discharge_disposition_id']\n",
    "    .value_counts(dropna=False)\n",
    "    .sort_values()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def map_discharge_simple(x):\n",
    "    \"\"\"Collapse a discharge-disposition label into a coarse group.\n",
    "\n",
    "    Args:\n",
    "        x: A single discharge label. Missing values (None/NaN) are accepted, and\n",
    "            non-string values are converted with str() before matching.\n",
    "\n",
    "    Returns:\n",
    "        'Home' if the label mentions 'home', 'Transferred' if it otherwise mentions\n",
    "        'transferred', and None when the value is missing, blank, 'NULL',\n",
    "        'Not Mapped', or matches neither keyword. Matching is case-insensitive.\n",
    "    \"\"\"\n",
    "    if pd.isnull(x):\n",
    "        return None\n",
    "    label = str(x).strip().lower()\n",
    "    # Blank strings and explicit placeholders count as missing.\n",
    "    if label in (\"\", \"null\", \"not mapped\"):\n",
    "        return None\n",
    "    # 'home' is tested first so that labels such as\n",
    "    # 'Discharged/transferred to home with home health service' are grouped as Home.\n",
    "    if \"home\" in label:\n",
    "        return \"Home\"\n",
    "    # Also covers 'discharged/transferred', which contains 'transferred'.\n",
    "    if \"transferred\" in label:\n",
    "        return \"Transferred\"\n",
    "    # Anything else is left ungrouped.\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "discharge_group\n",
      "Transferred    15370\n",
      "Home           58245\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Collapse each transformed discharge label into a coarse group, then show the group sizes.\n",
    "discharge_labels = df_transformed[\"discharge_disposition_id\"]\n",
    "df_transformed[\"discharge_group\"] = discharge_labels.apply(map_discharge_simple)\n",
    "print(df_transformed['discharge_group'].value_counts(dropna=False).sort_values())\n",
    "\n",
    "# One 0/1 indicator column per group. np.where returns a plain array, so the values are\n",
    "# assigned by row position rather than by index label.\n",
    "for discharge_group_name in ('Transferred', 'Home'):\n",
    "    df_DiabetesReadmission[discharge_group_name] = np.where(\n",
    "        df_transformed['discharge_group'] == discharge_group_name, 1, 0\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['gender', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',\n",
       "       'num_medications', 'number_outpatient', 'number_emergency',\n",
       "       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',\n",
       "       'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed', 'readmitted',\n",
       "       'race_Caucasian', 'race_AfricanAmerican', 'race_Hispanic', 'race_Asian',\n",
       "       'age>=70', 'max_glu_serum>200', 'max_glu_serum>300', 'A1Cresult>7',\n",
       "       'A1Cresult>8', 'metformin_Up', 'metformin_Down', 'metformin_Steady',\n",
       "       'insulin_Up', 'insulin_Down', 'insulin_Steady', 'Transferred', 'Home',\n",
       "       'Emergency_admission', 'Elective_admission', 'Urgent_admission',\n",
       "       'Emergency Room'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Indicator column name -> admission_type_id code it flags.\n",
    "admission_type_flags = {\n",
    "    'Emergency_admission': 1,\n",
    "    'Elective_admission': 3,\n",
    "    'Urgent_admission': 2,\n",
    "}\n",
    "for flag_name, type_code in admission_type_flags.items():\n",
    "    df_DiabetesReadmission[flag_name] = np.where(\n",
    "        df_DiabetesReadmission['admission_type_id'] == type_code, 1, 0\n",
    "    )\n",
    "\n",
    "# Admission source: flag code 7; every other source code is the 0 baseline.\n",
    "df_DiabetesReadmission['Emergency Room'] = np.where(\n",
    "    df_DiabetesReadmission['admission_source_id'] == 7, 1, 0\n",
    ")\n",
    "\n",
    "# Drop the coded id columns and list what is left.\n",
    "coded_id_columns = ['admission_type_id', 'admission_source_id', 'discharge_disposition_id']\n",
    "df_DiabetesReadmission = df_DiabetesReadmission.drop(columns=coded_id_columns)\n",
    "df_DiabetesReadmission.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "428 0.0598563701649925 Heart failure\n",
      "250 0.05867460569974092 Diabetes mellitus\n",
      "276 0.04618880959956365 Disorders of fluid, electrolyte, and acid-base balance\n",
      "401 0.04193445752465797 Essential hypertension\n",
      "414 0.04143902549884096 Other forms of chronic ischemic heart disease\n",
      "427 0.03868005999727285 Cardiac dysrhythmias\n",
      "599 0.022885323394391165 Other disorders of urethra and urinary tract\n",
      "403 0.019585473387573292 Hypertensive chronic kidney disease\n",
      "496 0.019535475660197265 Chronic airway obstruction, not elsewhere classified\n",
      "786 0.01752647606926958 Symptoms involving respiratory system and other chest symptoms\n",
      "486 0.017499204581609928 Pneumonia, organism unspecified\n",
      "780 0.016081087223308033 General symptoms\n",
      "491 0.01519021862642607 Chronic bronchitis\n",
      "682 0.014953865733375756 Other cellulitis and abscess\n",
      "250.02 0.014431162219899096 Diabetes mellitus type 2, uncontrolled Diabetes mellitus type 1 Diabetes mellitus type 2 without mention of complication\n",
      "585 0.014381164492523068 Chronic kidney disease (CKD)\n",
      "584 0.013658470069542293 Acute renal failure\n",
      "410 0.012253988455070225 Acute myocardial infarction\n",
      "707 0.01183128039634562 Chronic ulcer of skin\n",
      "285 0.010722239898186447 Other and unspecified anemias\n",
      "250.6 0.010576791964001636 Diabetes with neurological manifestations\n",
      "518 0.009926821508113267 Other diseases of lung\n",
      "411 0.009849552293077588 Other acute and subacute forms of ischemic heart disease\n",
      "996 0.009040498159174582 Complications peculiar to certain specified procedures\n",
      "493 0.008872323985273397 Asthma\n",
      "425 0.00884959774555702 Cardiomyopathy\n",
      "715 0.00810417708285987 Osteoarthrosis and allied disorders\n",
      "272 0.007976910140448162 Disorders of lipoid metabolism\n",
      "250.8 0.007467842370801327 Diabetes with other specified manifestations\n",
      "434 0.0074314803872551245 Occlusion of cerebral arteries\n",
      "424 0.007395118403708922 Other diseases of endocardium\n",
      "250.01 0.007199672742148084 Diabetes mellitus type 1 Diabetes mellitus type 2 without mention of complication\n",
      "38 0.007095132039452752 Septicemia\n",
      "V45 0.0061269942275351125 Other postprocedural states\n",
      "305 0.005517931003136221 Nondependent abuse of drugs\n",
      "998 0.005477023771646743 Other complications of procedures, NEC\n",
      "560 0.005340666333348484 Intestinal obstruction without mention of hernia\n",
      "577 0.00519976364710695 Diseases of pancreas\n",
      "574 0.005004317985546111 Cholelithiasis\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>diag_428</th>\n",
       "      <th>diag_250</th>\n",
       "      <th>diag_276</th>\n",
       "      <th>diag_401</th>\n",
       "      <th>diag_414</th>\n",
       "      <th>diag_427</th>\n",
       "      <th>diag_599</th>\n",
       "      <th>diag_403</th>\n",
       "      <th>diag_496</th>\n",
       "      <th>diag_786</th>\n",
       "      <th>...</th>\n",
       "      <th>diag_434</th>\n",
       "      <th>diag_424</th>\n",
       "      <th>diag_250.01</th>\n",
       "      <th>diag_38</th>\n",
       "      <th>diag_V45</th>\n",
       "      <th>diag_305</th>\n",
       "      <th>diag_998</th>\n",
       "      <th>diag_560</th>\n",
       "      <th>diag_577</th>\n",
       "      <th>diag_574</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73610</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73611</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73612</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73613</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73614</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>73615 rows × 39 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       diag_428  diag_250  diag_276  diag_401  diag_414  diag_427  diag_599  \\\n",
       "0             0         1         0         0         0         0         0   \n",
       "1             0         0         0         0         0         0         0   \n",
       "2             0         1         0         0         0         0         0   \n",
       "3             1         1         0         0         0         0         0   \n",
       "4             0         0         0         0         0         0         0   \n",
       "...         ...       ...       ...       ...       ...       ...       ...   \n",
       "73610         0         0         0         0         0         0         0   \n",
       "73611         0         0         0         0         0         0         0   \n",
       "73612         0         0         0         0         0         0         0   \n",
       "73613         0         0         0         0         0         0         0   \n",
       "73614         0         0         0         0         0         0         0   \n",
       "\n",
       "       diag_403  diag_496  diag_786  ...  diag_434  diag_424  diag_250.01  \\\n",
       "0             0         0         0  ...         0         0            0   \n",
       "1             1         0         0  ...         0         0            0   \n",
       "2             0         0         0  ...         0         0            0   \n",
       "3             0         0         0  ...         0         0            0   \n",
       "4             1         0         0  ...         0         0            0   \n",
       "...         ...       ...       ...  ...       ...       ...          ...   \n",
       "73610         0         0         0  ...         0         0            0   \n",
       "73611         0         0         0  ...         0         0            0   \n",
       "73612         0         0         0  ...         0         0            0   \n",
       "73613         0         0         0  ...         0         0            0   \n",
       "73614         0         0         0  ...         0         0            0   \n",
       "\n",
       "       diag_38  diag_V45  diag_305  diag_998  diag_560  diag_577  diag_574  \n",
       "0            0         0         0         0         0         0         0  \n",
       "1            0         0         0         0         0         0         0  \n",
       "2            0         0         0         0         0         0         0  \n",
       "3            0         0         0         0         0         0         0  \n",
       "4            0         0         0         0         0         0         0  \n",
       "...        ...       ...       ...       ...       ...       ...       ...  \n",
       "73610        0         0         0         0         0         0         0  \n",
       "73611        0         0         0         0         0         0         0  \n",
       "73612        1         0         0         0         0         0         0  \n",
       "73613        0         0         0         1         0         0         0  \n",
       "73614        0         0         0         0         0         0         0  \n",
       "\n",
       "[73615 rows x 39 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# -- 1) Relative frequency of each diagnosis code --\n",
    "\n",
    "# Stack diag_1..diag_3 into one long Series, discarding missing entries\n",
    "all_diags = pd.concat(\n",
    "    [df_DiabetesReadmission[col] for col in (\"diag_1\", \"diag_2\", \"diag_3\")],\n",
    "    axis=0,\n",
    ").dropna()\n",
    "\n",
    "# Share of all recorded diagnoses accounted for by each code\n",
    "diag_counts = all_diags.value_counts(normalize=True)\n",
    "\n",
    "# Retain only the codes whose share exceeds 0.005 (i.e. 0.5%)\n",
    "diag_keep = diag_counts.loc[diag_counts > 0.005].index\n",
    "\n",
    "# Show each retained code with its share and its entry in the ICD-9 mapping\n",
    "icd_mapping = get_icd9()\n",
    "for diag in diag_keep:\n",
    "    print(diag, diag_counts[diag], icd_mapping[diag])\n",
    "\n",
    "# -- 2) Multi-hot indicators: one 0/1 column per retained diagnosis code --\n",
    "# A row gets 1 when the code appears in any of diag_1, diag_2 or diag_3.\n",
    "df_encoded = pd.DataFrame(\n",
    "    {\n",
    "        f'diag_{diag}': (\n",
    "            df_DiabetesReadmission[['diag_1', 'diag_2', 'diag_3']]\n",
    "            .isin([diag])\n",
    "            .any(axis=1)\n",
    "            .astype(int)\n",
    "        )\n",
    "        for diag in diag_keep\n",
    "    },\n",
    "    index=df_DiabetesReadmission.index,\n",
    ")\n",
    "\n",
    "df_encoded\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gender</th>\n",
       "      <th>time_in_hospital</th>\n",
       "      <th>num_lab_procedures</th>\n",
       "      <th>num_procedures</th>\n",
       "      <th>num_medications</th>\n",
       "      <th>number_outpatient</th>\n",
       "      <th>number_emergency</th>\n",
       "      <th>number_inpatient</th>\n",
       "      <th>number_diagnoses</th>\n",
       "      <th>max_glu_serum</th>\n",
       "      <th>...</th>\n",
       "      <th>metformin_Steady</th>\n",
       "      <th>insulin_Up</th>\n",
       "      <th>insulin_Down</th>\n",
       "      <th>insulin_Steady</th>\n",
       "      <th>Transferred</th>\n",
       "      <th>Home</th>\n",
       "      <th>Emergency_admission</th>\n",
       "      <th>Elective_admission</th>\n",
       "      <th>Urgent_admission</th>\n",
       "      <th>Emergency Room</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "      <td>5</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>44</td>\n",
       "      <td>1</td>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>51</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>73</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>47</td>\n",
       "      <td>2</td>\n",
       "      <td>17</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73610</th>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>45</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73611</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>51</td>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73612</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>53</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>13</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73613</th>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>45</td>\n",
       "      <td>2</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73614</th>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>13</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>73615 rows × 35 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       gender  time_in_hospital  num_lab_procedures  num_procedures  \\\n",
       "0           0                 2                  11               5   \n",
       "1           1                 2                  44               1   \n",
       "2           1                 1                  51               0   \n",
       "3           1                 5                  73               0   \n",
       "4           0                 9                  47               2   \n",
       "...       ...               ...                 ...             ...   \n",
       "73610       0                 6                  45               1   \n",
       "73611       1                 3                  51               0   \n",
       "73612       1                 1                  53               0   \n",
       "73613       0                10                  45               2   \n",
       "73614       1                 6                  13               3   \n",
       "\n",
       "       num_medications  number_outpatient  number_emergency  number_inpatient  \\\n",
       "0                   13                  2                 0                 1   \n",
       "1                   16                  0                 0                 0   \n",
       "2                    8                  0                 0                 0   \n",
       "3                   12                  0                 0                 0   \n",
       "4                   17                  0                 0                 0   \n",
       "...                ...                ...               ...               ...   \n",
       "73610               25                  3                 1                 2   \n",
       "73611               16                  0                 0                 0   \n",
       "73612                9                  1                 0                 0   \n",
       "73613               21                  0                 0                 1   \n",
       "73614                3                  0                 0                 0   \n",
       "\n",
       "       number_diagnoses  max_glu_serum  ...  metformin_Steady  insulin_Up  \\\n",
       "0                     6            NaN  ...                 0           0   \n",
       "1                     7            NaN  ...                 0           1   \n",
       "2                     5            NaN  ...                 0           0   \n",
       "3                     8            NaN  ...                 0           0   \n",
       "4                     9            NaN  ...                 0           0   \n",
       "...                 ...            ...  ...               ...         ...   \n",
       "73610                 9            NaN  ...                 0           0   \n",
       "73611                 9            NaN  ...                 1           0   \n",
       "73612                13            NaN  ...                 1           0   \n",
       "73613                 9            NaN  ...                 0           1   \n",
       "73614                 9            NaN  ...                 0           0   \n",
       "\n",
       "       insulin_Down  insulin_Steady  Transferred  Home  Emergency_admission  \\\n",
       "0                 0               0            0     1                    1   \n",
       "1                 0               0            0     1                    1   \n",
       "2                 0               1            0     1                    1   \n",
       "3                 0               0            0     1                    1   \n",
       "4                 0               1            0     1                    1   \n",
       "...             ...             ...          ...   ...                  ...   \n",
       "73610             1               0            0     1                    1   \n",
       "73611             1               0            1     0                    1   \n",
       "73612             1               0            0     1                    1   \n",
       "73613             0               0            1     0                    0   \n",
       "73614             0               0            0     1                    1   \n",
       "\n",
       "       Elective_admission  Urgent_admission  Emergency Room  \n",
       "0                       0                 0               1  \n",
       "1                       0                 0               1  \n",
       "2                       0                 0               1  \n",
       "3                       0                 0               1  \n",
       "4                       0                 0               1  \n",
       "...                   ...               ...             ...  \n",
       "73610                   0                 0               1  \n",
       "73611                   0                 0               1  \n",
       "73612                   0                 0               1  \n",
       "73613                   0                 1               1  \n",
       "73614                   0                 0               1  \n",
       "\n",
       "[73615 rows x 35 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE: the multi-hot diagnosis columns in df_encoded are not merged in; the join is left disabled.\n",
    "# df_DiabetesReadmission = df_DiabetesReadmission.join(df_encoded)\n",
    "\n",
    "# Remove the raw diagnosis-code columns. errors='ignore' keeps this cell re-runnable:\n",
    "# without it a second execution raises KeyError because the columns are already gone.\n",
    "df_DiabetesReadmission = df_DiabetesReadmission.drop(\n",
    "    columns=['diag_1', 'diag_2', 'diag_3'], errors='ignore'\n",
    ")\n",
    "df_DiabetesReadmission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "gender                    33773.0\n",
       "time_in_hospital         318312.0\n",
       "num_lab_procedures      3194456.0\n",
       "num_procedures            96625.0\n",
       "num_medications         1180804.0\n",
       "number_outpatient         28075.0\n",
       "number_emergency          16005.0\n",
       "number_inpatient          48553.0\n",
       "number_diagnoses         558652.0\n",
       "max_glu_serum               306.0\n",
       "A1Cresult                 12493.0\n",
       "change                    35103.0\n",
       "diabetesMed               57173.0\n",
       "readmitted                35880.0\n",
       "race_Caucasian            56982.0\n",
       "race_AfricanAmerican      14693.0\n",
       "race_Hispanic              1419.0\n",
       "race_Asian                  521.0\n",
       "age>=70                   32918.0\n",
       "max_glu_serum>200           262.0\n",
       "max_glu_serum>300           255.0\n",
       "A1Cresult>7                8581.0\n",
       "A1Cresult>8                5694.0\n",
       "metformin_Up                771.0\n",
       "metformin_Down              425.0\n",
       "metformin_Steady          13994.0\n",
       "insulin_Up                 8354.0\n",
       "insulin_Down               9390.0\n",
       "insulin_Steady            22139.0\n",
       "Transferred               15370.0\n",
       "Home                      58245.0\n",
       "Emergency_admission       44186.0\n",
       "Elective_admission        15565.0\n",
       "Urgent_admission          13864.0\n",
       "Emergency Room            49213.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Column-wise totals, as a quick sanity check on counts and 0/1 indicator prevalence\n",
    "df_DiabetesReadmission.sum(axis=\"index\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['gender', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',\n",
       "       'num_medications', 'number_outpatient', 'number_emergency',\n",
       "       'number_inpatient', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',\n",
       "       'change', 'diabetesMed', 'race_Caucasian', 'race_AfricanAmerican',\n",
       "       'race_Hispanic', 'race_Asian', 'age>=70', 'max_glu_serum>200',\n",
       "       'max_glu_serum>300', 'A1Cresult>7', 'A1Cresult>8', 'metformin_Up',\n",
       "       'metformin_Down', 'metformin_Steady', 'insulin_Up', 'insulin_Down',\n",
       "       'insulin_Steady', 'Transferred', 'Home', 'Emergency_admission',\n",
       "       'Elective_admission', 'Urgent_admission', 'Emergency Room'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Every column name except 'readmitted', in the frame's current order\n",
    "df_DiabetesReadmission.columns.drop(['readmitted'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['gender', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',\n",
       "       'num_medications', 'number_outpatient', 'number_emergency',\n",
       "       'number_inpatient', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',\n",
       "       'change', 'diabetesMed', 'race_Caucasian', 'race_AfricanAmerican',\n",
       "       'race_Hispanic', 'race_Asian', 'age>=70', 'max_glu_serum>200',\n",
       "       'max_glu_serum>300', 'A1Cresult>7', 'A1Cresult>8', 'metformin_Up',\n",
       "       'metformin_Down', 'metformin_Steady', 'insulin_Up', 'insulin_Down',\n",
       "       'insulin_Steady', 'Transferred', 'Home', 'Emergency_admission',\n",
       "       'Elective_admission', 'Urgent_admission', 'Emergency Room',\n",
       "       'readmitted'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cast the 'readmitted' column to plain int.\n",
    "df_DiabetesReadmission['readmitted'] = df_DiabetesReadmission['readmitted'].astype(int)\n",
    "\n",
    "# Move 'readmitted' to the last position while keeping every other column in its\n",
    "# current order. The order is derived from the frame itself rather than from a\n",
    "# pasted list of names, so the cell cannot silently drop a column or raise KeyError\n",
    "# when the upstream feature set changes, and it stays safe to re-run.\n",
    "df_DiabetesReadmission = df_DiabetesReadmission[\n",
    "    [*df_DiabetesReadmission.columns.drop('readmitted'), 'readmitted']\n",
    "]\n",
    "df_DiabetesReadmission.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "24402 49213\n"
     ]
    }
   ],
   "source": [
    "# Split the rows on the 0/1 'Emergency Room' indicator: df_out0 holds the 0 rows, df_out1 the 1 rows\n",
    "df_out0, df_out1 = (\n",
    "    df_DiabetesReadmission[df_DiabetesReadmission['Emergency Room'] == flag]\n",
    "    for flag in (0, 1)\n",
    ")\n",
    "# Report the size of each partition\n",
    "print(*map(len, (df_out0, df_out1)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gender</th>\n",
       "      <th>time_in_hospital</th>\n",
       "      <th>num_lab_procedures</th>\n",
       "      <th>num_procedures</th>\n",
       "      <th>num_medications</th>\n",
       "      <th>number_outpatient</th>\n",
       "      <th>number_emergency</th>\n",
       "      <th>number_inpatient</th>\n",
       "      <th>number_diagnoses</th>\n",
       "      <th>max_glu_serum</th>\n",
       "      <th>...</th>\n",
       "      <th>insulin_Up</th>\n",
       "      <th>insulin_Down</th>\n",
       "      <th>insulin_Steady</th>\n",
       "      <th>Transferred</th>\n",
       "      <th>Home</th>\n",
       "      <th>Emergency_admission</th>\n",
       "      <th>Elective_admission</th>\n",
       "      <th>Urgent_admission</th>\n",
       "      <th>Emergency Room</th>\n",
       "      <th>readmitted</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>0</td>\n",
       "      <td>11</td>\n",
       "      <td>42</td>\n",
       "      <td>2</td>\n",
       "      <td>19</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>25</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>0</td>\n",
       "      <td>13</td>\n",
       "      <td>48</td>\n",
       "      <td>2</td>\n",
       "      <td>18</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>28</td>\n",
       "      <td>3</td>\n",
       "      <td>28</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>57</td>\n",
       "      <td>1</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73596</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73599</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>57</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73603</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>27</td>\n",
       "      <td>1</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73604</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>31</td>\n",
       "      <td>2</td>\n",
       "      <td>24</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73605</th>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>77</td>\n",
       "      <td>6</td>\n",
       "      <td>65</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>24402 rows × 35 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       gender  time_in_hospital  num_lab_procedures  num_procedures  \\\n",
       "11          0                11                  42               2   \n",
       "13          0                 2                  25               2   \n",
       "22          0                13                  48               2   \n",
       "32          0                14                  28               3   \n",
       "36          0                 3                  57               1   \n",
       "...       ...               ...                 ...             ...   \n",
       "73596       1                 1                   1               5   \n",
       "73599       1                 3                  57               0   \n",
       "73603       0                 3                  27               1   \n",
       "73604       0                 3                  31               2   \n",
       "73605       1                13                  77               6   \n",
       "\n",
       "       num_medications  number_outpatient  number_emergency  number_inpatient  \\\n",
       "11                  19                  0                 0                 0   \n",
       "13                  11                  0                 0                 0   \n",
       "22                  18                  0                 0                 1   \n",
       "32                  28                  0                 0                 0   \n",
       "36                  21                  0                 0                 0   \n",
       "...                ...                ...               ...               ...   \n",
       "73596                8                  0                 0                 0   \n",
       "73599                7                  0                 1                 0   \n",
       "73603               29                  0                 1                 0   \n",
       "73604               24                  0                 0                 0   \n",
       "73605               65                  0                 0                 0   \n",
       "\n",
       "       number_diagnoses  max_glu_serum  ...  insulin_Up  insulin_Down  \\\n",
       "11                    8            NaN  ...           0             0   \n",
       "13                    3            NaN  ...           0             0   \n",
       "22                    8            NaN  ...           0             0   \n",
       "32                    8            NaN  ...           0             1   \n",
       "36                    6            NaN  ...           1             0   \n",
       "...                 ...            ...  ...         ...           ...   \n",
       "73596                 4            NaN  ...           0             0   \n",
       "73599                 3            NaN  ...           0             0   \n",
       "73603                 9            NaN  ...           0             0   \n",
       "73604                 9            NaN  ...           0             1   \n",
       "73605                16            NaN  ...           1             0   \n",
       "\n",
       "       insulin_Steady  Transferred  Home  Emergency_admission  \\\n",
       "11                  0            0     1                    0   \n",
       "13                  1            0     1                    1   \n",
       "22                  1            0     1                    0   \n",
       "32                  0            0     1                    0   \n",
       "36                  0            1     0                    0   \n",
       "...               ...          ...   ...                  ...   \n",
       "73596               1            0     1                    0   \n",
       "73599               0            0     1                    0   \n",
       "73603               1            0     1                    0   \n",
       "73604               0            0     1                    0   \n",
       "73605               0            0     1                    0   \n",
       "\n",
       "       Elective_admission  Urgent_admission  Emergency Room  readmitted  \n",
       "11                      0                 1               0           1  \n",
       "13                      0                 0               0           1  \n",
       "22                      0                 1               0           1  \n",
       "32                      1                 0               0           0  \n",
       "36                      1                 0               0           1  \n",
       "...                   ...               ...             ...         ...  \n",
       "73596                   1                 0               0           0  \n",
       "73599                   0                 1               0           0  \n",
       "73603                   1                 0               0           0  \n",
       "73604                   1                 0               0           1  \n",
       "73605                   1                 0               0           0  \n",
       "\n",
       "[24402 rows x 35 columns]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_out0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gender</th>\n",
       "      <th>time_in_hospital</th>\n",
       "      <th>num_lab_procedures</th>\n",
       "      <th>num_procedures</th>\n",
       "      <th>num_medications</th>\n",
       "      <th>number_outpatient</th>\n",
       "      <th>number_emergency</th>\n",
       "      <th>number_inpatient</th>\n",
       "      <th>number_diagnoses</th>\n",
       "      <th>max_glu_serum</th>\n",
       "      <th>...</th>\n",
       "      <th>insulin_Up</th>\n",
       "      <th>insulin_Down</th>\n",
       "      <th>insulin_Steady</th>\n",
       "      <th>Transferred</th>\n",
       "      <th>Home</th>\n",
       "      <th>Emergency_admission</th>\n",
       "      <th>Elective_admission</th>\n",
       "      <th>Urgent_admission</th>\n",
       "      <th>Emergency Room</th>\n",
       "      <th>readmitted</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "      <td>5</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>44</td>\n",
       "      <td>1</td>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>51</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>73</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>47</td>\n",
       "      <td>2</td>\n",
       "      <td>17</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73610</th>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>45</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73611</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>51</td>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73612</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>53</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>13</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73613</th>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>45</td>\n",
       "      <td>2</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73614</th>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>13</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>49213 rows × 35 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       gender  time_in_hospital  num_lab_procedures  num_procedures  \\\n",
       "0           0                 2                  11               5   \n",
       "1           1                 2                  44               1   \n",
       "2           1                 1                  51               0   \n",
       "3           1                 5                  73               0   \n",
       "4           0                 9                  47               2   \n",
       "...       ...               ...                 ...             ...   \n",
       "73610       0                 6                  45               1   \n",
       "73611       1                 3                  51               0   \n",
       "73612       1                 1                  53               0   \n",
       "73613       0                10                  45               2   \n",
       "73614       1                 6                  13               3   \n",
       "\n",
       "       num_medications  number_outpatient  number_emergency  number_inpatient  \\\n",
       "0                   13                  2                 0                 1   \n",
       "1                   16                  0                 0                 0   \n",
       "2                    8                  0                 0                 0   \n",
       "3                   12                  0                 0                 0   \n",
       "4                   17                  0                 0                 0   \n",
       "...                ...                ...               ...               ...   \n",
       "73610               25                  3                 1                 2   \n",
       "73611               16                  0                 0                 0   \n",
       "73612                9                  1                 0                 0   \n",
       "73613               21                  0                 0                 1   \n",
       "73614                3                  0                 0                 0   \n",
       "\n",
       "       number_diagnoses  max_glu_serum  ...  insulin_Up  insulin_Down  \\\n",
       "0                     6            NaN  ...           0             0   \n",
       "1                     7            NaN  ...           1             0   \n",
       "2                     5            NaN  ...           0             0   \n",
       "3                     8            NaN  ...           0             0   \n",
       "4                     9            NaN  ...           0             0   \n",
       "...                 ...            ...  ...         ...           ...   \n",
       "73610                 9            NaN  ...           0             1   \n",
       "73611                 9            NaN  ...           0             1   \n",
       "73612                13            NaN  ...           0             1   \n",
       "73613                 9            NaN  ...           1             0   \n",
       "73614                 9            NaN  ...           0             0   \n",
       "\n",
       "       insulin_Steady  Transferred  Home  Emergency_admission  \\\n",
       "0                   0            0     1                    1   \n",
       "1                   0            0     1                    1   \n",
       "2                   1            0     1                    1   \n",
       "3                   0            0     1                    1   \n",
       "4                   1            0     1                    1   \n",
       "...               ...          ...   ...                  ...   \n",
       "73610               0            0     1                    1   \n",
       "73611               0            1     0                    1   \n",
       "73612               0            0     1                    1   \n",
       "73613               0            1     0                    0   \n",
       "73614               0            0     1                    1   \n",
       "\n",
       "       Elective_admission  Urgent_admission  Emergency Room  readmitted  \n",
       "0                       0                 0               1           0  \n",
       "1                       0                 0               1           0  \n",
       "2                       0                 0               1           0  \n",
       "3                       0                 0               1           1  \n",
       "4                       0                 0               1           1  \n",
       "...                   ...               ...             ...         ...  \n",
       "73610                   0                 0               1           1  \n",
       "73611                   0                 0               1           1  \n",
       "73612                   0                 0               1           0  \n",
       "73613                   0                 1               1           0  \n",
       "73614                   0                 0               1           0  \n",
       "\n",
       "[49213 rows x 35 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_out1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_out0.drop(columns=['Emergency Room']).to_csv('target_DiabetesReadmission.csv', index=False)\n",
    "# df_out1.drop(columns=['Emergency Room']).to_csv('source_DiabetesReadmission.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "struct",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
