{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9ca12137-cf97-4039-a4c1-70ea59203379",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cloning into 'clinical-outcome-prediction'...\n",
      "remote: Enumerating objects: 117, done.\u001b[K\n",
      "remote: Counting objects: 100% (117/117), done.\u001b[K\n",
      "remote: Compressing objects: 100% (97/97), done.\u001b[K\n",
      "remote: Total 117 (delta 31), reused 88 (delta 10), pack-reused 0 (from 0)\u001b[K\n",
      "Receiving objects: 100% (117/117), 199.93 KiB | 2.10 MiB/s, done.\n",
      "Resolving deltas: 100% (31/31), done.\n"
     ]
    }
   ],
   "source": [
    "!git clone https://github.com/bvanaken/clinical-outcome-prediction.git"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "911dc186-78a7-4fb0-a39e-801f2267fcc8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/fs01/home/dorsasm/clinical-outcome-prediction\n"
     ]
    }
   ],
   "source": [
    "cd clinical-outcome-prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc7eaa14-0bda-450a-9c16-412c695d0ac2",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "cc305135-d85f-496e-a9da-01b19333cc1d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting numpy (from -r tasks/requirements.txt (line 1))\n",
      "  Downloading numpy-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)\n",
      "Collecting pandas (from -r tasks/requirements.txt (line 2))\n",
      "  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)\n",
      "Collecting nltk (from -r tasks/requirements.txt (line 3))\n",
      "  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)\n",
      "Collecting python-dateutil>=2.8.2 (from pandas->-r tasks/requirements.txt (line 2))\n",
      "  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)\n",
      "Collecting pytz>=2020.1 (from pandas->-r tasks/requirements.txt (line 2))\n",
      "  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)\n",
      "Collecting tzdata>=2022.7 (from pandas->-r tasks/requirements.txt (line 2))\n",
      "  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)\n",
      "Collecting click (from nltk->-r tasks/requirements.txt (line 3))\n",
      "  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)\n",
      "Collecting joblib (from nltk->-r tasks/requirements.txt (line 3))\n",
      "  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)\n",
      "Collecting regex>=2021.8.3 (from nltk->-r tasks/requirements.txt (line 3))\n",
      "  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n",
      "Collecting tqdm (from nltk->-r tasks/requirements.txt (line 3))\n",
      "  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)\n",
      "Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas->-r tasks/requirements.txt (line 2))\n",
      "  Downloading six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)\n",
      "Downloading numpy-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.4/16.4 MB\u001b[0m \u001b[31m139.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m181.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading nltk-3.9.1-py3-none-any.whl (1.5 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m94.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)\n",
      "Downloading pytz-2024.2-py2.py3-none-any.whl (508 kB)\n",
      "Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (781 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.7/781.7 kB\u001b[0m \u001b[31m48.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading tzdata-2024.2-py2.py3-none-any.whl (346 kB)\n",
      "Downloading click-8.1.7-py3-none-any.whl (97 kB)\n",
      "Downloading joblib-1.4.2-py3-none-any.whl (301 kB)\n",
      "Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)\n",
      "Downloading six-1.17.0-py2.py3-none-any.whl (11 kB)\n",
      "Installing collected packages: pytz, tzdata, tqdm, six, regex, numpy, joblib, click, python-dateutil, nltk, pandas\n",
      "Successfully installed click-8.1.7 joblib-1.4.2 nltk-3.9.1 numpy-2.2.0 pandas-2.2.3 python-dateutil-2.9.0.post0 pytz-2024.2 regex-2024.11.6 six-1.17.0 tqdm-4.67.1 tzdata-2024.2\n"
     ]
    }
   ],
   "source": [
    "!pip install -r tasks/requirements.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1c1a9695-1807-42dc-bd3a-6956410762b3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/fs01/home/dorsasm\n"
     ]
    }
   ],
   "source": [
    "cd /h/dorsasm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "92e37018-4d16-41b7-8114-3a0b0bd27446",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2594/343354512.py:7: DtypeWarning: Columns (4,5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(input_file, compression='gzip')\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File saved successfully at /scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/NOTEEVENTS.csv!\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Input and output file paths\n",
    "input_file = \"/projects/privacyllm/mimiciii_data/physionet.org/files/mimiciii/1.4/NOTEEVENTS.csv.gz\"\n",
    "output_file = \"/scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/NOTEEVENTS.csv\"  # Change to your desired path\n",
    "def save_csv(input_file,output_file):\n",
    "    df = pd.read_csv(input_file, compression='gzip')\n",
    "    df.to_csv(output_file, index=False)\n",
    "\n",
    "    print(f\"File saved successfully at {output_file}!\")\n",
    "\n",
    "save_csv(\"/projects/privacyllm/mimiciii_data/physionet.org/files/mimiciii/1.4/NOTEEVENTS.csv.gz\",\"/scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/NOTEEVENTS.csv\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "d8b48560-5e6c-4da3-a488-1df2a4babd0b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File saved successfully at /scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/ADMISSIONS.csv!\n",
      "File saved successfully at /scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/DIAGNOSES_ICD.csv!\n",
      "File saved successfully at /scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/PROCEDURES_ICD.csv!\n"
     ]
    }
   ],
   "source": [
    "save_csv(\"/projects/privacyllm/mimiciii_data/physionet.org/files/mimiciii/1.4/ADMISSIONS.csv.gz\",\"/scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/ADMISSIONS.csv\")\n",
    "save_csv(\"/projects/privacyllm/mimiciii_data/physionet.org/files/mimiciii/1.4/DIAGNOSES_ICD.csv.gz\",\"/scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/DIAGNOSES_ICD.csv\")\n",
    "save_csv(\"/projects/privacyllm/mimiciii_data/physionet.org/files/mimiciii/1.4/PROCEDURES_ICD.csv.gz\",\"/scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/PROCEDURES_ICD.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "ab30277e-fa93-43be-a24e-ea81dfe774ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "patients = pd.read_csv('/projects/privacyllm/mimiciii_data/physionet.org/files/mimiciii/1.4/PATIENTS.csv.gz')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c17fa0d5-027b-453a-9bc5-c60cfdd48f9a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "cad181c4-2991-4abc-8fac-83ee9d97bf50",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2594/258791858.py:23: DtypeWarning: Columns (4,5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  mimic_notes = pd.read_csv(\"/scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/NOTEEVENTS.csv\")\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "#sys.path.append('.')\n",
    "from tasks import mimic_utils\n",
    "import pandas as pd\n",
    "\n",
    "#def los_mimic(mimic_dir: str, save_dir: str, seed: int, admission_only: bool):\n",
    "\"\"\"\n",
    "Extracts information needed for the task from the MIMIC dataset. Namely \"TEXT\" column from NOTEEVENTS.csv and\n",
    "\"ADMITTIME\" and \"DISCHTIME\" from ADMISSIONS.csv.\n",
    "Creates 70/10/20 split over patients for train/val/test sets.\n",
    "\"\"\"\n",
    "admission_only=False\n",
    "# set task name\n",
    "task_name = \"LOS_WEEKS\"\n",
    "if admission_only:\n",
    "    task_name = f\"{task_name}_adm\"    \n",
    "\n",
    "mimic_dir='/scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/'\n",
    "\n",
    "# load dataframes\n",
    "mimic_notes = pd.read_csv(\"/scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/NOTEEVENTS.csv\")\n",
    "mimic_admissions = pd.read_csv(\"/scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/ADMISSIONS.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "cbb75193-db77-41fd-bd59-c31d81f708df",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ROW_ID</th>\n",
       "      <th>SUBJECT_ID</th>\n",
       "      <th>HADM_ID</th>\n",
       "      <th>ADMITTIME</th>\n",
       "      <th>DISCHTIME</th>\n",
       "      <th>DEATHTIME</th>\n",
       "      <th>ADMISSION_TYPE</th>\n",
       "      <th>ADMISSION_LOCATION</th>\n",
       "      <th>DISCHARGE_LOCATION</th>\n",
       "      <th>INSURANCE</th>\n",
       "      <th>LANGUAGE</th>\n",
       "      <th>RELIGION</th>\n",
       "      <th>MARITAL_STATUS</th>\n",
       "      <th>ETHNICITY</th>\n",
       "      <th>EDREGTIME</th>\n",
       "      <th>EDOUTTIME</th>\n",
       "      <th>DIAGNOSIS</th>\n",
       "      <th>HOSPITAL_EXPIRE_FLAG</th>\n",
       "      <th>HAS_CHARTEVENTS_DATA</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>21</td>\n",
       "      <td>22</td>\n",
       "      <td>165315</td>\n",
       "      <td>2196-04-09 12:26:00</td>\n",
       "      <td>2196-04-10 15:54:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>EMERGENCY</td>\n",
       "      <td>EMERGENCY ROOM ADMIT</td>\n",
       "      <td>DISC-TRAN CANCER/CHLDRN H</td>\n",
       "      <td>Private</td>\n",
       "      <td>NaN</td>\n",
       "      <td>UNOBTAINABLE</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>2196-04-09 10:06:00</td>\n",
       "      <td>2196-04-09 13:24:00</td>\n",
       "      <td>BENZODIAZEPINE OVERDOSE</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>22</td>\n",
       "      <td>23</td>\n",
       "      <td>152223</td>\n",
       "      <td>2153-09-03 07:15:00</td>\n",
       "      <td>2153-09-08 19:10:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ELECTIVE</td>\n",
       "      <td>PHYS REFERRAL/NORMAL DELI</td>\n",
       "      <td>HOME HEALTH CARE</td>\n",
       "      <td>Medicare</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>23</td>\n",
       "      <td>23</td>\n",
       "      <td>124321</td>\n",
       "      <td>2157-10-18 19:34:00</td>\n",
       "      <td>2157-10-25 14:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>EMERGENCY</td>\n",
       "      <td>TRANSFER FROM HOSP/EXTRAM</td>\n",
       "      <td>HOME HEALTH CARE</td>\n",
       "      <td>Medicare</td>\n",
       "      <td>ENGL</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BRAIN MASS</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>24</td>\n",
       "      <td>24</td>\n",
       "      <td>161859</td>\n",
       "      <td>2139-06-06 16:14:00</td>\n",
       "      <td>2139-06-09 12:48:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>EMERGENCY</td>\n",
       "      <td>TRANSFER FROM HOSP/EXTRAM</td>\n",
       "      <td>HOME</td>\n",
       "      <td>Private</td>\n",
       "      <td>NaN</td>\n",
       "      <td>PROTESTANT QUAKER</td>\n",
       "      <td>SINGLE</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>INTERIOR MYOCARDIAL INFARCTION</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>25</td>\n",
       "      <td>25</td>\n",
       "      <td>129635</td>\n",
       "      <td>2160-11-02 02:06:00</td>\n",
       "      <td>2160-11-05 14:55:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>EMERGENCY</td>\n",
       "      <td>EMERGENCY ROOM ADMIT</td>\n",
       "      <td>HOME</td>\n",
       "      <td>Private</td>\n",
       "      <td>NaN</td>\n",
       "      <td>UNOBTAINABLE</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>2160-11-02 01:01:00</td>\n",
       "      <td>2160-11-02 04:27:00</td>\n",
       "      <td>ACUTE CORONARY SYNDROME</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ROW_ID  SUBJECT_ID  HADM_ID            ADMITTIME            DISCHTIME  \\\n",
       "0      21          22   165315  2196-04-09 12:26:00  2196-04-10 15:54:00   \n",
       "1      22          23   152223  2153-09-03 07:15:00  2153-09-08 19:10:00   \n",
       "2      23          23   124321  2157-10-18 19:34:00  2157-10-25 14:00:00   \n",
       "3      24          24   161859  2139-06-06 16:14:00  2139-06-09 12:48:00   \n",
       "4      25          25   129635  2160-11-02 02:06:00  2160-11-05 14:55:00   \n",
       "\n",
       "  DEATHTIME ADMISSION_TYPE         ADMISSION_LOCATION  \\\n",
       "0       NaN      EMERGENCY       EMERGENCY ROOM ADMIT   \n",
       "1       NaN       ELECTIVE  PHYS REFERRAL/NORMAL DELI   \n",
       "2       NaN      EMERGENCY  TRANSFER FROM HOSP/EXTRAM   \n",
       "3       NaN      EMERGENCY  TRANSFER FROM HOSP/EXTRAM   \n",
       "4       NaN      EMERGENCY       EMERGENCY ROOM ADMIT   \n",
       "\n",
       "          DISCHARGE_LOCATION INSURANCE LANGUAGE           RELIGION  \\\n",
       "0  DISC-TRAN CANCER/CHLDRN H   Private      NaN       UNOBTAINABLE   \n",
       "1           HOME HEALTH CARE  Medicare      NaN           CATHOLIC   \n",
       "2           HOME HEALTH CARE  Medicare     ENGL           CATHOLIC   \n",
       "3                       HOME   Private      NaN  PROTESTANT QUAKER   \n",
       "4                       HOME   Private      NaN       UNOBTAINABLE   \n",
       "\n",
       "  MARITAL_STATUS ETHNICITY            EDREGTIME            EDOUTTIME  \\\n",
       "0        MARRIED     WHITE  2196-04-09 10:06:00  2196-04-09 13:24:00   \n",
       "1        MARRIED     WHITE                  NaN                  NaN   \n",
       "2        MARRIED     WHITE                  NaN                  NaN   \n",
       "3         SINGLE     WHITE                  NaN                  NaN   \n",
       "4        MARRIED     WHITE  2160-11-02 01:01:00  2160-11-02 04:27:00   \n",
       "\n",
       "                                           DIAGNOSIS  HOSPITAL_EXPIRE_FLAG  \\\n",
       "0                            BENZODIAZEPINE OVERDOSE                     0   \n",
       "1  CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS...                     0   \n",
       "2                                         BRAIN MASS                     0   \n",
       "3                     INTERIOR MYOCARDIAL INFARCTION                     0   \n",
       "4                            ACUTE CORONARY SYNDROME                     0   \n",
       "\n",
       "   HAS_CHARTEVENTS_DATA  \n",
       "0                     1  \n",
       "1                     1  \n",
       "2                     1  \n",
       "3                     1  \n",
       "4                     1  "
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mimic_admissions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "4de09b85-0eea-400f-a5e1-4e49060554e2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2083180"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mimic_notes.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "683f1aef-2212-4254-8a25-3f1518d58512",
   "metadata": {},
   "outputs": [],
   "source": [
    "import argparse\n",
    "\n",
    "import pandas as pd\n",
    "import os\n",
    "import csv\n",
    "\n",
    "def filter_notes(notes_df: pd.DataFrame, admissions_df: pd.DataFrame, admission_text_only=False) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Keep only Discharge Summaries and filter out Newborn admissions. Replace duplicates and join reports with\n",
    "    their addendums. If admission_text_only is True, filter all sections that are not known at admission time.\n",
    "    \"\"\"\n",
    "    # filter out newborns\n",
    "    adm_grownups = admissions_df[admissions_df.ADMISSION_TYPE != \"NEWBORN\"]\n",
    "    notes_df = notes_df[notes_df.HADM_ID.isin(adm_grownups.HADM_ID)]\n",
    "    print('1',notes_df.shape[0])\n",
    "    # remove notes with no TEXT or HADM_ID\n",
    "    notes_df = notes_df.dropna(subset=[\"TEXT\", \"HADM_ID\"])\n",
    "    print('2',notes_df.shape[0])\n",
    "    \n",
    "    # filter discharge summaries\n",
    "    notes_df = notes_df[notes_df.CATEGORY == \"Discharge summary\"]\n",
    "    print('3',notes_df.shape[0])\n",
    "    \n",
    "    # remove duplicates and keep the later ones\n",
    "    notes_df = notes_df.sort_values(by=[\"CHARTDATE\"])\n",
    "    notes_df = notes_df.drop_duplicates(subset=[\"TEXT\"], keep=\"last\")\n",
    "    print('4',notes_df.shape[0])\n",
    "    \n",
    "    # combine text of same admissions (those are usually addendums)\n",
    "    combined_adm_texts = notes_df.groupby('HADM_ID')['TEXT'].apply(lambda x: '\\n\\n'.join(x)).reset_index()\n",
    "    notes_df = notes_df[notes_df.DESCRIPTION == \"Report\"]\n",
    "    print('6',notes_df.shape[0])\n",
    "    \n",
    "    notes_df = notes_df[[\"HADM_ID\", \"ROW_ID\", \"SUBJECT_ID\", \"CHARTDATE\"]]\n",
    "    notes_df = notes_df.drop_duplicates(subset=[\"HADM_ID\"], keep=\"last\")\n",
    "    print('7',notes_df.shape[0])\n",
    "    \n",
    "    notes_df = pd.merge(combined_adm_texts, notes_df, on=\"HADM_ID\", how=\"inner\")\n",
    "    print('8',notes_df.shape[0])\n",
    "    \n",
    "    # strip texts from leading and trailing and white spaces\n",
    "    notes_df[\"TEXT\"] = notes_df[\"TEXT\"].str.strip()\n",
    "\n",
    "    # remove entries without admission id, subject id or text\n",
    "    notes_df = notes_df.dropna(subset=[\"HADM_ID\", \"SUBJECT_ID\", \"TEXT\"])\n",
    "    print('9',notes_df.shape[0])\n",
    "    \n",
    "    if admission_text_only:\n",
    "        print('admission in')\n",
    "        # reduce text to admission-only text\n",
    "        notes_df = filter_admission_text(notes_df)\n",
    "        print('admission out')\n",
    "        print('10',notes_df.shape[0])\n",
    "\n",
    "    return notes_df\n",
    "\n",
    "\n",
    "def filter_admission_text(notes_df) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Filter text information by section and only keep sections that are known on admission time.\n",
    "    \"\"\"\n",
    "    admission_sections = {\n",
    "        \"CHIEF_COMPLAINT\": \"chief complaint:\",\n",
    "        \"PRESENT_ILLNESS\": \"present illness:\",\n",
    "        \"MEDICAL_HISTORY\": \"medical history:\",\n",
    "        \"MEDICATION_ADM\": \"medications on admission:\",\n",
    "        \"ALLERGIES\": \"allergies:\",\n",
    "        \"PHYSICAL_EXAM\": \"physical exam:\",\n",
    "        \"FAMILY_HISTORY\": \"family history:\",\n",
    "        \"SOCIAL_HISTORY\": \"social history:\"\n",
    "    }\n",
    "\n",
    "    # replace linebreak indicators\n",
    "    notes_df['TEXT'] = notes_df['TEXT'].str.replace(r\"\\n\", r\"\\\\n\")\n",
    "    print('1',notes_df.shape[0])\n",
    "\n",
    "    # extract each section by regex\n",
    "    for key in admission_sections.keys():\n",
    "        section = admission_sections[key]\n",
    "        notes_df[key] = notes_df.TEXT.str.extract(r'(?i){}(.+?)\\\\n\\\\n[^(\\\\|\\d|\\.)]+?:'\n",
    "                                                  .format(section))\n",
    "\n",
    "        notes_df[key] = notes_df[key].str.replace(r'\\\\n', r' ')\n",
    "        notes_df[key] = notes_df[key].str.strip()\n",
    "        notes_df[key] = notes_df[key].fillna(\"\")\n",
    "        notes_df[notes_df[key].str.startswith(\"[]\")][key] = \"\"\n",
    "\n",
    "    # filter notes with missing main information\n",
    "    notes_df = notes_df[(notes_df.CHIEF_COMPLAINT != \"\") | (notes_df.PRESENT_ILLNESS != \"\") |\n",
    "                        (notes_df.MEDICAL_HISTORY != \"\")]\n",
    "    print('2',notes_df.shape[0])\n",
    "    \n",
    "\n",
    "    # add section headers and combine into TEXT_ADMISSION\n",
    "    notes_df = notes_df.assign(TEXT=\"CHIEF COMPLAINT: \" + notes_df.CHIEF_COMPLAINT.astype(str)\n",
    "                                    + '\\n\\n' +\n",
    "                                    \"PRESENT ILLNESS: \" + notes_df.PRESENT_ILLNESS.astype(str)\n",
    "                                    + '\\n\\n' +\n",
    "                                    \"MEDICAL HISTORY: \" + notes_df.MEDICAL_HISTORY.astype(str)\n",
    "                                    + '\\n\\n' +\n",
    "                                    \"MEDICATION ON ADMISSION: \" + notes_df.MEDICATION_ADM.astype(str)\n",
    "                                    + '\\n\\n' +\n",
    "                                    \"ALLERGIES: \" + notes_df.ALLERGIES.astype(str)\n",
    "                                    + '\\n\\n' +\n",
    "                                    \"PHYSICAL EXAM: \" + notes_df.PHYSICAL_EXAM.astype(str)\n",
    "                                    + '\\n\\n' +\n",
    "                                    \"FAMILY HISTORY: \" + notes_df.FAMILY_HISTORY.astype(str)\n",
    "                                    + '\\n\\n' +\n",
    "                                    \"SOCIAL HISTORY: \" + notes_df.SOCIAL_HISTORY.astype(str))\n",
    "    print('3',notes_df.shape[0])\n",
    "\n",
    "    return notes_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "90cfd06c-6b33-4cef-809b-1b76754bff14",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 1443362\n",
      "2 1443362\n",
      "3 55598\n",
      "4 55520\n",
      "6 51106\n",
      "7 49048\n",
      "8 49048\n",
      "9 49048\n",
      "49048\n"
     ]
    }
   ],
   "source": [
    "# filter notes\n",
    "mimic_notes = filter_notes(mimic_notes, mimic_admissions, admission_text_only=admission_only)\n",
    "print(mimic_notes.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "a7305933-c9b8-4afd-8d25-fa4e38fdf094",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing rows: 100%|██████████| 49048/49048 [00:12<00:00, 3797.49it/s]\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "from tqdm import tqdm\n",
    "def find_home_labels(df_column):\n",
    "    \"\"\"\n",
    "    Find all unique labels containing the word 'discharge' (case-insensitive).\n",
    "    \"\"\"\n",
    "    # Initialize an empty set to store unique labels\n",
    "    home_labels = set()\n",
    "    \n",
    "    # Iterate through each row in the column\n",
    "    for text in tqdm(df_column, desc=\"Processing rows\"):\n",
    "        # Find all headers containing the word 'discharge' (case-insensitive)\n",
    "        labels = re.findall(r'^[^\\n:]*home[^\\n:]*:\\s*$', text, flags=re.IGNORECASE | re.MULTILINE)\n",
    "        home_labels.update(labels)  # Add to the set to ensure uniqueness\n",
    "    \n",
    "    return home_labels\n",
    "\n",
    "\n",
    "# Find all labels containing 'discharge'\n",
    "home_labels = find_home_labels(mimic_notes['TEXT'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "b4dfa4ad-703d-4e5c-bc44-b12cb9e00153",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing rows: 100%|██████████| 49048/49048 [00:13<00:00, 3725.76it/s]\n"
     ]
    }
   ],
   "source": [
    "def find_discharge_labels(df_column):\n",
    "    \"\"\"\n",
    "    Find all unique labels containing the word 'discharge' (case-insensitive).\n",
    "    \"\"\"\n",
    "    # Initialize an empty set to store unique labels\n",
    "    discharge_labels = set()\n",
    "    \n",
    "    # Iterate through each row in the column\n",
    "    for text in tqdm(df_column, desc=\"Processing rows\"):\n",
    "        # Find all headers containing the word 'discharge' (case-insensitive)\n",
    "        labels = re.findall(r'^[^\\n:]*discharge[^\\n:]*:\\s*$', text, flags=re.IGNORECASE | re.MULTILINE)\n",
    "        discharge_labels.update(labels)  # Add to the set to ensure uniqueness\n",
    "    \n",
    "    return discharge_labels\n",
    "\n",
    "\n",
    "# Find all labels containing 'discharge'\n",
    "discharge_labels = find_discharge_labels(mimic_notes['TEXT'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "fa5a8123-fd9c-438a-954c-0c5253377250",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Removing unwanted sections: 100%|██████████| 49048/49048 [05:32<00:00, 147.47it/s]\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "import pandas as pd\n",
    "\n",
    "def remove_unwanted_sections_with_progress(df_column, sections_to_remove):\n",
    "    \"\"\"\n",
    "    Remove all unwanted sections from the given text based on section headers \n",
    "    with the pattern: newline followed by words ending with a colon (e.g., \\nSome Words:).\n",
    "    \"\"\"\n",
    "    # Combine all sections into one regex pattern\n",
    "    combined_pattern = '|'.join(re.escape(section) for section in sections_to_remove)\n",
    "    \n",
    "    # Regex to match sections dynamically based on headers in the list\n",
    "    # Matches from the given section until the next header or the end of text\n",
    "    regex_pattern = rf'(?i)(?<=\\n)({combined_pattern}):.*?(?=\\n[A-Za-z ]+?:|$)'\n",
    "    \n",
    "    cleaned_texts = []\n",
    "    \n",
    "    # Iterate through each row in the column with a progress bar\n",
    "    for text in tqdm(df_column, desc=\"Removing unwanted sections\"):\n",
    "        # Use a single regex to remove all unwanted sections\n",
    "        cleaned_text = re.sub(regex_pattern, '', text, flags=re.DOTALL).strip()\n",
    "        cleaned_texts.append(cleaned_text)\n",
    "    \n",
    "    return cleaned_texts\n",
    "\n",
    "\n",
    "# Combine all unwanted sections (provided list + dynamically found discharge labels)\n",
    "sections_to_remove = [\n",
    "    \"PHYSICAL EXAMINATION ON DISCHARGE\",\n",
    "    \"LABS ON DISCHARGE\",\n",
    "    \"Brief Hospital Course\",\n",
    "    \"Discharge Medications\",\n",
    "    \"Discharge Disposition\",\n",
    "    \"Discharge Diagnosis\",\n",
    "    \"SECONDARY DIAGNOSES\",\n",
    "    #\"DISCHARGE PHYSICAL EXAM\",\n",
    "    \"PRIMARY DIAGNOSES\",\n",
    "    \"Discharge Condition\",\n",
    "    \"Discharge Instructions\",\n",
    "    \"Followup Instructions\",\n",
    "    \"major surgical or invasive procedure\",\n",
    "    \"While you were in the hospital\",\n",
    "    \"Now that you are going home\",\n",
    "    \"BRIEF HOSPITAL COURSE\",\n",
    "    \"HOW YOU MAY FEEL\",\n",
    "    \"Follow-up\",\n",
    "    \"IMPORTANT ITEMS AFTER YOU LEAVE THE HOSPITAL\",\n",
    "    \"WHY WAS I ADMITTED TO THE HOSPITAL?\",\n",
    "    \"WHAT HAPPENED WHILE I WAS IN THE HOSPITAL?\",\n",
    "    \"WHAT SHOULD I DO WHEN I GO HOME?\",\n",
    "    \"DISCHARGE STATUS\"\n",
    "    #\"DISCHARGE LABS\"\n",
    "    # Add any other static labels here...\n",
    "] + list(discharge_labels) +list(home_labels)  # Combine static and dynamically found labels\n",
    "\n",
    "mimic_notes['TEXT'] = remove_unwanted_sections_with_progress(mimic_notes['TEXT'], sections_to_remove)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "27a02fc2-3008-487f-9224-4d23828760df",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "49048"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mimic_notes.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "5aa1fdd8-b23f-4755-999f-e49fa98bb321",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculating the Length of Stay in days per admission\n",
    "mimic_admissions['ADMITTIME'] = pd.to_datetime(mimic_admissions['ADMITTIME'])\n",
    "mimic_admissions['DISCHTIME'] = pd.to_datetime(mimic_admissions['DISCHTIME'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "672c8765-8c4e-477e-aab3-007140ea0ea3",
   "metadata": {},
   "outputs": [],
   "source": [
    "t = mimic_admissions.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "b0f2b69c-597d-4e55-bb41-e1876cd6856e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ROW_ID</th>\n",
       "      <th>SUBJECT_ID</th>\n",
       "      <th>HADM_ID</th>\n",
       "      <th>ADMITTIME</th>\n",
       "      <th>DISCHTIME</th>\n",
       "      <th>DEATHTIME</th>\n",
       "      <th>ADMISSION_TYPE</th>\n",
       "      <th>ADMISSION_LOCATION</th>\n",
       "      <th>DISCHARGE_LOCATION</th>\n",
       "      <th>INSURANCE</th>\n",
       "      <th>LANGUAGE</th>\n",
       "      <th>RELIGION</th>\n",
       "      <th>MARITAL_STATUS</th>\n",
       "      <th>ETHNICITY</th>\n",
       "      <th>EDREGTIME</th>\n",
       "      <th>EDOUTTIME</th>\n",
       "      <th>DIAGNOSIS</th>\n",
       "      <th>HOSPITAL_EXPIRE_FLAG</th>\n",
       "      <th>HAS_CHARTEVENTS_DATA</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>21</td>\n",
       "      <td>22</td>\n",
       "      <td>165315</td>\n",
       "      <td>2196-04-09 12:26:00</td>\n",
       "      <td>2196-04-10 15:54:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>EMERGENCY</td>\n",
       "      <td>EMERGENCY ROOM ADMIT</td>\n",
       "      <td>DISC-TRAN CANCER/CHLDRN H</td>\n",
       "      <td>Private</td>\n",
       "      <td>NaN</td>\n",
       "      <td>UNOBTAINABLE</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>2196-04-09 10:06:00</td>\n",
       "      <td>2196-04-09 13:24:00</td>\n",
       "      <td>BENZODIAZEPINE OVERDOSE</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>22</td>\n",
       "      <td>23</td>\n",
       "      <td>152223</td>\n",
       "      <td>2153-09-03 07:15:00</td>\n",
       "      <td>2153-09-08 19:10:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ELECTIVE</td>\n",
       "      <td>PHYS REFERRAL/NORMAL DELI</td>\n",
       "      <td>HOME HEALTH CARE</td>\n",
       "      <td>Medicare</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>23</td>\n",
       "      <td>23</td>\n",
       "      <td>124321</td>\n",
       "      <td>2157-10-18 19:34:00</td>\n",
       "      <td>2157-10-25 14:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>EMERGENCY</td>\n",
       "      <td>TRANSFER FROM HOSP/EXTRAM</td>\n",
       "      <td>HOME HEALTH CARE</td>\n",
       "      <td>Medicare</td>\n",
       "      <td>ENGL</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BRAIN MASS</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>24</td>\n",
       "      <td>24</td>\n",
       "      <td>161859</td>\n",
       "      <td>2139-06-06 16:14:00</td>\n",
       "      <td>2139-06-09 12:48:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>EMERGENCY</td>\n",
       "      <td>TRANSFER FROM HOSP/EXTRAM</td>\n",
       "      <td>HOME</td>\n",
       "      <td>Private</td>\n",
       "      <td>NaN</td>\n",
       "      <td>PROTESTANT QUAKER</td>\n",
       "      <td>SINGLE</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>INTERIOR MYOCARDIAL INFARCTION</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>25</td>\n",
       "      <td>25</td>\n",
       "      <td>129635</td>\n",
       "      <td>2160-11-02 02:06:00</td>\n",
       "      <td>2160-11-05 14:55:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>EMERGENCY</td>\n",
       "      <td>EMERGENCY ROOM ADMIT</td>\n",
       "      <td>HOME</td>\n",
       "      <td>Private</td>\n",
       "      <td>NaN</td>\n",
       "      <td>UNOBTAINABLE</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>2160-11-02 01:01:00</td>\n",
       "      <td>2160-11-02 04:27:00</td>\n",
       "      <td>ACUTE CORONARY SYNDROME</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ROW_ID  SUBJECT_ID  HADM_ID           ADMITTIME           DISCHTIME  \\\n",
       "0      21          22   165315 2196-04-09 12:26:00 2196-04-10 15:54:00   \n",
       "1      22          23   152223 2153-09-03 07:15:00 2153-09-08 19:10:00   \n",
       "2      23          23   124321 2157-10-18 19:34:00 2157-10-25 14:00:00   \n",
       "3      24          24   161859 2139-06-06 16:14:00 2139-06-09 12:48:00   \n",
       "4      25          25   129635 2160-11-02 02:06:00 2160-11-05 14:55:00   \n",
       "\n",
       "  DEATHTIME ADMISSION_TYPE         ADMISSION_LOCATION  \\\n",
       "0       NaN      EMERGENCY       EMERGENCY ROOM ADMIT   \n",
       "1       NaN       ELECTIVE  PHYS REFERRAL/NORMAL DELI   \n",
       "2       NaN      EMERGENCY  TRANSFER FROM HOSP/EXTRAM   \n",
       "3       NaN      EMERGENCY  TRANSFER FROM HOSP/EXTRAM   \n",
       "4       NaN      EMERGENCY       EMERGENCY ROOM ADMIT   \n",
       "\n",
       "          DISCHARGE_LOCATION INSURANCE LANGUAGE           RELIGION  \\\n",
       "0  DISC-TRAN CANCER/CHLDRN H   Private      NaN       UNOBTAINABLE   \n",
       "1           HOME HEALTH CARE  Medicare      NaN           CATHOLIC   \n",
       "2           HOME HEALTH CARE  Medicare     ENGL           CATHOLIC   \n",
       "3                       HOME   Private      NaN  PROTESTANT QUAKER   \n",
       "4                       HOME   Private      NaN       UNOBTAINABLE   \n",
       "\n",
       "  MARITAL_STATUS ETHNICITY            EDREGTIME            EDOUTTIME  \\\n",
       "0        MARRIED     WHITE  2196-04-09 10:06:00  2196-04-09 13:24:00   \n",
       "1        MARRIED     WHITE                  NaN                  NaN   \n",
       "2        MARRIED     WHITE                  NaN                  NaN   \n",
       "3         SINGLE     WHITE                  NaN                  NaN   \n",
       "4        MARRIED     WHITE  2160-11-02 01:01:00  2160-11-02 04:27:00   \n",
       "\n",
       "                                           DIAGNOSIS  HOSPITAL_EXPIRE_FLAG  \\\n",
       "0                            BENZODIAZEPINE OVERDOSE                     0   \n",
       "1  CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS...                     0   \n",
       "2                                         BRAIN MASS                     0   \n",
       "3                     INTERIOR MYOCARDIAL INFARCTION                     0   \n",
       "4                            ACUTE CORONARY SYNDROME                     0   \n",
       "\n",
       "   HAS_CHARTEVENTS_DATA  \n",
       "0                     1  \n",
       "1                     1  \n",
       "2                     1  \n",
       "3                     1  \n",
       "4                     1  "
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mimic_admissions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "e8292ea2-b5c9-4de2-b255-bb32c2743250",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_dir = \"/scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/mimic3MultiGender\"\n",
    "\n",
    "mimic_admissions['LOS_days'] = round(\n",
    "    (mimic_admissions['DISCHTIME'] - mimic_admissions['ADMITTIME']).dt.total_seconds() / (24 * 60 * 60), 1)\n",
    "\n",
    "mimic_admissions = mimic_admissions[[\"ROW_ID\", \"ADMITTIME\",\"DISCHTIME\",\"SUBJECT_ID\", \"HADM_ID\", \"LOS_days\", \"HOSPITAL_EXPIRE_FLAG\", \"ETHNICITY\",\"INSURANCE\",\"RELIGION\",\"MARITAL_STATUS\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "1f7fc472-b89c-4bc0-bcee-723eb802d268",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ROW_ID</th>\n",
       "      <th>ADMITTIME</th>\n",
       "      <th>DISCHTIME</th>\n",
       "      <th>SUBJECT_ID</th>\n",
       "      <th>HADM_ID</th>\n",
       "      <th>LOS_days</th>\n",
       "      <th>HOSPITAL_EXPIRE_FLAG</th>\n",
       "      <th>ETHNICITY</th>\n",
       "      <th>INSURANCE</th>\n",
       "      <th>RELIGION</th>\n",
       "      <th>MARITAL_STATUS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>21</td>\n",
       "      <td>2196-04-09 12:26:00</td>\n",
       "      <td>2196-04-10 15:54:00</td>\n",
       "      <td>22</td>\n",
       "      <td>165315</td>\n",
       "      <td>1.1</td>\n",
       "      <td>0</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>UNOBTAINABLE</td>\n",
       "      <td>MARRIED</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>22</td>\n",
       "      <td>2153-09-03 07:15:00</td>\n",
       "      <td>2153-09-08 19:10:00</td>\n",
       "      <td>23</td>\n",
       "      <td>152223</td>\n",
       "      <td>5.5</td>\n",
       "      <td>0</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Medicare</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>MARRIED</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ROW_ID           ADMITTIME           DISCHTIME  SUBJECT_ID  HADM_ID  \\\n",
       "0      21 2196-04-09 12:26:00 2196-04-10 15:54:00          22   165315   \n",
       "1      22 2153-09-03 07:15:00 2153-09-08 19:10:00          23   152223   \n",
       "\n",
       "   LOS_days  HOSPITAL_EXPIRE_FLAG ETHNICITY INSURANCE      RELIGION  \\\n",
       "0       1.1                     0     WHITE   Private  UNOBTAINABLE   \n",
       "1       5.5                     0     WHITE  Medicare      CATHOLIC   \n",
       "\n",
       "  MARITAL_STATUS  \n",
       "0        MARRIED  \n",
       "1        MARRIED  "
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mimic_admissions.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "82a38a26-815d-4160-acc2-c9f6aabcdfff",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_los_label(mimic_admissions,is_binary):\n",
    "    save_dir = \"/scratch/ssd004/scratch/dorsasm/datasets/unpackedmimiciii/mimic3MultiGender\"\n",
    "    \n",
    "    #mimic_admissions['LOS_days'] = round(\n",
    "    #    (mimic_admissions['DISCHTIME'] - mimic_admissions['ADMITTIME']).dt.total_seconds() / (24 * 60 * 60), 1)\n",
    "    \n",
    "    #mimic_admissions = mimic_admissions[[\"ROW_ID\", \"SUBJECT_ID\", \"HADM_ID\", \"LOS_days\", \"HOSPITAL_EXPIRE_FLAG\"]]\n",
    "    \n",
    "    # Creation of Label\n",
    "    '''\n",
    "        <= 3: 0\n",
    "        > 3 & <= 7: 1\n",
    "        > 7 & <= 14: 2\n",
    "        >14: 3\n",
    "    \n",
    "    \n",
    "        Binary:\n",
    "        <= 7 : 0\n",
    "        > 7 : 1\n",
    "        \n",
    "    '''\n",
    "    if is_binary == False:\n",
    "        mimic_admissions.loc[mimic_admissions['LOS_days'] <= 3, 'LOS_label'] = 0\n",
    "        mimic_admissions.loc[(mimic_admissions['LOS_days'] > 3) & (\n",
    "                mimic_admissions['LOS_days'] <= 7), 'LOS_label'] = 1\n",
    "        mimic_admissions.loc[(mimic_admissions['LOS_days'] > 7) & (\n",
    "                mimic_admissions['LOS_days'] <= 14), 'LOS_label'] = 2\n",
    "        mimic_admissions.loc[(mimic_admissions['LOS_days'] > 14), 'LOS_label'] = 3\n",
    "        mimic_admissions.LOS_label = mimic_admissions.LOS_label.astype(int)\n",
    "    elif is_binary == True:\n",
    "        mimic_admissions.loc[mimic_admissions['LOS_days']<= 7, 'LOS_label'] = 0\n",
    "        mimic_admissions.loc[mimic_admissions['LOS_days']>7, 'LOS_label'] = 1\n",
    "        mimic_admissions.LOS_label = mimic_admissions.LOS_label.astype(int)\n",
    "    return mimic_admissions\n",
    "\n",
    "mimic_admissions=create_los_label(mimic_admissions=mimic_admissions,is_binary=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "15660a70-d5bb-4b9f-b092-c5663b530fd7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LOS_label\n",
       "1    20153\n",
       "2    15609\n",
       "0    11773\n",
       "3    11441\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mimic_admissions['LOS_label'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "706f222b-06dc-4c4b-b19e-07fbdf0fd3cb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "58976\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ROW_ID</th>\n",
       "      <th>ADMITTIME</th>\n",
       "      <th>DISCHTIME</th>\n",
       "      <th>SUBJECT_ID</th>\n",
       "      <th>HADM_ID</th>\n",
       "      <th>LOS_days</th>\n",
       "      <th>HOSPITAL_EXPIRE_FLAG</th>\n",
       "      <th>ETHNICITY</th>\n",
       "      <th>INSURANCE</th>\n",
       "      <th>RELIGION</th>\n",
       "      <th>MARITAL_STATUS</th>\n",
       "      <th>LOS_label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>21</td>\n",
       "      <td>2196-04-09 12:26:00</td>\n",
       "      <td>2196-04-10 15:54:00</td>\n",
       "      <td>22</td>\n",
       "      <td>165315</td>\n",
       "      <td>1.1</td>\n",
       "      <td>0</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>UNOBTAINABLE</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>22</td>\n",
       "      <td>2153-09-03 07:15:00</td>\n",
       "      <td>2153-09-08 19:10:00</td>\n",
       "      <td>23</td>\n",
       "      <td>152223</td>\n",
       "      <td>5.5</td>\n",
       "      <td>0</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Medicare</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ROW_ID           ADMITTIME           DISCHTIME  SUBJECT_ID  HADM_ID  \\\n",
       "0      21 2196-04-09 12:26:00 2196-04-10 15:54:00          22   165315   \n",
       "1      22 2153-09-03 07:15:00 2153-09-08 19:10:00          23   152223   \n",
       "\n",
       "   LOS_days  HOSPITAL_EXPIRE_FLAG ETHNICITY INSURANCE      RELIGION  \\\n",
       "0       1.1                     0     WHITE   Private  UNOBTAINABLE   \n",
       "1       5.5                     0     WHITE  Medicare      CATHOLIC   \n",
       "\n",
       "  MARITAL_STATUS  LOS_label  \n",
       "0        MARRIED          0  \n",
       "1        MARRIED          1  "
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(mimic_admissions.shape[0])\n",
    "mimic_admissions.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "d7c55a75-2760-4413-8d32-ecb61f89a397",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keeping the required variables\n",
    "mimic_admissions = mimic_admissions[[\"HADM_ID\",\"DISCHTIME\",\"ADMITTIME\", \"LOS_label\", \"HOSPITAL_EXPIRE_FLAG\", \"ETHNICITY\",\"INSURANCE\",\"RELIGION\",\"MARITAL_STATUS\"]]\n",
    "mimic_notes = mimic_notes[['HADM_ID', 'TEXT', \"ROW_ID\", \"SUBJECT_ID\"]]\n",
    "\n",
    "# Merging Mimic Notes data with Admissions data\n",
    "notes_adm_df = pd.merge(mimic_notes, mimic_admissions, how=\"left\", on=\"HADM_ID\")\n",
    "\n",
    "# Removing records where the patient died within a given hospitalization\n",
    "notes_adm_df = notes_adm_df[notes_adm_df['HOSPITAL_EXPIRE_FLAG'] == 0]\n",
    "notes_adm_df = notes_adm_df[[\"ROW_ID\",\"DISCHTIME\",\"ADMITTIME\", \"SUBJECT_ID\", \"HADM_ID\", \"TEXT\", \"LOS_label\",\"ETHNICITY\",\"INSURANCE\",\"RELIGION\",\"MARITAL_STATUS\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "49e3cab7-30a0-4ecc-97ca-7859f5585503",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ROW_ID</th>\n",
       "      <th>DISCHTIME</th>\n",
       "      <th>ADMITTIME</th>\n",
       "      <th>SUBJECT_ID</th>\n",
       "      <th>HADM_ID</th>\n",
       "      <th>TEXT</th>\n",
       "      <th>LOS_label</th>\n",
       "      <th>ETHNICITY</th>\n",
       "      <th>INSURANCE</th>\n",
       "      <th>RELIGION</th>\n",
       "      <th>MARITAL_STATUS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>42102</td>\n",
       "      <td>2117-09-17 16:45:00</td>\n",
       "      <td>2117-09-11 11:46:00</td>\n",
       "      <td>58526</td>\n",
       "      <td>100001.0</td>\n",
       "      <td>Admission Date:  [**2117-9-11**]              ...</td>\n",
       "      <td>1</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>PROTESTANT QUAKER</td>\n",
       "      <td>DIVORCED</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>19215</td>\n",
       "      <td>2150-04-21 17:30:00</td>\n",
       "      <td>2150-04-17 15:34:00</td>\n",
       "      <td>54610</td>\n",
       "      <td>100003.0</td>\n",
       "      <td>Admission Date:  [**2150-4-17**]              ...</td>\n",
       "      <td>1</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>NOT SPECIFIED</td>\n",
       "      <td>SINGLE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8772</td>\n",
       "      <td>2108-04-18 17:18:00</td>\n",
       "      <td>2108-04-06 15:49:00</td>\n",
       "      <td>9895</td>\n",
       "      <td>100006.0</td>\n",
       "      <td>Admission Date:  [**2108-4-6**]       Discharg...</td>\n",
       "      <td>2</td>\n",
       "      <td>BLACK/AFRICAN AMERICAN</td>\n",
       "      <td>Private</td>\n",
       "      <td>NOT SPECIFIED</td>\n",
       "      <td>SINGLE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>50238</td>\n",
       "      <td>2145-04-07 12:40:00</td>\n",
       "      <td>2145-03-31 05:33:00</td>\n",
       "      <td>23018</td>\n",
       "      <td>100007.0</td>\n",
       "      <td>Admission Date:  [**2145-3-31**]              ...</td>\n",
       "      <td>2</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>JEWISH</td>\n",
       "      <td>MARRIED</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>21119</td>\n",
       "      <td>2162-05-21 13:37:00</td>\n",
       "      <td>2162-05-16 15:56:00</td>\n",
       "      <td>533</td>\n",
       "      <td>100009.0</td>\n",
       "      <td>Admission Date:  [**2162-5-16**]              ...</td>\n",
       "      <td>1</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>MARRIED</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49043</th>\n",
       "      <td>16616</td>\n",
       "      <td>2161-11-17 08:10:00</td>\n",
       "      <td>2161-10-23 18:01:00</td>\n",
       "      <td>20785</td>\n",
       "      <td>199993.0</td>\n",
       "      <td>Admission Date:  [**2161-10-23**]       Discha...</td>\n",
       "      <td>3</td>\n",
       "      <td>UNKNOWN/NOT SPECIFIED</td>\n",
       "      <td>Private</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>DIVORCED</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49044</th>\n",
       "      <td>10570</td>\n",
       "      <td>2188-07-17 13:31:00</td>\n",
       "      <td>2188-07-07 18:47:00</td>\n",
       "      <td>23761</td>\n",
       "      <td>199994.0</td>\n",
       "      <td>Admission Date:  [**2188-7-7**]       Discharg...</td>\n",
       "      <td>2</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Medicare</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>SINGLE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49045</th>\n",
       "      <td>21697</td>\n",
       "      <td>2137-12-28 12:30:00</td>\n",
       "      <td>2137-12-11 17:35:00</td>\n",
       "      <td>19412</td>\n",
       "      <td>199995.0</td>\n",
       "      <td>Admission Date: [**2137-12-11**]        Discha...</td>\n",
       "      <td>3</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>PROTESTANT QUAKER</td>\n",
       "      <td>SINGLE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49046</th>\n",
       "      <td>33528</td>\n",
       "      <td>2119-02-24 14:25:00</td>\n",
       "      <td>2119-02-18 16:26:00</td>\n",
       "      <td>27200</td>\n",
       "      <td>199998.0</td>\n",
       "      <td>Admission Date:  [**2119-2-18**]              ...</td>\n",
       "      <td>1</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Medicare</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>MARRIED</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49047</th>\n",
       "      <td>2652</td>\n",
       "      <td>2136-04-10 12:10:00</td>\n",
       "      <td>2136-04-04 23:37:00</td>\n",
       "      <td>40370</td>\n",
       "      <td>199999.0</td>\n",
       "      <td>Admission Date:  [**2136-4-4**]              D...</td>\n",
       "      <td>1</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Medicare</td>\n",
       "      <td>JEWISH</td>\n",
       "      <td>MARRIED</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>43851 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       ROW_ID           DISCHTIME           ADMITTIME  SUBJECT_ID   HADM_ID  \\\n",
       "0       42102 2117-09-17 16:45:00 2117-09-11 11:46:00       58526  100001.0   \n",
       "1       19215 2150-04-21 17:30:00 2150-04-17 15:34:00       54610  100003.0   \n",
       "2        8772 2108-04-18 17:18:00 2108-04-06 15:49:00        9895  100006.0   \n",
       "3       50238 2145-04-07 12:40:00 2145-03-31 05:33:00       23018  100007.0   \n",
       "4       21119 2162-05-21 13:37:00 2162-05-16 15:56:00         533  100009.0   \n",
       "...       ...                 ...                 ...         ...       ...   \n",
       "49043   16616 2161-11-17 08:10:00 2161-10-23 18:01:00       20785  199993.0   \n",
       "49044   10570 2188-07-17 13:31:00 2188-07-07 18:47:00       23761  199994.0   \n",
       "49045   21697 2137-12-28 12:30:00 2137-12-11 17:35:00       19412  199995.0   \n",
       "49046   33528 2119-02-24 14:25:00 2119-02-18 16:26:00       27200  199998.0   \n",
       "49047    2652 2136-04-10 12:10:00 2136-04-04 23:37:00       40370  199999.0   \n",
       "\n",
       "                                                    TEXT  LOS_label  \\\n",
       "0      Admission Date:  [**2117-9-11**]              ...          1   \n",
       "1      Admission Date:  [**2150-4-17**]              ...          1   \n",
       "2      Admission Date:  [**2108-4-6**]       Discharg...          2   \n",
       "3      Admission Date:  [**2145-3-31**]              ...          2   \n",
       "4      Admission Date:  [**2162-5-16**]              ...          1   \n",
       "...                                                  ...        ...   \n",
       "49043  Admission Date:  [**2161-10-23**]       Discha...          3   \n",
       "49044  Admission Date:  [**2188-7-7**]       Discharg...          2   \n",
       "49045  Admission Date: [**2137-12-11**]        Discha...          3   \n",
       "49046  Admission Date:  [**2119-2-18**]              ...          1   \n",
       "49047  Admission Date:  [**2136-4-4**]              D...          1   \n",
       "\n",
       "                    ETHNICITY INSURANCE           RELIGION MARITAL_STATUS  \n",
       "0                       WHITE   Private  PROTESTANT QUAKER       DIVORCED  \n",
       "1                       WHITE   Private      NOT SPECIFIED         SINGLE  \n",
       "2      BLACK/AFRICAN AMERICAN   Private      NOT SPECIFIED         SINGLE  \n",
       "3                       WHITE   Private             JEWISH        MARRIED  \n",
       "4                       WHITE   Private           CATHOLIC        MARRIED  \n",
       "...                       ...       ...                ...            ...  \n",
       "49043   UNKNOWN/NOT SPECIFIED   Private           CATHOLIC       DIVORCED  \n",
       "49044                   WHITE  Medicare           CATHOLIC         SINGLE  \n",
       "49045                   WHITE   Private  PROTESTANT QUAKER         SINGLE  \n",
       "49046                   WHITE  Medicare           CATHOLIC        MARRIED  \n",
       "49047                   WHITE  Medicare             JEWISH        MARRIED  \n",
       "\n",
       "[43851 rows x 11 columns]"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "notes_adm_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "149724e1-fa47-4f63-9dc1-cdd973816ec8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ROW_ID</th>\n",
       "      <th>SUBJECT_ID</th>\n",
       "      <th>GENDER</th>\n",
       "      <th>DOB</th>\n",
       "      <th>DOD</th>\n",
       "      <th>DOD_HOSP</th>\n",
       "      <th>DOD_SSN</th>\n",
       "      <th>EXPIRE_FLAG</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>234</td>\n",
       "      <td>249</td>\n",
       "      <td>F</td>\n",
       "      <td>2075-03-13 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>235</td>\n",
       "      <td>250</td>\n",
       "      <td>F</td>\n",
       "      <td>2164-12-27 00:00:00</td>\n",
       "      <td>2188-11-22 00:00:00</td>\n",
       "      <td>2188-11-22 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>236</td>\n",
       "      <td>251</td>\n",
       "      <td>M</td>\n",
       "      <td>2090-03-15 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>237</td>\n",
       "      <td>252</td>\n",
       "      <td>M</td>\n",
       "      <td>2078-03-06 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>238</td>\n",
       "      <td>253</td>\n",
       "      <td>F</td>\n",
       "      <td>2089-11-26 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ROW_ID  SUBJECT_ID GENDER                  DOB                  DOD  \\\n",
       "0     234         249      F  2075-03-13 00:00:00                  NaN   \n",
       "1     235         250      F  2164-12-27 00:00:00  2188-11-22 00:00:00   \n",
       "2     236         251      M  2090-03-15 00:00:00                  NaN   \n",
       "3     237         252      M  2078-03-06 00:00:00                  NaN   \n",
       "4     238         253      F  2089-11-26 00:00:00                  NaN   \n",
       "\n",
       "              DOD_HOSP DOD_SSN  EXPIRE_FLAG  \n",
       "0                  NaN     NaN            0  \n",
       "1  2188-11-22 00:00:00     NaN            1  \n",
       "2                  NaN     NaN            0  \n",
       "3                  NaN     NaN            0  \n",
       "4                  NaN     NaN            0  "
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "patients.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "1a83e2bb-27a9-4a15-8944-1237bb39a0db",
   "metadata": {},
   "outputs": [],
   "source": [
    "notes_adm_df = pd.merge(notes_adm_df,patients[['SUBJECT_ID','GENDER','DOB']], on = 'SUBJECT_ID' , how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "87a5d012-cbb4-48a2-9601-847f2b6906c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "notes_adm_df['DOB']=pd.to_datetime(notes_adm_df['DOB'])\n",
    "notes_adm_df['age']=notes_adm_df['ADMITTIME'].dt.year-notes_adm_df['DOB'].dt.year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "5d6de643-1b94-4a2d-bb6f-227acf06be21",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ROW_ID</th>\n",
       "      <th>DISCHTIME</th>\n",
       "      <th>ADMITTIME</th>\n",
       "      <th>SUBJECT_ID</th>\n",
       "      <th>HADM_ID</th>\n",
       "      <th>TEXT</th>\n",
       "      <th>LOS_label</th>\n",
       "      <th>ETHNICITY</th>\n",
       "      <th>INSURANCE</th>\n",
       "      <th>RELIGION</th>\n",
       "      <th>MARITAL_STATUS</th>\n",
       "      <th>GENDER</th>\n",
       "      <th>DOB</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>42102</td>\n",
       "      <td>2117-09-17 16:45:00</td>\n",
       "      <td>2117-09-11 11:46:00</td>\n",
       "      <td>58526</td>\n",
       "      <td>100001.0</td>\n",
       "      <td>Admission Date:  [**2117-9-11**]              ...</td>\n",
       "      <td>1</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>PROTESTANT QUAKER</td>\n",
       "      <td>DIVORCED</td>\n",
       "      <td>F</td>\n",
       "      <td>2082-03-21</td>\n",
       "      <td>35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>19215</td>\n",
       "      <td>2150-04-21 17:30:00</td>\n",
       "      <td>2150-04-17 15:34:00</td>\n",
       "      <td>54610</td>\n",
       "      <td>100003.0</td>\n",
       "      <td>Admission Date:  [**2150-4-17**]              ...</td>\n",
       "      <td>1</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>NOT SPECIFIED</td>\n",
       "      <td>SINGLE</td>\n",
       "      <td>M</td>\n",
       "      <td>2090-05-19</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8772</td>\n",
       "      <td>2108-04-18 17:18:00</td>\n",
       "      <td>2108-04-06 15:49:00</td>\n",
       "      <td>9895</td>\n",
       "      <td>100006.0</td>\n",
       "      <td>Admission Date:  [**2108-4-6**]       Discharg...</td>\n",
       "      <td>2</td>\n",
       "      <td>BLACK/AFRICAN AMERICAN</td>\n",
       "      <td>Private</td>\n",
       "      <td>NOT SPECIFIED</td>\n",
       "      <td>SINGLE</td>\n",
       "      <td>F</td>\n",
       "      <td>2059-05-07</td>\n",
       "      <td>49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>50238</td>\n",
       "      <td>2145-04-07 12:40:00</td>\n",
       "      <td>2145-03-31 05:33:00</td>\n",
       "      <td>23018</td>\n",
       "      <td>100007.0</td>\n",
       "      <td>Admission Date:  [**2145-3-31**]              ...</td>\n",
       "      <td>2</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>JEWISH</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>F</td>\n",
       "      <td>2071-06-04</td>\n",
       "      <td>74</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>21119</td>\n",
       "      <td>2162-05-21 13:37:00</td>\n",
       "      <td>2162-05-16 15:56:00</td>\n",
       "      <td>533</td>\n",
       "      <td>100009.0</td>\n",
       "      <td>Admission Date:  [**2162-5-16**]              ...</td>\n",
       "      <td>1</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>M</td>\n",
       "      <td>2101-07-30</td>\n",
       "      <td>61</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ROW_ID           DISCHTIME           ADMITTIME  SUBJECT_ID   HADM_ID  \\\n",
       "0   42102 2117-09-17 16:45:00 2117-09-11 11:46:00       58526  100001.0   \n",
       "1   19215 2150-04-21 17:30:00 2150-04-17 15:34:00       54610  100003.0   \n",
       "2    8772 2108-04-18 17:18:00 2108-04-06 15:49:00        9895  100006.0   \n",
       "3   50238 2145-04-07 12:40:00 2145-03-31 05:33:00       23018  100007.0   \n",
       "4   21119 2162-05-21 13:37:00 2162-05-16 15:56:00         533  100009.0   \n",
       "\n",
       "                                                TEXT  LOS_label  \\\n",
       "0  Admission Date:  [**2117-9-11**]              ...          1   \n",
       "1  Admission Date:  [**2150-4-17**]              ...          1   \n",
       "2  Admission Date:  [**2108-4-6**]       Discharg...          2   \n",
       "3  Admission Date:  [**2145-3-31**]              ...          2   \n",
       "4  Admission Date:  [**2162-5-16**]              ...          1   \n",
       "\n",
       "                ETHNICITY INSURANCE           RELIGION MARITAL_STATUS GENDER  \\\n",
       "0                   WHITE   Private  PROTESTANT QUAKER       DIVORCED      F   \n",
       "1                   WHITE   Private      NOT SPECIFIED         SINGLE      M   \n",
       "2  BLACK/AFRICAN AMERICAN   Private      NOT SPECIFIED         SINGLE      F   \n",
       "3                   WHITE   Private             JEWISH        MARRIED      F   \n",
       "4                   WHITE   Private           CATHOLIC        MARRIED      M   \n",
       "\n",
       "         DOB  age  \n",
       "0 2082-03-21   35  \n",
       "1 2090-05-19   60  \n",
       "2 2059-05-07   49  \n",
       "3 2071-06-04   74  \n",
       "4 2101-07-30   61  "
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "notes_adm_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "a64b38bb-ef65-44d7-844d-953fc0a0482a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "notes_adm_df['GENDER'].isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "1ec99989-a6b8-4089-b744-b2315520cec2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "notes_adm_df['ETHNICITY'].isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "36d4201d-70b9-4f2e-b03f-29e1be914bb0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "331\n",
      "1804\n",
      "0\n"
     ]
    }
   ],
   "source": [
    "print(notes_adm_df['age'].isna().sum())\n",
    "print(notes_adm_df['RELIGION'].isna().sum())\n",
    "print(notes_adm_df['MARITAL_STATUS'].isna().sum())\n",
    "print(notes_adm_df['INSURANCE'].isna().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "d888690d-403b-4c8b-a11f-b7e60fa0608c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RELIGION\n",
       "CATHOLIC                  16171\n",
       "NOT SPECIFIED              9093\n",
       "PROTESTANT QUAKER          5783\n",
       "UNOBTAINABLE               4232\n",
       "JEWISH                     4087\n",
       "OTHER                      2120\n",
       "EPISCOPALIAN                618\n",
       "GREEK ORTHODOX              366\n",
       "CHRISTIAN SCIENTIST         290\n",
       "BUDDHIST                    187\n",
       "MUSLIM                      160\n",
       "JEHOVAH'S WITNESS           110\n",
       "UNITARIAN-UNIVERSALIST       86\n",
       "7TH DAY ADVENTIST            63\n",
       "HINDU                        58\n",
       "ROMANIAN EAST. ORTH          57\n",
       "BAPTIST                      20\n",
       "HEBREW                       12\n",
       "METHODIST                     6\n",
       "LUTHERAN                      1\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "notes_adm_df['RELIGION'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "26c4d20e-32de-4710-b799-29dbb4289837",
   "metadata": {},
   "outputs": [],
   "source": [
    "notes_adm_df['New_RELIGION']=notes_adm_df['RELIGION']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "23461fc4-d953-47f9-ad91-5cff1c3420c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "notes_adm_df['New_RELIGION'] = notes_adm_df['New_RELIGION'].replace([None, 'NOT SPECIFIED', 'UNOBTAINABLE'], 'Unknown')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "id": "4e8f3880-47ac-4476-8c26-296f92a74bab",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "New_RELIGION\n",
       "CATHOLIC                  16171\n",
       "Unknown                   13656\n",
       "PROTESTANT QUAKER          5783\n",
       "JEWISH                     4087\n",
       "OTHER                      2120\n",
       "EPISCOPALIAN                618\n",
       "GREEK ORTHODOX              366\n",
       "CHRISTIAN SCIENTIST         290\n",
       "BUDDHIST                    187\n",
       "MUSLIM                      160\n",
       "JEHOVAH'S WITNESS           110\n",
       "UNITARIAN-UNIVERSALIST       86\n",
       "7TH DAY ADVENTIST            63\n",
       "HINDU                        58\n",
       "ROMANIAN EAST. ORTH          57\n",
       "BAPTIST                      20\n",
       "HEBREW                       12\n",
       "METHODIST                     6\n",
       "LUTHERAN                      1\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(notes_adm_df['New_RELIGION'].isna().sum())\n",
    "notes_adm_df['New_RELIGION'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "26565678-f8f6-432b-b0c9-2947c50474d8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MARITAL_STATUS\n",
       "MARRIED              20974\n",
       "SINGLE               11418\n",
       "WIDOWED               6060\n",
       "DIVORCED              2811\n",
       "SEPARATED              500\n",
       "UNKNOWN (DEFAULT)      269\n",
       "LIFE PARTNER            15\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "notes_adm_df['MARITAL_STATUS'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "20ae4d64-7803-43b6-80e1-45a0c7397594",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "New_MARITAL_STATUS\n",
       "MARRIED              20974\n",
       "SINGLE               11418\n",
       "WIDOWED               6060\n",
       "DIVORCED              2811\n",
       "UNKNOWN (DEFAULT)     2073\n",
       "SEPARATED              500\n",
       "LIFE PARTNER            15\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "notes_adm_df['New_MARITAL_STATUS']=notes_adm_df['MARITAL_STATUS']\n",
    "notes_adm_df['New_MARITAL_STATUS'] = notes_adm_df['New_MARITAL_STATUS'].replace([None], 'UNKNOWN (DEFAULT)')\n",
    "print(notes_adm_df['New_MARITAL_STATUS'].isna().sum())\n",
    "notes_adm_df['New_MARITAL_STATUS'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "fbe670f9-f46f-4c4e-b5a2-fd6dff31ce34",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ETHNICITY\n",
       "WHITE                                                       31195\n",
       "BLACK/AFRICAN AMERICAN                                       4048\n",
       "UNKNOWN/NOT SPECIFIED                                        3228\n",
       "HISPANIC OR LATINO                                           1177\n",
       "OTHER                                                         977\n",
       "ASIAN                                                         648\n",
       "UNABLE TO OBTAIN                                              610\n",
       "PATIENT DECLINED TO ANSWER                                    399\n",
       "ASIAN - CHINESE                                               208\n",
       "HISPANIC/LATINO - PUERTO RICAN                                208\n",
       "BLACK/CAPE VERDEAN                                            158\n",
       "WHITE - RUSSIAN                                               140\n",
       "MULTI RACE ETHNICITY                                           92\n",
       "BLACK/HAITIAN                                                  89\n",
       "ASIAN - ASIAN INDIAN                                           76\n",
       "HISPANIC/LATINO - DOMINICAN                                    72\n",
       "WHITE - OTHER EUROPEAN                                         66\n",
       "PORTUGUESE                                                     54\n",
       "WHITE - BRAZILIAN                                              50\n",
       "ASIAN - VIETNAMESE                                             39\n",
       "MIDDLE EASTERN                                                 38\n",
       "HISPANIC/LATINO - GUATEMALAN                                   36\n",
       "BLACK/AFRICAN                                                  35\n",
       "HISPANIC/LATINO - CUBAN                                        23\n",
       "AMERICAN INDIAN/ALASKA NATIVE                                  22\n",
       "WHITE - EASTERN EUROPEAN                                       22\n",
       "ASIAN - FILIPINO                                               21\n",
       "HISPANIC/LATINO - SALVADORAN                                   16\n",
       "ASIAN - CAMBODIAN                                              13\n",
       "HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)                     11\n",
       "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER                      11\n",
       "ASIAN - KOREAN                                                 10\n",
       "ASIAN - OTHER                                                  10\n",
       "HISPANIC/LATINO - MEXICAN                                      10\n",
       "CARIBBEAN ISLAND                                                9\n",
       "HISPANIC/LATINO - COLOMBIAN                                     9\n",
       "SOUTH AMERICAN                                                  7\n",
       "ASIAN - THAI                                                    4\n",
       "HISPANIC/LATINO - HONDURAN                                      4\n",
       "ASIAN - JAPANESE                                                4\n",
       "AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE        2\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "notes_adm_df['ETHNICITY'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "id": "7e9c3197-4ef2-425e-98a3-963d4743cf98",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "def categorize_ethnicity(ethnicity):\n",
    "    ethnicity = str(ethnicity).lower().strip()  # Convert to lowercase and remove leading/trailing spaces\n",
    "    \n",
    "    # Define regex patterns for each category\n",
    "    if re.search(r\"\\b(white)\\b\", ethnicity, re.I):\n",
    "        return \"White\"\n",
    "    elif re.search(r\"\\b(asian)\\b\", ethnicity, re.I):\n",
    "        return \"Asian\"\n",
    "    elif re.search(r\"\\b(hispanic|latino|mexican|cuban|puerto rican|colombian|argentine|south american)\\b\", ethnicity, re.I):\n",
    "        return \"Hispanic\"\n",
    "    elif re.search(r\"\\b(black|african american|nigerian|ethiopian|afro-caribbean|sub-saharan african|african)\\b\", ethnicity, re.I):\n",
    "        return \"Black\"\n",
    "    elif re.search(r\"\\b(portuguese|other|multi|middle|indian|caribbean)\\b\", ethnicity, re.I):\n",
    "        return \"other\"\n",
    "    elif re.search(r\"\\b(unknown|unable|declined|)\\b\", ethnicity, re.I):\n",
    "        return \"unknown\"\n",
    "    else:\n",
    "        return ethnicity\n",
    "\n",
    "# Apply the categorization function\n",
    "notes_adm_df[\"ethnicity_group\"] = notes_adm_df[\"ETHNICITY\"].apply(categorize_ethnicity)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "id": "e07a758c-4e5c-4a25-8238-0e186a6171fa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ethnicity_group\n",
       "White       31473\n",
       "Black        4330\n",
       "unknown      4237\n",
       "Hispanic     1573\n",
       "other        1205\n",
       "Asian        1033\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "notes_adm_df[\"ethnicity_group\"].value_counts() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "id": "d89d418a-d74e-477c-8bd4-fe3335b41487",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ROW_ID</th>\n",
       "      <th>DISCHTIME</th>\n",
       "      <th>ADMITTIME</th>\n",
       "      <th>SUBJECT_ID</th>\n",
       "      <th>HADM_ID</th>\n",
       "      <th>TEXT</th>\n",
       "      <th>LOS_label</th>\n",
       "      <th>ETHNICITY</th>\n",
       "      <th>INSURANCE</th>\n",
       "      <th>RELIGION</th>\n",
       "      <th>MARITAL_STATUS</th>\n",
       "      <th>GENDER</th>\n",
       "      <th>DOB</th>\n",
       "      <th>age</th>\n",
       "      <th>New_RELIGION</th>\n",
       "      <th>New_MARITAL_STATUS</th>\n",
       "      <th>ethnicity_group</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>42102</td>\n",
       "      <td>2117-09-17 16:45:00</td>\n",
       "      <td>2117-09-11 11:46:00</td>\n",
       "      <td>58526</td>\n",
       "      <td>100001.0</td>\n",
       "      <td>Admission Date:  [**2117-9-11**]              ...</td>\n",
       "      <td>1</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>PROTESTANT QUAKER</td>\n",
       "      <td>DIVORCED</td>\n",
       "      <td>F</td>\n",
       "      <td>2082-03-21</td>\n",
       "      <td>35</td>\n",
       "      <td>PROTESTANT QUAKER</td>\n",
       "      <td>DIVORCED</td>\n",
       "      <td>White</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>19215</td>\n",
       "      <td>2150-04-21 17:30:00</td>\n",
       "      <td>2150-04-17 15:34:00</td>\n",
       "      <td>54610</td>\n",
       "      <td>100003.0</td>\n",
       "      <td>Admission Date:  [**2150-4-17**]              ...</td>\n",
       "      <td>1</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>NOT SPECIFIED</td>\n",
       "      <td>SINGLE</td>\n",
       "      <td>M</td>\n",
       "      <td>2090-05-19</td>\n",
       "      <td>60</td>\n",
       "      <td>Unknown</td>\n",
       "      <td>SINGLE</td>\n",
       "      <td>White</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8772</td>\n",
       "      <td>2108-04-18 17:18:00</td>\n",
       "      <td>2108-04-06 15:49:00</td>\n",
       "      <td>9895</td>\n",
       "      <td>100006.0</td>\n",
       "      <td>Admission Date:  [**2108-4-6**]       Discharg...</td>\n",
       "      <td>2</td>\n",
       "      <td>BLACK/AFRICAN AMERICAN</td>\n",
       "      <td>Private</td>\n",
       "      <td>NOT SPECIFIED</td>\n",
       "      <td>SINGLE</td>\n",
       "      <td>F</td>\n",
       "      <td>2059-05-07</td>\n",
       "      <td>49</td>\n",
       "      <td>Unknown</td>\n",
       "      <td>SINGLE</td>\n",
       "      <td>Black</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>50238</td>\n",
       "      <td>2145-04-07 12:40:00</td>\n",
       "      <td>2145-03-31 05:33:00</td>\n",
       "      <td>23018</td>\n",
       "      <td>100007.0</td>\n",
       "      <td>Admission Date:  [**2145-3-31**]              ...</td>\n",
       "      <td>2</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>JEWISH</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>F</td>\n",
       "      <td>2071-06-04</td>\n",
       "      <td>74</td>\n",
       "      <td>JEWISH</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>White</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>21119</td>\n",
       "      <td>2162-05-21 13:37:00</td>\n",
       "      <td>2162-05-16 15:56:00</td>\n",
       "      <td>533</td>\n",
       "      <td>100009.0</td>\n",
       "      <td>Admission Date:  [**2162-5-16**]              ...</td>\n",
       "      <td>1</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>M</td>\n",
       "      <td>2101-07-30</td>\n",
       "      <td>61</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>White</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ROW_ID           DISCHTIME           ADMITTIME  SUBJECT_ID   HADM_ID  \\\n",
       "0   42102 2117-09-17 16:45:00 2117-09-11 11:46:00       58526  100001.0   \n",
       "1   19215 2150-04-21 17:30:00 2150-04-17 15:34:00       54610  100003.0   \n",
       "2    8772 2108-04-18 17:18:00 2108-04-06 15:49:00        9895  100006.0   \n",
       "3   50238 2145-04-07 12:40:00 2145-03-31 05:33:00       23018  100007.0   \n",
       "4   21119 2162-05-21 13:37:00 2162-05-16 15:56:00         533  100009.0   \n",
       "\n",
       "                                                TEXT  LOS_label  \\\n",
       "0  Admission Date:  [**2117-9-11**]              ...          1   \n",
       "1  Admission Date:  [**2150-4-17**]              ...          1   \n",
       "2  Admission Date:  [**2108-4-6**]       Discharg...          2   \n",
       "3  Admission Date:  [**2145-3-31**]              ...          2   \n",
       "4  Admission Date:  [**2162-5-16**]              ...          1   \n",
       "\n",
       "                ETHNICITY INSURANCE           RELIGION MARITAL_STATUS GENDER  \\\n",
       "0                   WHITE   Private  PROTESTANT QUAKER       DIVORCED      F   \n",
       "1                   WHITE   Private      NOT SPECIFIED         SINGLE      M   \n",
       "2  BLACK/AFRICAN AMERICAN   Private      NOT SPECIFIED         SINGLE      F   \n",
       "3                   WHITE   Private             JEWISH        MARRIED      F   \n",
       "4                   WHITE   Private           CATHOLIC        MARRIED      M   \n",
       "\n",
       "         DOB  age       New_RELIGION New_MARITAL_STATUS ethnicity_group  \n",
       "0 2082-03-21   35  PROTESTANT QUAKER           DIVORCED           White  \n",
       "1 2090-05-19   60            Unknown             SINGLE           White  \n",
       "2 2059-05-07   49            Unknown             SINGLE           Black  \n",
       "3 2071-06-04   74             JEWISH            MARRIED           White  \n",
       "4 2101-07-30   61           CATHOLIC            MARRIED           White  "
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "notes_adm_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "id": "358c7331-22f9-4880-a0e3-2f12e408c34c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "64.0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "age_median_gp\n",
       "1    21122\n",
       "0    20688\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Calculate the median age\n",
    "median_age = notes_adm_df['age'].median()\n",
    "print(median_age)\n",
    "\n",
    "notes_adm_df['age_median_gp'] = notes_adm_df['age'].apply(lambda x: 0 if x < median_age else 1)\n",
    "\n",
    "notes_adm_df['age_median_gp'].value_counts()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "id": "468bdec7-8ce9-4456-bc21-d55fdd4d4923",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA1sAAAIjCAYAAAD1OgEdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy88F64QAAAACXBIWXMAAA9hAAAPYQGoP6dpAABgBklEQVR4nO3dd3hUZf738c9kUgkpBJIAhkDooCJIzaIrKIKaZRfFFRUUKSpNKYquaxeUBRcsK4KFoo8oyK7ujwUsCIKLogLS0aAQQk2BQIb0ZOY8f7AZGBIg7XAmyft1XblkvueeM98755jMJ6eMzTAMQwAAAACAKuVjdQMAAAAAUBMRtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAC1yv79+2Wz2bRw4ULTX2vhwoWy2Wzav3+/u9asWTP94Q9/MP21JWnt2rWy2Wxau3btJXk9AIAnwhYAoNzefPNN2Ww2de/e3epWZLPZ3F++vr6KiIhQ586dNX78eO3evbvKXufNN9+8JAGtIry5NwCozWyGYRhWNwEAqF569uypI0eOaP/+/fr111/VsmVLy3qx2Wy68cYbde+998owDGVmZmrbtm1aunSpsrOzNX36dE2aNMk93jAM5efny8/PT3a7vcyvc8UVV6hBgwblOkrkdDpVWFiogIAA2Ww2SaePbF1xxRVavnx5mddT0d5cLpcKCgrk7+8vHx/+vgoAlxo/eQEA5ZKUlKTvvvtOs2bNUmRkpBYtWmR1S2rdurWGDBmie+65R+PGjdM777yjvXv3qmvXrnrkkUe0cuVK91ibzabAwMByBa3yys7OliTZ7XYFBga6g9al5uPjo8DAQIIWAFiEn74AgHJZtGiR6tWrp4SEBN1+++3nDVvHjx/XPffco9DQUIWHh2vo0KHatm1bqddL/fLLL7r99tsVERGhwMBAdenSRcuWLatUn/Xr19fixYvl6+urF1980V0v7ZqtlJQUDRs2TDExMQoICFCjRo30pz/9yX2tVbNmzbRr1y6tW7fOfcpir169JJ25LmvdunUaM2aMoqKiFBMT47Hs7Gu2in355Zfq2LGjAgMD1b59e33yyScey5977rlSQ9q567xQb+e7Zmvp0qXq3LmzgoKC1KBBAw0ZMkSHDx/2GHPfffepbt26Onz4sAYMGKC6desqMjJSjz76qJxO50W++wAASfK1ugEAQPWyaNEi3XbbbfL399ddd92lOXPmaOPGjeratat7jMvlUv/+/fXjjz9q9OjRatu2rf7v//5PQ4cOLbG+Xbt2qWfPnrrsssv0l7/8RcHBwfr44481YMAA/etf/9Ktt95a4V5jY2N13XXX6euvv5bD4VBoaGip4wYOHKhdu3bpoYceUrNmzZSWlqZVq1bpwIEDatasmV599VU99NBDqlu3rp588klJUnR0tMc6xowZo8jISD3zzDPuI1vn8+uvv2rQoEEaNWqUhg4dqgULFujPf/6zPv/8c914443lmmNZejvbwoULNWzYMHXt2lXTpk1TamqqXnvtNX377bfasmWLwsPD3WOdTqf69eun7t276+9//7u++uorzZw5Uy1atNDo0aPL1ScA1EoGAABltGnTJkOSsWrVKsMwDMPlchkxMTHG+PHjPcb961//MiQZr776qrvmdDqN66+/3pBkLFiwwF2/4YYbjCuvvNLIy8tz11wul/G73/3OaNWq1UV7kmSMHTv2vMvHjx9vSDK2bdtmGIZhJCUlefRw4sQJQ5Lx8ssvX/B1Lr/8cuO6664rUV+wYIEhybjmmmuMoqKiUpclJSW5a02bNjUkGf/617/ctczMTKNRo0ZGp06d3LVnn33WKO3XdGnrPF9vX3/9tSHJ+Prrrw3DMIyCggIjKirKuOKKK4zc3Fz3uOXLlxuSjGeeecZdGzp0qCHJeOGFFzzW2alTJ6Nz584lXgsAUBKnEQIAymzRokWKjo5W7969JZ2+/mnQoEFavHixx6lln3/+ufz8/HT//fe7az4+Pho7dqzH+jIyMrRmzRrdcccdOnXqlI4dO6Zjx47p+PHj6tevn3799dcSp7eVV926dSVJp06dKnV5UFCQ/P39tXbtWp04caLCr3P//feX+Tqwxo0bexyxCw0N1b333qstW7YoJSWlwj1czKZNm5SWlqYxY8YoMDDQXU9ISFDbtm21YsWKEs8ZNWqUx+Nrr71W+/btM61HAKhJCFsAgDJxOp1avHixevfuraSkJP3222/67bff1L17d6Wmpmr16tXuscnJyWrUqJHq1KnjsY5z71r422+/yTAMPf3004qMjPT4evbZZyVJaWlpleo7KytLkhQSElLq8oCAAE2fPl2fffaZoqOj9fvf/14zZswod+iJi4sr89iWLVuWuB6rdevWklTq9V1VJTk5WZLUpk2bEsvatm3rXl4sMDBQkZGRHrV69epVKpQCQG3CNVsAgDJZs2aNjh49qsWLF2vx4sUlli9atEh9+/Yt1zpdLpck6dFHH1W/fv1KHVPZ28rv3LlTdrv9gmFowoQJ6t+/v/7973/riy++0NNPP61p06ZpzZo16tSpU5leJygoqFJ9nut8dzC8lDenMPOOjQBQGxC2AABlsmjRIkVFRWn27Nklln3yySf69NNPNXfuXAUFBalp06b6+uuvlZOT43F067fffvN4XvPmzSVJfn5+6tOnT5X3fODAAa1bt07x8fHnPbJVrEWLFnrkkUf0yCOP6Ndff1XHjh01c+ZMffDBB5LOH34qoviI3tnr3LNnj6TTdxeUTh9BkqSTJ0963LTi3KNP5emtadOmkqTExERdf/31HssSExPdywEAVYPTCAEAF5Wbm6tPPvlEf/jDH3T77beX+Bo3bpxOnTrlvl17v379VFhYqHfeece9DpfLVSKoRUVFqVevXnrrrbd09OjREq+bnp5e4Z4zMjJ01113yel0uu/SV5qcnBzl5eV51Fq0aKGQkBDl5+e7a8HBwTp58mSF+znbkSNH9Omnn7ofOxwOvf/+++rYsaMaNmzo7kGSvvnmG/e47OxsvffeeyXWV9beunTpoqioKM2dO9djbp999pl+/vlnJSQkVHRKAIBScGQLAHBRy5Yt06lTp/THP/6x1OU9evRwf8DxoEGDNGDAAHXr1k2PPPKIfvvtN7Vt21bLli1TRkaGJM8jMbNnz9Y111yjK6+8Uvfff7+aN2+u1NRUbdiwQYcOHdK2bdsu2t+ePXv0wQcfyDAMORwObdu2TUuXLlVWVpZmzZqlm2666YLPveGGG3THHXeoffv28vX11aeffqrU1FTdeeed7nGdO3fWnDlzNHXqVLVs2VJRUVEljg6VVevWrTVixAht3LhR0dHRmj9/vlJTU7VgwQL3mL59+yo2NlYjRozQ5MmTZbfbNX/+fEVGRurAgQMe6ytrb35+fpo+fbqGDRum6667TnfddZf71u/NmjXTxIkTKzQfAMB5WHw3RABANdC/f38jMDDQyM7OPu+Y++67z/Dz8zOOHTtmGIZhpKenG3fffbcREhJihIWFGffdd5/x7bffGpKMxYsXezx37969xr333ms0bNjQ8PPzMy677DLjD3/4g/HPf/7zor1Jcn/5+PgY4eHhRqdOnYzx48cbu3btKjH+3Fu/Hzt2zBg7dqzRtm1bIzg42AgLCzO6d+9ufPzxxx7PS0lJMRISEoyQkBBDkvtW68W3Yt+4cWOJ1zrfrd8TEhKML774wujQoYMREBBgtG3b1li6dGmJ52/evNno3r274e/vb8TGxhqzZs0qdZ3n6+3cW78XW7JkidGpUycjICDAiIiIMAYPHmwcOnTIY8zQoUON4ODgEj2d75b0AICSbIZhGNbEPABAbfPvf/9bt956q9avX6+ePXta3Q4AAKYibAEATJGbm+txhz6n06m+fftq06ZNSklJqfK79wEA4G24ZgsAYIqHHnpIubm5io+PV35+vj755BN99913eumllwhaAIBagSNbAABTfPjhh5o5c6Z+++035eXlqWXLlho9erTGjRtndWsAAFwShC0AAAAAMAGfswUAAAAAJiBsAQAAAIAJuEFGGbhcLh05ckQhISEeH8QJAAAAoHYxDEOnTp1S48aN5eNz4WNXhK0yOHLkiJo0aWJ1GwAAAAC8xMGDBxUTE3PBMYStMggJCZF0+hsaGhpqcTcAAKBGa9tWOnpUatRI+uUXq7sBcA6Hw6EmTZq4M8KFELbKoPjUwdDQUMIWAAAwV/FpST4+Eu87AK9VlsuLuEEGAAAAAJiAsAUAAAAAJiBsAQAAAIAJuGarihiGoaKiIjmdTqtbATzY7Xb5+vrysQUAUF1s3Cg5nZLdbnUnACqJsFUFCgoKdPToUeXk5FjdClCqOnXqqFGjRvL397e6FQDAxTRqZHUHAKoIYauSXC6XkpKSZLfb1bhxY/n7+3MEAV7DMAwVFBQoPT1dSUlJatWq1UU/fA8AAABVg7BVSQUFBXK5XGrSpInq1KljdTtACUFBQfLz81NycrIKCgoUGBhodUsAAAC1AmGrinC0AN6M/RMAqpG335aysqS6daUHHrC6GwCVQNgCAADwJi+8IB0+LF12GWELqOb4czcAAAAAmICwBVOtXbtWNptNJ0+elCQtXLhQ4eHhlvYEAAAAXAqErVrsvvvuk81m06hRo0osGzt2rGw2m+67774qfc1BgwZpz549VbrOC9mwYYPsdrsSEhIu2WtWhfz8fD355JNq2rSpAgIC1KxZM82fP7/UsYsXL5bNZtOAAQMubZMAAAC4IMJWLdekSRMtXrxYubm57lpeXp4+/PBDxcbGVvnrBQUFKSoqqsrXez7z5s3TQw89pG+++UZHjhy5ZK9bWXfccYdWr16tefPmKTExUR999JHatGlTYtz+/fv16KOP6tprr7WgSwAAAFwIYauWu/rqq9WkSRN98skn7tonn3yi2NhYderUyWOsy+XStGnTFBcXp6CgIF111VX65z//6TFm5cqVat26tYKCgtS7d2/t37/fY/m5pxHu3btXf/rTnxQdHa26deuqa9eu+uqrrzye06xZM7300ksaPny4QkJCFBsbq7fffvuic8vKytKSJUs0evRoJSQkaOHChSXGLFu2TK1atVJgYKB69+6t9957z+O0R0lav369rr32WgUFBalJkyZ6+OGHlZ2dfdHXr6jPP/9c69at08qVK9WnTx81a9ZM8fHx6tmzp8c4p9OpwYMH6/nnn1fz5s1N6wcAAAAVQ9gy06xZUkzMxb/++MeSz/3jH8v23FmzKt3m8OHDtWDBAvfj+fPna9iwYSXGTZs2Te+//77mzp2rXbt2aeLEiRoyZIjWrVsnSTp48KBuu+029e/fX1u3btXIkSP1l7/85YKvnZWVpVtuuUWrV6/Wli1bdNNNN6l///46cOCAx7iZM2eqS5cu2rJli8aMGaPRo0crMTHxguv++OOP1bZtW7Vp00ZDhgzR/PnzZRiGe3lSUpJuv/12DRgwQNu2bdODDz6oJ5980mMde/fu1U033aSBAwdq+/btWrJkidavX69x48ad93X/+9//qm7duhf8WrRo0Xmfv2zZMnXp0kUzZszQZZddptatW+vRRx/1OPooSS+88IKioqI0YsSIC34fAAAAYA1u/W4mh+P0rVsvpkmTkrX09LI91+Eof1/nGDJkiJ544gklJydLkr799lstXrxYa9eudY/Jz8/XSy+9pK+++krx8fGSpObNm2v9+vV66623dN1112nOnDlq0aKFZs6cKUlq06aNduzYoenTp5/3ta+66ipdddVV7sdTpkzRp59+qmXLlnkEmltuuUVjxoyRJD3++ON65ZVX9PXXX5d6al2xefPmaciQIZKkm266SZmZmVq3bp169eolSXrrrbfUpk0bvfzyy+5+d+7cqRdffNG9jmnTpmnw4MGaMGGCJKlVq1Z6/fXX3fMt7QOCu3Tpoq1bt563L0mKjo4+77J9+/Zp/fr1CgwM1Keffqpjx45pzJgxOn78uDsUr1+/XvPmzbvo6wAAAMA6hC0zhYae/oyMi4mMLL1WlueGhpa/rxIvFek+zc4wDCUkJKhBgwYeY3777Tfl5OToxhtv9KgXFBS4Tzf8+eef1b17d4/lxcHsfLKysvTcc89pxYoVOnr0qIqKipSbm1viyFaHDh3c/7bZbGrYsKHS0tLOu97ExET9+OOP+vTTTyVJvr6+GjRokObNm+cOW4mJieratavH87p16+bxeNu2bdq+fbvHkSjDMORyuZSUlKR27dqVeO2goCC1bNnygvO+EJfLJZvNpkWLFiksLEySNGvWLN1+++168803VVRUpHvuuUfvvPNOie0EAAAA70HYMtOkSae/KmLZsqrt5SKGDx/uPpI0e/bsEsuzsrIkSStWrNBl54TAgICACr/uo48+qlWrVunvf/+7WrZsqaCgIN1+++0qKCjwGOfn5+fx2GazyeVynXe98+bNU1FRkRo3buyuGYahgIAAvfHGG+4QczFZWVl68MEH9fDDD5dYdr4biPz3v//VzTfffMH1vvXWWxo8eHCpyxo1aqTLLrvMo8d27drJMAwdOnRI2dnZ2r9/v/r37+9eXvy98PX1VWJiolq0aHHRuQEAvFTr1lJYmHSBsyAAVA+ELUg6fZpdQUGBbDab+vXrV2J5+/btFRAQoAMHDui6664rdR3t2rXTsnNC4vfff3/B1/32229133336dZbb5V0Otyce1ON8ioqKtL777+vmTNnqm/fvh7LBgwYoI8++kijRo1SmzZttHLlSo/lGzdu9Hh89dVXa/fu3eU6UlXZ0wh79uyppUuXKisrS3Xr1pUk7dmzRz4+PoqJiZHNZtOOHTs8nvPUU0/p1KlTeu2119SktNNSAaCWSU9Pl6MKTrWvCqGhoYos7SyW81mzxrxmAFxShC1Ikux2u37++Wf3v88VEhKiRx99VBMnTpTL5dI111yjzMxMffvttwoNDdXQoUM1atQozZw5U5MnT9bIkSO1efPmUu8AeLZWrVrpk08+Uf/+/WWz2fT0009f8IhVWSxfvlwnTpzQiBEjShzBGjhwoObNm6dRo0bpwQcf1KxZs/T4449rxIgR2rp1q7tfm80m6fT1YT169NC4ceM0cuRIBQcHa/fu3Vq1apXeeOONUl+/sqcR3n333ZoyZYqGDRum559/XseOHdPkyZM1fPhwBQUFSZKuuOIKj+cU3+Hx3DoA1Ebp6ekaMmykMk7lWN2KJCkipI4+WPBu+QIXLilvCudSBQI6vBZhC26hF7n+a8qUKYqMjNS0adO0b98+hYeH6+qrr9Zf//pXSadPq/vXv/6liRMn6h//+Ie6devmvmX7+cyaNUvDhw/X7373OzVo0ECPP/54pX/YzZs3T3369Cn1VMGBAwdqxowZ2r59uzp06KB//vOfeuSRR/Taa68pPj5eTz75pEaPHu0+NbJDhw5at26dnnzySV177bUyDEMtWrTQoEGDKtXjhdStW1erVq3SQw89pC5duqh+/fq64447NHXqVNNeEwBqEofDoYxTOYqMH6jgCGtPxcvOSFX6hn/J4XDw5tlLeVs4lwjoNYnNOPte2CiVw+FQWFiYMjMzSwSSvLw8JSUlKS4urtQ706F6efHFFzV37lwdPHjQ6laqFPspgNpk7969unP4KDVLGKPQqBhLe3GkHdL+FW9q8fy5XE/rpYr3F28I59KZgM4+470ulA3OxZEt1Gpvvvmmunbtqvr16+vbb7/Vyy+/fMHP0AIAwHSDB0vHjkkNGkgX+FzGivCm0+W87VS54Ihoy8N5sXSrG0CVIWyhVvv11181depUZWRkKDY2Vo888oieeOIJq9sCANRm69ad/qzNsnwETDl42+lynCqH2oCwhVrtlVde0SuvvGJ1GwAAmI5r2YBLj7AFAABQi3jL6XKcKofawMfqBmoK7jMCb8b+CQAAcOkRtirJz89PkpST4x3nPwOlKd4/i/dXAAAAmI/TCCvJbrcrPDxcaWlpkqQ6deq4PxAXsJphGMrJyVFaWprCw8NL/cBqAAAAmIOwVQUaNmwoSe7ABXib8PBw934KAACAS4OwVQVsNpsaNWqkqKgoFRYWWt0O4MHPz48jWgAAABYgbFUhu93Om1oAAAAAkghbAAAA3uX++6XMTCkszOpOAFQSYQsAAMCbPPus1R0AqCLc+h0AAAAATEDYAgAAAAATELYAAAAAwASELQAAAG8SEyPZbKf/C6BaI2wBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACXytbgAAAABn+eADKT9fCgiwuhMAlUTYAgAA8Ca9elndAYAqwmmEAAAAAGACwhYAAAAAmIDTCAEAALzJ2rVnrtnilEKgWiNsAQAAeJMhQ6TDh6XLLpMOHbK6GwCVwGmEAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYAJfqxsAAADAWQ4dsroDAFWEI1sAAAAAYALCFgAAAACYgLAFAAAAACbgmi0AAABv8vzzUmamFBYmPfus1d0AqATCFgAAgDd55x3p8GHpsssIW0A15zWnEf7tb3+TzWbThAkT3LW8vDyNHTtW9evXV926dTVw4EClpqZ6PO/AgQNKSEhQnTp1FBUVpcmTJ6uoqMhjzNq1a3X11VcrICBALVu21MKFCy/BjAAAAADUZl4RtjZu3Ki33npLHTp08KhPnDhR//nPf7R06VKtW7dOR44c0W233eZe7nQ6lZCQoIKCAn333Xd67733tHDhQj3zzDPuMUlJSUpISFDv3r21detWTZgwQSNHjtQXX3xxyeYHAAAAoPaxPGxlZWVp8ODBeuedd1SvXj13PTMzU/PmzdOsWbN0/fXXq3PnzlqwYIG+++47ff/995KkL7/8Urt379YHH3ygjh076uabb9aUKVM0e/ZsFRQUSJLmzp2ruLg4zZw5U+3atdO4ceN0++2365VXXrFkvgAAAABqB8uv2Ro7dqwSEhLUp08fTZ061V3fvHmzCgsL1adPH3etbdu2io2N1YYNG9SjRw9t2LBBV155paKjo91j+vXrp9GjR2vXrl3q1KmTNmzY4LGO4jFnn654rvz8fOXn57sfOxwOSVJRUZH7FEUfHx/5+PjI5XLJ5XK5xxbXnU6nDMO4aN1ut8tms5U49dFut0s6ffSuLHVfX18ZhuFRt9lsstvtJXo8X505MSfmxJyYE3OqCXMq/rePTbLrzHinbJJsHrUzdckuo4x1H0mGR92QTS7ZZJMhn7PqPrYzPZ39vbzgnNzrlJznvO+ozHY6+3VKfg/KPqfz1V2yybhA3UeGbP+r222nt3dxj1buey6Xy72OyszpQvXy7Hv28+wz5ZnTher8jKj8nM5dfiGWhq3Fixfrp59+0saNG0ssS0lJkb+/v8LDwz3q0dHRSklJcY85O2gVLy9edqExDodDubm5CgoKKvHa06ZN0/PPP1+ivmXLFgUHB0uSIiMj1aJFCyUlJSk9Pd09JiYmRjExMdqzZ48yMzPd9ebNmysqKko7d+5Ubm6uu962bVuFh4dry5YtHjtZhw4d5O/vr02bNnn00KVLFxUUFGj79u3umt1uV9euXZWZmalffvnFXQ8KCtJVV12lY8eOad++fe56WFiY2rVrpyNHjujQWZ9Sz5yYE3NiTsyJOdWEOQUGBkqSOkfaFBZ8wl1PzAtRptNfHYNPery53ZETpgLDR53PGitJm7Pryd/m0pV1zqzbKZs2Z0cozF6oNoGn3PVcl107csPVwDdfcQHZ7nqa06Z9On3GztnfswvO6X//Ligo0Jb/PacqtlNOTo58fX1Vx1cecy3vnDKdfkrMC1Vjv1xd5n+ml/SiACXl11WzgGxF+p75o/XhgiAdLqyjVoGnFGYvlCQV+dlkxJ6eqdX7Xk5Ojq7p3lnJUqXmJElJ+cFKLwrU5UGZCvI503t59r0iP5uSfX1VWFjoMVd+RnjPnLKzz/z/cDE24+w4dwkdPHhQXbp00apVq9zXavXq1UsdO3bUq6++qg8//FDDhg3zOMIkSd26dVPv3r01ffp0PfDAA0pOTva4/ionJ0fBwcFauXKlbr75ZrVu3VrDhg3TE0884R6zcuVKJSQkKCcnp9SwVdqRrSZNmuj48eMKDQ2VxF8FmBNzYk7MiTkxJ2+d0/79+3XXiNFq/ocxCots7K5bcWQrM/2I9i1/Ux/Nm6NmzZqVbU6xsdLhwzIuu0zO/fsvONfybKekpCTdPXKMmiWMUb2oxh7jL/WRLUf6Ee1bMcf9fbFy30tKStI9Dz6kJjc9qLCoyyw/suVIP6K9pewz5ZnTher8jKj8nBwOh+rXr6/MzEx3Njgfy45sbd68WWlpabr66qvdNafTqW+++UZvvPGGvvjiCxUUFOjkyZMeR7dSU1PVsGFDSVLDhg31448/eqy3+G6FZ4859w6GqampCg0NLTVoSVJAQIACAgJK1H19feXr6/ktK95Y5yreKGWtn7veitRtNlup9fP1WN46c2JO56szJ+YkMafz9VjeOnOq/JyKT09zGcUhwlNptdN1WznqtlLrxjl1l3Gmp/J8D06/QsnvZ2W209mvU/r3oGxzqmjd9b/AIUlO48zpnlbve8Vvui/Ue1nmVJZ6WfY950X2GX5GWD+n8y0vjWU3yLjhhhu0Y8cObd261f3VpUsXDR482P1vPz8/rV692v2cxMREHThwQPHx8ZKk+Ph47dixQ2lpae4xq1atUmhoqNq3b+8ec/Y6iscUrwMAAAAAzGDZka2QkBBdccUVHrXg4GDVr1/fXR8xYoQmTZqkiIgIhYaG6qGHHlJ8fLx69OghSerbt6/at2+ve+65RzNmzFBKSoqeeuopjR071n1katSoUXrjjTf02GOPafjw4VqzZo0+/vhjrVix4tJOGAAAoCyuu046dkxq0MDqTgBUkuV3I7yQV155RT4+Pho4cKDy8/PVr18/vfnmm+7ldrtdy5cv1+jRoxUfH6/g4GANHTpUL7zwgntMXFycVqxYoYkTJ+q1115TTEyM3n33XfXr18+KKQEAAFzYokVWdwCginhV2Fq7dq3H48DAQM2ePVuzZ88+73OaNm2qlStXXnC9vXr10pYtW6qiRQAAAAAoE8s/1BgAAAAAaiLCFgAAAACYgLAFAADgTa6/Xrr88tP/BVCtedU1WwAAALXenj3S4cNSZqbVnQCoJI5sAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAm4EONAQAAvMkzz0hZWVLdulZ3AqCSCFsAAADe5IEHrO4AQBXhNEIAAAAAMAFhCwAAAABMwGmEAAAA3uToUcnplOx2qVEjq7sBUAmELQAAABMVFhQoOTm5zOOb9uwp39RUFUVHK/nbb6usj+TkZBUVFlXZ+gBcHGELAADAJPlZmdqftE8T/vqcAgICyvSc5SdOKlrS8RMndefwUVXWS15ujg4dPqrYwsIqWyeACyNsAQAAmKQwP1cum68a9LhN9Rs3LdNzfLf+JBXkyzcwWM0SxlRZL2l7dyr54Hw5iwhbwKVC2AIAADBZnXqRCo2KKdNYm93X/d+yPqcsso6nVNm6AJQNdyMEAAAAABMQtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwATc+h0AAMCL/H3ybNldRXL68DYNqO74vxgAAMCLpDYq24cfA/B+nEYIAAAAACYgbAEAAACACTiNEAAAwIt03/C5/AvyVOAfqB/ib7K6HQCVQNgCAADwIrcvfUMRJ9KUUS+KsAVUc5xGCAAAAAAmIGwBAAAAgAkIWwAAAABgAq7ZAgAAwCVXWFCg5ORkq9tQcnKyigqLrG4DNRRhCwAAAJdUflam9ift04S/PqeAgABLe8nLzdGhw0cVW1hoaR+omQhbAAAAuKQK83PlsvmqQY/bVL9xU0t7Sdu7U8kH58tZRNhC1SNsAQAAwBJ16kUqNCrG0h6yjqdY+vqo2bhBBgAAAACYgCNbAAAAXsQRFuHxXwDVF2ELAADAi0x59n2rWwBQRTiNEAAAAABMQNgCAAAAABMQtgAAAADABFyzBQAA4EXuWThNdbMzlRUcpv933xNWtwOgEghbAAAAXqTD9m8VcSJNGfWirG4FQCVxGiEAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJ+JwtAACqqfT0dDkcDqvbkCSFhoYqMjLS6jYAwKsQtgAAqIbS09M1ZNhIZZzKsboVSVJESB19sOBdAlcV+LF7X9XJdignONTqVgBUEmELAIBqyOFwKONUjiLjByo4ItrSXrIzUpW+4V9yOByErSqwdNDDVrcAoIoQtgAAqMaCI6IVGhVjdRtKt7oBAPBC3CADAAAAAExA2AIAAAAAE3AaIQAAgBeZ+sSfFX7ymE6GN9BT05Za3Q6ASuDIFgAAgBcJyM9VUF62AvJzrW4FQCURtgAAAADABIQtAAAAADAB12wBAAAAXqSwoEDJyclWtyFJCg0N5fPzKoGwBQAAAHiJ/KxM7U/apwl/fU4BAQFWt6OIkDr6YMG7BK4KImwBAAAAXqIwP1cum68a9LhN9Rs3tbSX7IxUpW/4lxwOB2GrgghbAAAAgJepUy9SoVExVrehdKsbqOa4QQYAAAAAmICwBQAAAAAm4DRCAAAAL/L/7n1c/oX5KvCz/uYIACqHsAUAACrNm25VnZycrKLCIqvbqLDtHa+1ugUAVYSwBQAAKsXbblWdl5ujQ4ePKraw0OpWANRyhC0AAFAp3nSraklK27tTyQfny1lE2AJgLcIWAACoEt5yq+qs4ylWt1ApTff/LN+iQhX5+im5WTur2wFQCYQtAAAALzLu9cmKOJGmjHpRmjxrudXtAKgEbv0OAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJrA0bM2ZM0cdOnRQaGioQkNDFR8fr88++8y9PC8vT2PHjlX9+vVVt25dDRw4UKmpqR7rOHDggBISElSnTh1FRUVp8uTJKioq8hizdu1aXX311QoICFDLli21cOHCSzE9AACAcnv6xSUa++YaPf3iEqtbAVBJloatmJgY/e1vf9PmzZu1adMmXX/99frTn/6kXbt2SZImTpyo//znP1q6dKnWrVunI0eO6LbbbnM/3+l0KiEhQQUFBfruu+/03nvvaeHChXrmmWfcY5KSkpSQkKDevXtr69atmjBhgkaOHKkvvvjiks8XAADgYvKCgpUXVFd5QcFWtwKgknytfPH+/ft7PH7xxRc1Z84cff/994qJidG8efP04Ycf6vrrr5ckLViwQO3atdP333+vHj166Msvv9Tu3bv11VdfKTo6Wh07dtSUKVP0+OOP67nnnpO/v7/mzp2ruLg4zZw5U5LUrl07rV+/Xq+88or69et3yecMAAAAoHawNGydzel0aunSpcrOzlZ8fLw2b96swsJC9enTxz2mbdu2io2N1YYNG9SjRw9t2LBBV155paKjo91j+vXrp9GjR2vXrl3q1KmTNmzY4LGO4jETJkw4by/5+fnKz893P3Y4HJKkoqIi9ymKPj4+8vHxkcvlksvlco8trjudThmGcdG63W6XzWYrceqj3W53f1/KUvf19ZVhGB51m80mu91eosfz1ZkTc2JOzIk5VZ85Ff/bJsmus+qyyZBNNhnykVGi7iNDtjLUnbJJsnms+0xdsp811tfHdmZuJcb7SDI8xhuyyVVKj+erl3dOxd3YbZ79lGdOF66XfU5225me2E5nevf1scnH5/QJVpWZ04XrZZuTr49Nfr6n3xJXdt+radvJbjv9M0iSV/zc85af5ecuvxDLw9aOHTsUHx+vvLw81a1bV59++qnat2+vrVu3yt/fX+Hh4R7jo6OjlZKSIklKSUnxCFrFy4uXXWiMw+FQbm6ugoKCSvQ0bdo0Pf/88yXqW7ZsUXDw6UP6kZGRatGihZKSkpSenu4eExMTo5iYGO3Zs0eZmZnuevPmzRUVFaWdO3cqNzfXXW/btq3Cw8O1ZcsWj52sQ4cO8vf316ZNmzx66NKliwoKCrR9+3Z3zW63q2vXrsrMzNQvv/zirgcFBemqq67SsWPHtG/fPnc9LCxM7dq105EjR3To0CF3nTkxJ+bEnJhT9ZlT8RuKpiFSy+AT7np6UYCS8uuqWUC2In3P/OHwcEGQDhfWUavAUwqzF7rrSfnBSi8K1OVBmQryOdN7Yl6IMp3+6hh80uON3Y6cMBUYPup81mvmta6nDX5+CvazedSdsmlzdoTC7IVqE3jKXc912bUjN1wNfPMVF5Dtrmc6/ZSYF6rGfrm6zP/M97e8czLCAiRJPS4LUMRZ/ZRnTpK0Obue/G0uXVnnzD5Q3jlFx4ToG0nNw33Vvozb6fLlnyqmMEO24EDt+MPpyydq2nbKa11Px9u0kqRKzakqtlNe63qqf/ONOiZVet+radupyM8mIzZGkrzi5563/CzPzj7zfb4Ym3F2nLNAQUGBDhw4oMzMTP3zn//Uu+++q3Xr1mnr1q0aNmyYxxEmSerWrZt69+6t6dOn64EHHlBycrLH9Vc5OTkKDg7WypUrdfPNN6t169YaNmyYnnjiCfeYlStXKiEhQTk5OaWGrdKObDVp0kTHjx9XaGiopOr911Bv+asAc2JOzIk5MaeKzykpKUl3jxyjuIQxCo9q7K5b8Zf4o4lbtG7hdP1+1FQ1btrynPGX/sjWoZ9/0vr3Z6j36KmKjj3TjxVHto788pO+eW+Grhs1VY3O+t5caE4zJvVXxIk0nagXqcdn/eeCc62u2+lo4hb99/2Xdc0DL6hR01aWHtk6mrhF330wU/Ejn1PDpq0sP7LlTdvJkX5E+1bM0Ufz5qhZs2aW/9zzlp/lDodD9evXV2ZmpjsbnI/lR7b8/f3VsuXpHalz587auHGjXnvtNQ0aNEgFBQU6efKkx9Gt1NRUNWzYUJLUsGFD/fjjjx7rK75b4dljzr2DYWpqqkJDQ0sNWpIUEBCggICAEnVfX1/5+np+y4o31rmKN0pZ6+eutyJ1m81Wav18PZa3zpyY0/nqzIk5SczpfD2Wt17WORX/21DxGzBPhmzuN3Jnc/3vTV9Z66Wt+3T9zNgi19lvKksbX3ov5+uxvPVzey/uxmmU3k9Z5nTxetl6dBpneirPdjqzzPM5NWU7FbkM95viyszp4vWL91jkMlT4vzfXld33LlavbtvJacgdRrzh515F61X9s/x8y0vjdZ+z5XK5lJ+fr86dO8vPz0+rV692L0tMTNSBAwcUHx8vSYqPj9eOHTuUlpbmHrNq1SqFhoaqffv27jFnr6N4TPE6AAAAAMAMlh7ZeuKJJ3TzzTcrNjZWp06d0ocffqi1a9fqiy++UFhYmEaMGKFJkyYpIiJCoaGheuihhxQfH68ePXpIkvr27av27dvrnnvu0YwZM5SSkqKnnnpKY8eOdR+ZGjVqlN544w099thjGj58uNasWaOPP/5YK1assHLqAAAAAGo4S8NWWlqa7r33Xh09elRhYWHq0KGDvvjiC914442SpFdeeUU+Pj4aOHCg8vPz1a9fP7355pvu59vtdi1fvlyjR49WfHy8goODNXToUL3wwgvuMXFxcVqxYoUmTpyo1157TTExMXr33Xe57TsAAAAAU1katubNm3fB5YGBgZo9e7Zmz5593jFNmzbVypUrL7ieXr16acuWLRXqEQAAAAAqwuuu2QIAAACAmoCwBQAAAAAmIGwBAAAAgAks/5wtAAAAnHGgaRudiIjSqZB6VrcCoJIIWwAAAF7kH+NnWt0CgCrCaYQAAAAAYALCFgAAAACYgLAFAAAAACbgmi0AAAAv8tBrjyjk1AmdCqnH9VtANUfYAgAA8CKxyYmKOJGmjHpRVrcCoJI4jRAAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAAAAMAEhC0AAAAAMAEfagwAAOBFVvW7S4G52coLCra6FQCVVKGwtW/fPjVv3ryqewEAAKj1vuw32OoWAFSRCp1G2LJlS/Xu3VsffPCB8vLyqronAAAAAKj2KhS2fvrpJ3Xo0EGTJk1Sw4YN9eCDD+rHH3+s6t4AAAAAoNqqUNjq2LGjXnvtNR05ckTz58/X0aNHdc011+iKK67QrFmzlJ6eXtV9AgAA1AqBudkKzM1SYG621a0AqKRK3Y3Q19dXt912m5YuXarp06frt99+06OPPqomTZro3nvv1dGjR6uqTwAAgFphypODNHvM9Zry5CCrWwFQSZUKW5s2bdKYMWPUqFEjzZo1S48++qj27t2rVatW6ciRI/rTn/5UVX0CAAAAQLVSobsRzpo1SwsWLFBiYqJuueUWvf/++7rlllvk43M6u8XFxWnhwoVq1qxZVfYKAAAAANVGhcLWnDlzNHz4cN13331q1KhRqWOioqI0b968SjUHAAAAANVVhcLWr7/+etEx/v7+Gjp0aEVWDwAAAADVXoWu2VqwYIGWLl1aor506VK99957lW4KAAAAAKq7CoWtadOmqUGDBiXqUVFReumllyrdFAAAAABUdxUKWwcOHFBcXFyJetOmTXXgwIFKNwUAAAAA1V2FwlZUVJS2b99eor5t2zbVr1+/0k0BAAAAQHVXobB111136eGHH9bXX38tp9Mpp9OpNWvWaPz48brzzjurukcAAAAAqHYqdDfCKVOmaP/+/brhhhvk63t6FS6XS/feey/XbAEAAFTCGw+/LN+iQhX5+lndCoBKqlDY8vf315IlSzRlyhRt27ZNQUFBuvLKK9W0adOq7g8AAKBWSW7WzuoWAFSRCoWtYq1bt1br1q2rqhcAAAAAqDEqFLacTqcWLlyo1atXKy0tTS6Xy2P5mjVrqqQ5AAAAAKiuKhS2xo8fr4ULFyohIUFXXHGFbDZbVfcFAABQK3XY+l/5F+arwC9A2ztea3U7ACqhQmFr8eLF+vjjj3XLLbdUdT8AAAC12j3vT1fEiTRl1IvSZMIWUK1V6Nbv/v7+atmyZVX3AgAAAAA1RoXC1iOPPKLXXntNhmFUdT8AAAAAUCNU6DTC9evX6+uvv9Znn32myy+/XH5+np8D8cknn1RJcwAAAABQXVUobIWHh+vWW2+t6l4AAAAAoMaoUNhasGBBVfcBAAAAwMsUFhQoOTnZ6jYkSaGhoYqMjLS6jXKp8IcaFxUVae3atdq7d6/uvvtuhYSE6MiRIwoNDVXdunWrskcAAAAAl1h+Vqb2J+3ThL8+p4CAAKvbUURIHX2w4N1qFbgqFLaSk5N100036cCBA8rPz9eNN96okJAQTZ8+Xfn5+Zo7d25V9wkAAADgEirMz5XL5qsGPW5T/cZNLe0lOyNV6Rv+JYfDUfPD1vjx49WlSxdt27ZN9evXd9dvvfVW3X///VXWHAAAAABr1akXqdCoGKvbULrVDVRAhcLWf//7X3333Xfy9/f3qDdr1kyHDx+uksYAAABqo/yAIOUGBis/IMjqVgBUUoXClsvlktPpLFE/dOiQQkJCKt0UAABAbfXUtKVWtwCgilToQ4379u2rV1991f3YZrMpKytLzz77rG655Zaq6g0AAAAAqq0KHdmaOXOm+vXrp/bt2ysvL0933323fv31VzVo0EAfffRRVfcIAAAAANVOhcJWTEyMtm3bpsWLF2v79u3KysrSiBEjNHjwYAUFcX4xAAAAAFT4c7Z8fX01ZMiQquwFAACg1vvzktdVJ9uhnOBQLR30sNXtAKiECoWt999//4LL77333go1AwAAUNt1++FLRZxIU0a9KMIWUM1V+HO2zlZYWKicnBz5+/urTp06hC0AAAAAtV6F7kZ44sQJj6+srCwlJibqmmuu4QYZAAAAAKAKhq3StGrVSn/7299KHPUCAAAAgNqoysKWdPqmGUeOHKnKVQIAAABAtVSha7aWLVvm8dgwDB09elRvvPGGevbsWSWNAQAAAEB1VqGwNWDAAI/HNptNkZGRuv766zVz5syq6AsAAAAAqrUKhS2Xy1XVfQAAAABAjVKl12wBAAAAAE6r0JGtSZMmlXnsrFmzKvISAAAAtdL2Dj1VNztTWcFhVrcCoJIqFLa2bNmiLVu2qLCwUG3atJEk7dmzR3a7XVdffbV7nM1mq5ouAQAAaon/d98TVrcAoIpUKGz1799fISEheu+991SvXj1Jpz/oeNiwYbr22mv1yCOPVGmTAAAAAFDdVOiarZkzZ2ratGnuoCVJ9erV09SpU7kbIQAAAACogmHL4XAoPT29RD09PV2nTp2qdFMAAAAAUN1V6DTCW2+9VcOGDdPMmTPVrVs3SdIPP/ygyZMn67bbbqvSBgEAAGqTp5+/V6GZGXKERWjKs+9b3Q6ASqhQ2Jo7d64effRR3X333SosLDy9Il9fjRgxQi+//HKVNggAAFCbhGZmKOJEmtVtAKgCFQpbderU0ZtvvqmXX35Ze/fulSS1aNFCwcHBVdocAAAAAFRXlfpQ46NHj+ro0aNq1aqVgoODZRhGVfUFAAAAANVahcLW8ePHdcMNN6h169a65ZZbdPToUUnSiBEjuO07AAAAAKiCpxFOnDhRfn5+OnDggNq1a+euDxo0SJMmTeL27wCAGis9PV0Oh8PqNpScnKyiwiKr2wAAXECFwtaXX36pL774QjExMR71Vq1aKTk5uUoaAwDA26Snp2vIsJHKOJVjdSvKy83RocNHFfu/G1UBALxPhcJWdna26tSpU6KekZGhgICASjcFAIA3cjgcyjiVo8j4gQqOiLa0l7S9O5V8cL6cRYQtAPBWFQpb1157rd5//31NmTJFkmSz2eRyuTRjxgz17t27ShsEAMDbBEdEKzQq5uIDTZR1PMXS1wcAXFyFwtaMGTN0ww03aNOmTSooKNBjjz2mXbt2KSMjQ99++21V9wgAAAAA1U6FwtYVV1yhPXv26I033lBISIiysrJ02223aezYsWrUqFFV9wgAAFBr/PPP4+RfkKcC/0CrWwFQSeUOW4WFhbrppps0d+5cPfnkk2b0BAAAUGv9EH+T1S0AqCLl/pwtPz8/bd++3YxeAAAAAKDGqNCHGg8ZMkTz5s2r6l4AAAAAoMao0DVbRUVFmj9/vr766it17txZwcHBHstnzZpVJc0BAADUNtFHk2V3Fcnp46vURk2tbgdAJZQrbO3bt0/NmjXTzp07dfXVV0uS9uzZ4zHGZrNVXXcAAAC1zKMvj1XEiTRl1IvS5FnLrW4HQCWUK2y1atVKR48e1ddffy1JGjRokF5//XVFR1v7wY4AAAAA4G3Kdc2WYRgejz/77DNlZ2dXaUMAAAAAUBNU6AYZxc4NXwAAAACA08oVtmw2W4lrsipzjda0adPUtWtXhYSEKCoqSgMGDFBiYqLHmLy8PI0dO1b169dX3bp1NXDgQKWmpnqMOXDggBISElSnTh1FRUVp8uTJKioq8hizdu1aXX311QoICFDLli21cOHCCvcNAAAAABdTrmu2DMPQfffdp4CAAEmng9CoUaNK3I3wk08+KdP61q1bp7Fjx6pr164qKirSX//6V/Xt21e7d+92r3PixIlasWKFli5dqrCwMI0bN0633Xabvv32W0mS0+lUQkKCGjZsqO+++05Hjx7VvffeKz8/P7300kuSpKSkJCUkJGjUqFFatGiRVq9erZEjR6pRo0bq169feb4FAAAAAFAm5QpbQ4cO9Xg8ZMiQSr34559/7vF44cKFioqK0ubNm/X73/9emZmZmjdvnj788ENdf/31kqQFCxaoXbt2+v7779WjRw99+eWX2r17t7766itFR0erY8eOmjJlih5//HE999xz8vf319y5cxUXF6eZM2dKktq1a6f169frlVdeKTVs5efnKz8/3/3Y4XBIOn3L++IjZj4+PvLx8ZHL5ZLL5XKPLa47nU6P0yzPV7fb7bLZbCWOxNntdkmnw2RZ6r6+vjIMw6Nus9lkt9tL9Hi+OnNiTsyJOTGnC/de/G8fm2TXmfFO2STZPGpn6pJdRhnrPpIMj7ohm1yyySZDPmfV7f87scQmz15csskoZXxx3UeGbGWol2dOvj5nznIpOb7sczpfvbxzKu7GXk2305l5GO7n1LTt5Otjk4/P6ROsKjOnC9fLNidfH5v8fE+/Ja7svsd2Mm872W1nfh9Y/fvp3OUXUq6wtWDBgvIML7fMzExJUkREhCRp8+bNKiwsVJ8+fdxj2rZtq9jYWG3YsEE9evTQhg0bdOWVV3rcEbFfv34aPXq0du3apU6dOmnDhg0e6ygeM2HChFL7mDZtmp5//vkS9S1btriPuEVGRqpFixZKSkpSenq6e0xMTIxiYmK0Z88e93wkqXnz5oqKitLOnTuVm5vrMZ/w8HBt2bLF4w1Dhw4d5O/vr02bNnn00KVLFxUUFGj79u3umt1uV9euXZWZmalffvnFXQ8KCtJVV12lY8eOad++fe56WFiY2rVrpyNHjujQoUPuOnNiTsyJOTGnC88pMDBQktQ50qaw4BPuemJeiDKd/uoYfNLjDcOOnDAVGD7qfNZYSdqcXU/+NpeurHNm3U7ZtDk7QmH2QrUJPOWu57rs2pEbrga++YoLOHNTquiYEH0jqXm4r9qftf70ogAl5ddVs4BsRfqe+cPh4YIgHS6so1aBpxRmL3TXk/KDlV4UqMuDMhXkc2Z7lGdOea3raYOfn4L9bB718s4p0+mnxLxQNfbL1WX+Z/aZ8s7JCDt9Bk6PywIUUQ23UzE/m8vdU03bTnmt6+l4m1aSVKk5SZXfTnmt66n+zTfqmFTpfY/tZN52KvKzKaZ7Z0my/PdTeW4QaDO85C4XLpdLf/zjH3Xy5EmtX79ekvThhx9q2LBhHkeZJKlbt27q3bu3pk+frgceeEDJycn64osv3MtzcnIUHByslStX6uabb1br1q01bNgwPfHEE+4xK1euVEJCgnJychQUFOSx/tKObDVp0kTHjx9XaGioJP7Cy5yYE3NiTrVxTvv379ddI0ar+R/GKCyysbtuxRGTI7/8pG/em6HrRk1Vo6Yt3XUr/hJ/NHGL1i2crt+PmqrGZ/VS3jlV1ZGtQz//pPXvz1Dv0VMVHXumn+qynWZM6q+IE2k6US9Sj8/6zwXnWl2309HELfrv+y/rmgdeUKOmrSw9YnI0cYu++2Cm4kc+p4ZNW1l+ZIvtVHrvjvQj2v/ZW1r0zmzFxcVZ+vvJ4XCofv36yszMdGeD8ynXkS0zjR07Vjt37nQHLSsFBAS4r0s7m6+vr3x9Pb9lxRvrXMUbpaz1c9dbkbrNZiu1fr4ey1tnTszpfHXmxJyk2jGn4ptCuYziNwieSqudrpd+M6nS67ZS68Y5dadRXC/9dc8dX8z1vzdIZa2XZU5FrrPfgJU2vmxzqmj93N6Lu3FW4+10Zpnnc2rKdipyGe43xZWZ08XrF++xyGWo8H9vriu7712sznaq+HZyGmfCjtW/n863vNTnlHmkicaNG6fly5frm2++UUxMjLvesGFDFRQU6OTJkwoPD3fXU1NT1bBhQ/eYH3/80WN9xXcrPHvMuXcwTE1NVWhoaImjWgAAAFaa+sxC+biccvmU/kYQQPVRqc/ZqizDMDRu3Dh9+umnWrNmjeLi4jyWd+7cWX5+flq9erW7lpiYqAMHDig+Pl6SFB8frx07digtLc09ZtWqVQoNDVX79u3dY85eR/GY4nUAAAB4i8zwBjoREa3M8AZWtwKgkiw9sjV27Fh9+OGH+r//+z+FhIQoJSVF0ukLmYOCghQWFqYRI0Zo0qRJioiIUGhoqB566CHFx8erR48ekqS+ffuqffv2uueeezRjxgylpKToqaee0tixY92nAo4aNUpvvPGGHnvsMQ0fPlxr1qzRxx9/rBUrVlg2dwAAAAA1m6VHtubMmaPMzEz16tVLjRo1cn8tWbLEPeaVV17RH/7wBw0cOFC///3v1bBhQ4/P8bLb7Vq+fLnsdrvi4+M1ZMgQ3XvvvXrhhRfcY+Li4rRixQqtWrVKV111lWbOnKl3332Xz9gCAAAAYBpLj2yV5UaIgYGBmj17tmbPnn3eMU2bNtXKlSsvuJ5evXppy5Yt5e4RAADgUvr92k8VmJ+jvIA6+qbXrVa3A6ASvOIGGQAAADit/7J5ijiRpox6UYQtoJqz9DRCAAAAAKipCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAm4EONAQAAvEhqdBPlBgXLERphdSsAKomwBQAA4EX+/vgcq1sAUEU4jRAAAAAATEDYAgAAAAATELYAAAAAwARcswUAAOBF7n/radU9dVJZIeF658EpVrcDoBIIWwAAAF6kdeIWRZxIU0a9KKtbAVBJnEYIAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiADzUGAADwIv/9/Z8UlJul3KC6VrcCoJIIWwAAAF5k2YD7rW4BQBXhNEIAAAAAMAFhCwAAAABMQNgCAAAAABNwzRYAwOulp6fL4XBY3YaSk5NVVFhkdRuo4V6e9AdFnEhTRr0oTZ613Op2AFQCYQsA4NXS09M1ZNhIZZzKsboV5eXm6NDho4otLLS6FQBANUDYAgB4NYfDoYxTOYqMH6jgiGhLe0nbu1PJB+fLWUTYAgBcHGELAFAtBEdEKzQqxtIeso6nWPr6AIDqhRtkAAAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACbv0OAADgRd594Hn5FhaoyM/f6lYAVBJhCwAAwIsktu1sdQsAqginEQIAAACACQhbAAAAAGACTiMEAADwIm1+2ey+ZotTCoHqjbAFAADgRUa+/awiTqQpo16UJs9abnU7ACqB0wgBAAAAwAQc2QIAlJCeni6Hw2F1G5Kk5ORkFRUWWd0GAADlRtgCAHhIT0/XkGEjlXEqx+pWJEl5uTk6dPioYgsLrW4FAIByIWwBADw4HA5lnMpRZPxABUdEW92O0vbuVPLB+XIWEbYAANULYQsAUKrgiGiFRsVY3YayjqdY3QIAABXCDTIAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAE3CDDAAAAC8yedZyq1sAUEU4sgUAAAAAJiBsAQAAAIAJCFsAAAAAYAKu2QIAAPAif/z3OwrKzVJuUF0tG3C/1e0AqATCFgAAgBe59pv/U8SJNGXUiyJsAdUcpxECAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACbgQ40BAAC8yJ42nVT31EllhYRb3QqASiJsAQAAeJF3HpxidQsAqginEQIAAACACQhbAAAAAGACwhYAAAAAmIBrtgAAALzIo9NHK9SRIUdohP7++Byr2wFQCYQtAAAALxKdelARJ9IUlJttdSsAKonTCAEAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAE/ChxgAAAF7kP38cocD8HOUF1LG6FQCVRNgCAADwIt/0utXqFgBUEUtPI/zmm2/Uv39/NW7cWDabTf/+9789lhuGoWeeeUaNGjVSUFCQ+vTpo19//dVjTEZGhgYPHqzQ0FCFh4drxIgRysrK8hizfft2XXvttQoMDFSTJk00Y8YMs6cGAAAAoJazNGxlZ2frqquu0uzZs0tdPmPGDL3++uuaO3eufvjhBwUHB6tfv37Ky8tzjxk8eLB27dqlVatWafny5frmm2/0wAMPuJc7HA717dtXTZs21ebNm/Xyyy/rueee09tvv236/AAAAADUXpaeRnjzzTfr5ptvLnWZYRh69dVX9dRTT+lPf/qTJOn9999XdHS0/v3vf+vOO+/Uzz//rM8//1wbN25Uly5dJEn/+Mc/dMstt+jvf/+7GjdurEWLFqmgoEDz58+Xv7+/Lr/8cm3dulWzZs3yCGUAAADeIOzkMfm4nHL52JUZ3sDqdgBUgtdes5WUlKSUlBT16dPHXQsLC1P37t21YcMG3XnnndqwYYPCw8PdQUuS+vTpIx8fH/3www+69dZbtWHDBv3+97+Xv7+/e0y/fv00ffp0nThxQvXq1Svx2vn5+crPz3c/djgckqSioiIVFRVJknx8fOTj4yOXyyWXy+UeW1x3Op0yDOOidbvdLpvN5l7v2XVJcjqdZar7+vrKMAyPus1mk91uL9Hj+erMiTkxJ+Ykyf1vmyS7zqrLJkM22WTIR0aJuo8M2cpQd8omyeax7jN1yX7WWI9ebZ79OOUjyfAYb8gmVyk9nq9e3jnZ/vffkr2Ub07nr5d9TnbbmZ6s3k6+Pjb3v0uOZzsV91TW7fTUC/cp4kSaTtSL1OOz/nPBuVbX7eTrY5OPz+kTrCr7M6Ky28nXxyY/39NviSu777GdzNtOdtuZ33FW/849d/mFeG3YSklJkSRFR0d71KOjo93LUlJSFBUV5bHc19dXERERHmPi4uJKrKN4WWlha9q0aXr++edL1Lds2aLg4GBJUmRkpFq0aKGkpCSlp6e7x8TExCgmJkZ79uxRZmamu968eXNFRUVp586dys3Nddfbtm2r8PBwbdmyxeNNUIcOHeTv769NmzZ59NClSxcVFBRo+/bt7prdblfXrl2VmZmpX375xV0PCgrSVVddpWPHjmnfvn3uelhYmNq1a6cjR47o0KFD7jpzYk7MiTlJZ8JW0xCpZfAJdz29KEBJ+XXVLCBbkb5n/iB1uCBIhwvrqFXgKYXZC931pPxgpRcF6vKgTAX5nOk9MS9EmU5/dQw+6fHLdUdOmAoMH3U+6zUl6aiPTRHhYbqhWaACgk4vc8qmzdkRCrMXqk3gKffYXJddO3LD1cA3X3EB2e56ptNPiXmhauyXq8v8z3x/yzsnIyxAktTjsgBFnNVneee0Obue/G0uXVnnzD5Q3jlFx4ToG0nNw33V3uLtlNe6njb4+SnYz+ZRZztVbDsV87O53D3VtO2U17qejrdpJUmV/hlR2e2U17qe6t98o45Jld732E7mbaciP5tiuneWJMt/52Znn/k+X4zNODvOWchms+nTTz/VgAEDJEnfffedevbsqSNHjqhRo0bucXfccYdsNpuWLFmil156Se+9954SExM91hUVFaXnn39eo0ePVt++fRUXF6e33nrLvXz37t26/PLLtXv3brVr165EL6Ud2WrSpImOHz+u0NBQSdX7r9Y18S/xzIk5Maeqm1NSUpLuHjlGcQljFB7V2F236sjWwZ9/0nf/72X1GjVF0bEtzxp/6Y+YHPr5J61/f4Z6j556Ti+X/ojJkV9+0jfvzdB1o6aqUdMzvVixnY4mbtG6hdP1+1FT1fisXso7J7bT6fqMSf1NObLlTdvpaOIW/ff9l3XNAy+oUdNWlh4xOZq4Rd99MFPxI59Tw6atLD+yxXYqvXdH+hHt/+wtLXpntuLi4iz9netwOFS/fn1lZma6s8H5eO2RrYYNG0qSUlNTPcJWamqqOnbs6B6Tlpbm8byioiJlZGS4n9+wYUOlpqZ6jCl+XDzmXAEBAQoICChR9/X1la+v57eseGOdq3ijlLV+7norUrfZbKXWz9djeevMiTmdr86cataciv9tqPiXoCdDNvcvzbO5/veLt6z10tZ9ul5yrGEYchqlPaf0Xs7XY3nr5/Ze/Cu59F7KN6fz18vWo9M405PV26nIdfYbsNLGs53Ku53OLPN8Tk3ZTkUuw/2muCp+Rpy/fvEei1yGCv/35rqy+97F6mynim8np3Em7Fj9O/d8y0tj6d0ILyQuLk4NGzbU6tWr3TWHw6EffvhB8fHxkqT4+HidPHlSmzdvdo9Zs2aNXC6Xunfv7h7zzTffqLDwzCHeVatWqU2bNqWeQggAAAAAVcHSsJWVlaWtW7dq69atkk6furJ161YdOHBANptNEyZM0NSpU7Vs2TLt2LFD9957rxo3buw+1bBdu3a66aabdP/99+vHH3/Ut99+q3HjxunOO+9U48anT325++675e/vrxEjRmjXrl1asmSJXnvtNU2aNMmiWQMAAACoDSw9jXDTpk3q3bu3+3FxABo6dKgWLlyoxx57TNnZ2XrggQd08uRJXXPNNfr8888VGBjofs6iRYs0btw43XDDDfLx8dHAgQP1+uuvu5eHhYXpyy+/1NixY9W5c2c1aNBAzzzzDLd9BwAAAGAqS8NWr169dKH7c9hsNr3wwgt64YUXzjsmIiJCH3744QVfp0OHDvrvf/9b4T4BAAAAoLy89potAAAAAKjOCFsAAAAAYAKvvfU7ANQ26enpcjgcVreh5ORkFRUWXXwgAFP8ffJs2V1FcvrwNg2o7vi/GAC8QHp6uoYMG6mMUzlWt6K83BwdOnxUsWd9ZAaASye1UVOrWwBQRQhbAOAFHA6HMk7lKDJ+oIIjoi3tJW3vTiUfnC9nEWELAIDKIGwBqNW87dS94IhohUbFWNpL1vEUS18fAICagrAFoNbi1D0A3qj7hs/lX5CnAv9A/RB/k9XtAKgEwhaAWotT9wB4o9uXvqGIE2nKqBdF2AKqOcIWgFqPU/cAAIAZ+JwtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAE/A5WwAuufT0dDkcDqvbUHJysooKi6xuAwA8OMIiPP4LoPoibAG4pNLT0zVk2EhlnMqxuhXl5ebo0OGjii0stLoVAHCb8uz7VrcAoIoQtgBcUg6HQxmnchQZP1DBEdGW9pK2d6eSD86Xs4iwBQAAqh5hC6glvO3UveCIaIVGxVjaS9bxFEtfHwAA1GyELaAW4NQ9AACAS4+wBdQCnLoHANXHPQunqW52prKCw/T/7nvC6nYAVAJhC6hFOHUPALxfh+3fKuJEmjLqRVndCoBK4nO2AAAAAMAEhC0AAAAAMAFhCwAAAABMQNgCAAAAABMQtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAAT8KHGAAAAXuTH7n1VJ9uhnOBQq1sBUEmELQAAAC+ydNDDVrcAoIpwGiEAAAAAmICwBQAAAAAmIGwBAAAAgAm4ZgsAAMCLTH3izwo/eUwnwxvoqWlLrW4HQCVwZAsAAMCLBOTnKigvWwH5uVa3AqCSCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAl8rW4AAAAAZ/y/ex+Xf2G+CvwCrG4FQCURtgAAALzI9o7XWt0CgCrCaYQAAAAAYALCFgAAAACYgNMIAQAAvEjT/T/Lt6hQRb5+Sm7Wzup2AFQCYQsAAMCLjHt9siJOpCmjXpQmz1pudTsAKoHTCAEAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAAAAMAEvlY3AAAAgDOefnGJJEOSzepWAFQSYQsAAMCL5AUFW90CgCrCaYQAAAAAYALCFgAAAACYgNMIAQAAvEjfLxYpMDdbeUHB+rLfYKvbAVAJhC0AAAAvcuMXHyniRJoy6kURtoBqjtMIAQAAAMAEhC0AAAAAMAFhCwAAAABMQNgCAAAAABNwg4xqKj09XQ6Hw+o2JEmhoaGKjIy0ug0AAADAqxC2qqH09HQNGTZSGadyrG5FkhQRUkcfLHiXwAUAAACchbBVDTkcDmWcylFk/EAFR0Rb2kt2RqrSN/xLDoeDsAUAAACchbBVjQVHRCs0KsbqNpRudQMAAACAFyJsAQAAeJEDTdvoRESUToXUs7oVAJVE2AIAAPAi/xg/0+oWAFQRbv0OAAAAACYgbAEAAACACQhbAAAAAGACrtkCAADwIg+99ohCTp3QqZB6XL8FVHOELQAAAC8Sm5yoiBNpyqgXZXUrACqJ0wgBAAAAwASELQAAAAAwAacRAiZJT0+Xw+Gwug1JUnJysooKi6xuAwAAoFYhbAEmSE9P15BhI5VxKsfqViRJebk5OnT4qGILC61uBQAAoNaoVWFr9uzZevnll5WSkqKrrrpK//jHP9StWzer20IN5HA4lHEqR5HxAxUcEW11O0rbu1PJB+fLWUTYAgAAuFRqTdhasmSJJk2apLlz56p79+569dVX1a9fPyUmJioqirv9wBzBEdEKjYqxug1lHU+xugUAAIBap9bcIGPWrFm6//77NWzYMLVv315z585VnTp1NH/+fKtbAwAAAFAD1YojWwUFBdq8ebOeeOIJd83Hx0d9+vTRhg0bSozPz89Xfn6++3FmZqYkKSMjQ0VFRe7n+/j4yOVyyeVyeazXx8dHTqdThmFctG6322Wz2dzrPbsuSU6ns0S9+KYLWan75co/fU2Q05BsknxsZ8YaklzlqRuSS6cTuO2suss4/Rwf2+nnnF3POpEmZ0GBdu3a5XEzCJvN5jHPijjfOrytXpqDBw+qIC/PYxtJ1m0nR9oh+fr4KCftkE76eI63nz34fz2qnPXyzMmRdkg+kkcvFZnThepl7d2RdkgyDI9eKjKnqthOp9IPyXC5lH1OL1Zsp+xjh2W4XMpKPaiAs3phO7Gdztd7zv96caR49lLeObGd/vczu6hQvpIczkKdPLjngnOtrtsp59hh2SQ5Ug7K36dyPyMqu51yjh2Wr4+PHCkH5Wer/O9ctlPJOV2sXqbtdDJNhmHo1KlTOnnyZKnvvy/V+/Li97xleV9oMyr7jrgaOHLkiC677DJ99913io+Pd9cfe+wxrVu3Tj/88IPH+Oeee07PP//8pW4TAAAAQDVx8OBBxcRc+HKRWnFkq7yeeOIJTZo0yf3Y5XIpIyND9evXl81mu8AzLw2Hw6EmTZro4MGDCg0Ntbod1BLsd7AC+x2swr4HK7DfVQ/FR9gaN2580bG1Imw1aNBAdrtdqampHvXU1FQ1bNiwxPiAgAAFBAR41MLDw81ssUJCQ0P5HxGXHPsdrMB+B6uw78EK7HfeLywsrEzjasUNMvz9/dW5c2etXr3aXXO5XFq9erXHaYUAAAAAUFVqxZEtSZo0aZKGDh2qLl26qFu3bnr11VeVnZ2tYcOGWd0aAAAAgBqo1oStQYMGKT09Xc8884xSUlLUsWNHff7554qOtv4DZ8srICBAzz77bIlTHQEzsd/BCux3sAr7HqzAflfz1Iq7EQIAAADApVYrrtkCAAAAgEuNsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCVjUze/ZsNWvWTIGBgerevbt+/PFHq1tCDTJt2jR17dpVISEhioqK0oABA5SYmOgxJi8vT2PHjlX9+vVVt25dDRw4sMQHhgOV8be//U02m00TJkxw19jvYJbDhw9ryJAhql+/voKCgnTllVdq06ZN7uWGYeiZZ55Ro0aNFBQUpD59+ujXX3+1sGNUd06nU08//bTi4uIUFBSkFi1aaMqUKTr7nnXsdzUHYasaWbJkiSZNmqRnn31WP/30k6666ir169dPaWlpVreGGmLdunUaO3asvv/+e61atUqFhYXq27evsrOz3WMmTpyo//znP1q6dKnWrVunI0eO6LbbbrOwa9QkGzdu1FtvvaUOHTp41NnvYIYTJ06oZ8+e8vPz02effabdu3dr5syZqlevnnvMjBkz9Prrr2vu3Ln64YcfFBwcrH79+ikvL8/CzlGdTZ8+XXPmzNEbb7yhn3/+WdOnT9eMGTP0j3/8wz2G/a4GMVBtdOvWzRg7dqz7sdPpNBo3bmxMmzbNwq5Qk6WlpRmSjHXr1hmGYRgnT540/Pz8jKVLl7rH/Pzzz4YkY8OGDVa1iRri1KlTRqtWrYxVq1YZ1113nTF+/HjDMNjvYJ7HH3/cuOaaa8673OVyGQ0bNjRefvlld+3kyZNGQECA8dFHH12KFlEDJSQkGMOHD/eo3XbbbcbgwYMNw2C/q2k4slVNFBQUaPPmzerTp4+75uPjoz59+mjDhg0WdoaaLDMzU5IUEREhSdq8ebMKCws99sO2bdsqNjaW/RCVNnbsWCUkJHjsXxL7HcyzbNkydenSRX/+858VFRWlTp066Z133nEvT0pKUkpKise+FxYWpu7du7PvocJ+97vfafXq1dqzZ48kadu2bVq/fr1uvvlmSex3NY2v1Q2gbI4dOyan06no6GiPenR0tH755ReLukJN5nK5NGHCBPXs2VNXXHGFJCklJUX+/v4KDw/3GBsdHa2UlBQLukRNsXjxYv3000/auHFjiWXsdzDLvn37NGfOHE2aNEl//etftXHjRj388MPy9/fX0KFD3ftXab972fdQUX/5y1/kcDjUtm1b2e12OZ1Ovfjiixo8eLAksd/VMIQtAKUaO3asdu7cqfXr11vdCmq4gwcPavz48Vq1apUCAwOtbge1iMvlUpcuXfTSSy9Jkjp16qSdO3dq7ty5Gjp0qMXdoab6+OOPtWjRIn344Ye6/PLLtXXrVk2YMEGNGzdmv6uBOI2wmmjQoIHsdnuJu2+lpqaqYcOGFnWFmmrcuHFavny5vv76a8XExLjrDRs2VEFBgU6ePOkxnv0QlbF582alpaXp6quvlq+vr3x9fbVu3Tq9/vrr8vX1VXR0NPsdTNGoUSO1b9/eo9auXTsdOHBAktz7F797UZUmT56sv/zlL7rzzjt15ZVX6p577tHEiRM1bdo0Sex3NQ1hq5rw9/dX586dtXr1anfN5XJp9erVio+Pt7Az1CSGYWjcuHH69NNPtWbNGsXFxXks79y5s/z8/Dz2w8TERB04cID9EBV2ww03aMeOHdq6dav7q0uXLho8eLD73+x3MEPPnj1LfLzFnj171LRpU0lSXFycGjZs6LHvORwO/fDDD+x7qLCcnBz5+Hi+Bbfb7XK5XJLY72oaTiOsRiZNmqShQ4eqS5cu6tatm1599VVlZ2dr2LBhVreGGmLs2LH68MMP9X//938KCQlxnxseFhamoKAghYWFacSIEZo0aZIiIiIUGhqqhx56SPHx8erRo4fF3aO6CgkJcV8XWCw4OFj169d319nvYIaJEyfqd7/7nV566SXdcccd+vHHH/X222/r7bffliT3571NnTpVrVq1UlxcnJ5++mk1btxYAwYMsLZ5VFv9+/fXiy++qNjYWF1++eXasmWLZs2apeHDh0tiv6txrL4dIsrnH//4hxEbG2v4+/sb3bp1M77//nurW0INIqnUrwULFrjH5ObmGmPGjDHq1atn1KlTx7j11luNo0ePWtc0aqSzb/1uGOx3MM9//vMf44orrjACAgKMtm3bGm+//bbHcpfLZTz99NNGdHS0ERAQYNxwww1GYmKiRd2iJnA4HMb48eON2NhYIzAw0GjevLnx5JNPGvn5+e4x7Hc1h80wzvq4agAAAABAleCaLQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAAAAMAEhC0AAAAAMAFhCwAAAABMQNgCAAAAABMQtgAAAADABIQtAAAAADABYQsAAEkbNmyQ3W5XQkKC1a0AAGoIm2EYhtVNAABgtZEjR6pu3bqaN2+eEhMT1bhxY6tbAgBUcxzZAgDUellZWVqyZIlGjx6thIQELVy40GP5smXL1KpVKwUGBqp379567733ZLPZdPLkSfeY9evX69prr1VQUJCaNGmihx9+WNnZ2Zd2IgAAr0LYAgDUeh9//LHatm2rNm3aaMiQIZo/f76KT/xISkrS7bffrgEDBmjbtm168MEH9eSTT3o8f+/evbrppps0cOBAbd++XUuWLNH69es1btw4K6YDAPASnEYIAKj1evbsqTvuuEPjx49XUVGRGjVqpKVLl6pXr176y1/+ohUrVmjHjh3u8U899ZRefPFFnThxQuHh4Ro5cqTsdrveeust95j169fruuuuU3Z2tgIDA62YFgDAYhzZAgDUaomJifrxxx911113SZJ8fX01aNAgzZs3z728a9euHs/p1q2bx+Nt27Zp4cKFqlu3rvurX79+crlcSkpKujQTAQB4HV+rGwAAwErz5s1TUVGRxw0xDMNQQECA3njjjTKtIysrSw8++KAefvjhEstiY2OrrFcAQPVC2AIA1FpFRUV6//33NXPmTPXt29dj2YABA/TRRx+pTZs2WrlypceyjRs3ejy++uqrtXv3brVs2dL0ngEA1QfXbAEAaq1///vfGjRokNLS0hQWFuax7PHHH9eaNWv08ccfq02bNpo4caJGjBihrVu36pFHHtGhQ4d08uRJhYWFafv27erRo4eGDx+ukSNHKjg4WLt379aqVavKfHQMAFDzcM0WAKDWmjdvnvr06VMiaEnSwIEDtWnTJp06dUr//Oc/9cknn6hDhw6aM2eO+26EAQEBkqQOHTpo3bp12rNnj6699lp16tRJzzzzDJ/VBQC1HEe2AAAopxdffFFz587VwYMHrW4FAODFuGYLAICLePPNN9W1a1fVr19f3377rV5++WU+QwsAcFGELQAALuLXX3/V1KlTlZGRodjYWD3yyCN64oknrG4LAODlOI0QAAAAAEzADTIAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAAAAMAEhC0AAAAAMAFhCwAAAABMQNgCAAAAABP8f7OfixCrq7XwAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 1000x600 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.hist(notes_adm_df['age'], bins=20, edgecolor='black', alpha=0.7)\n",
    "plt.axvline(notes_adm_df['age'].median(), color='red', linestyle='dashed', linewidth=2, label=f\"Median Age = {notes_adm_df['age'].median():.0f}\")\n",
    "plt.xlabel('Age')\n",
    "plt.ylabel('Frequency')\n",
    "plt.title('Age Distribution')\n",
    "plt.legend()\n",
    "plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
    "\n",
    "# Show plot\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "id": "141a99ab-f705-4b22-9175-47396f7adc29",
   "metadata": {},
   "outputs": [],
   "source": [
    "notes_adm_df = notes_adm_df.drop(notes_adm_df[notes_adm_df['age'] > 200].index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "id": "42421c7d-1550-4aa1-9eaa-a51391df8f73",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "230"
      ]
     },
     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "notes_adm_df[notes_adm_df['age'] <18].shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "id": "42a5c4e3-cfb1-4a0c-a4a8-0e18316e6df9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ROW_ID</th>\n",
       "      <th>DISCHTIME</th>\n",
       "      <th>ADMITTIME</th>\n",
       "      <th>SUBJECT_ID</th>\n",
       "      <th>HADM_ID</th>\n",
       "      <th>TEXT</th>\n",
       "      <th>LOS_label</th>\n",
       "      <th>ETHNICITY</th>\n",
       "      <th>INSURANCE</th>\n",
       "      <th>RELIGION</th>\n",
       "      <th>MARITAL_STATUS</th>\n",
       "      <th>GENDER</th>\n",
       "      <th>DOB</th>\n",
       "      <th>age</th>\n",
       "      <th>New_RELIGION</th>\n",
       "      <th>New_MARITAL_STATUS</th>\n",
       "      <th>ethnicity_group</th>\n",
       "      <th>age_median_gp</th>\n",
       "      <th>age_bins</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>42102</td>\n",
       "      <td>2117-09-17 16:45:00</td>\n",
       "      <td>2117-09-11 11:46:00</td>\n",
       "      <td>58526</td>\n",
       "      <td>100001.0</td>\n",
       "      <td>Admission Date:  [**2117-9-11**]              ...</td>\n",
       "      <td>1</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>PROTESTANT QUAKER</td>\n",
       "      <td>DIVORCED</td>\n",
       "      <td>F</td>\n",
       "      <td>2082-03-21</td>\n",
       "      <td>35</td>\n",
       "      <td>PROTESTANT QUAKER</td>\n",
       "      <td>DIVORCED</td>\n",
       "      <td>White</td>\n",
       "      <td>0</td>\n",
       "      <td>Adult</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>19215</td>\n",
       "      <td>2150-04-21 17:30:00</td>\n",
       "      <td>2150-04-17 15:34:00</td>\n",
       "      <td>54610</td>\n",
       "      <td>100003.0</td>\n",
       "      <td>Admission Date:  [**2150-4-17**]              ...</td>\n",
       "      <td>1</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>NOT SPECIFIED</td>\n",
       "      <td>SINGLE</td>\n",
       "      <td>M</td>\n",
       "      <td>2090-05-19</td>\n",
       "      <td>60</td>\n",
       "      <td>Unknown</td>\n",
       "      <td>SINGLE</td>\n",
       "      <td>White</td>\n",
       "      <td>0</td>\n",
       "      <td>Middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8772</td>\n",
       "      <td>2108-04-18 17:18:00</td>\n",
       "      <td>2108-04-06 15:49:00</td>\n",
       "      <td>9895</td>\n",
       "      <td>100006.0</td>\n",
       "      <td>Admission Date:  [**2108-4-6**]       Discharg...</td>\n",
       "      <td>2</td>\n",
       "      <td>BLACK/AFRICAN AMERICAN</td>\n",
       "      <td>Private</td>\n",
       "      <td>NOT SPECIFIED</td>\n",
       "      <td>SINGLE</td>\n",
       "      <td>F</td>\n",
       "      <td>2059-05-07</td>\n",
       "      <td>49</td>\n",
       "      <td>Unknown</td>\n",
       "      <td>SINGLE</td>\n",
       "      <td>Black</td>\n",
       "      <td>0</td>\n",
       "      <td>Adult</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>50238</td>\n",
       "      <td>2145-04-07 12:40:00</td>\n",
       "      <td>2145-03-31 05:33:00</td>\n",
       "      <td>23018</td>\n",
       "      <td>100007.0</td>\n",
       "      <td>Admission Date:  [**2145-3-31**]              ...</td>\n",
       "      <td>2</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>JEWISH</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>F</td>\n",
       "      <td>2071-06-04</td>\n",
       "      <td>74</td>\n",
       "      <td>JEWISH</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>White</td>\n",
       "      <td>1</td>\n",
       "      <td>Senior</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>21119</td>\n",
       "      <td>2162-05-21 13:37:00</td>\n",
       "      <td>2162-05-16 15:56:00</td>\n",
       "      <td>533</td>\n",
       "      <td>100009.0</td>\n",
       "      <td>Admission Date:  [**2162-5-16**]              ...</td>\n",
       "      <td>1</td>\n",
       "      <td>WHITE</td>\n",
       "      <td>Private</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>M</td>\n",
       "      <td>2101-07-30</td>\n",
       "      <td>61</td>\n",
       "      <td>CATHOLIC</td>\n",
       "      <td>MARRIED</td>\n",
       "      <td>White</td>\n",
       "      <td>0</td>\n",
       "      <td>Middle-aged</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ROW_ID           DISCHTIME           ADMITTIME  SUBJECT_ID   HADM_ID  \\\n",
       "0   42102 2117-09-17 16:45:00 2117-09-11 11:46:00       58526  100001.0   \n",
       "1   19215 2150-04-21 17:30:00 2150-04-17 15:34:00       54610  100003.0   \n",
       "2    8772 2108-04-18 17:18:00 2108-04-06 15:49:00        9895  100006.0   \n",
       "3   50238 2145-04-07 12:40:00 2145-03-31 05:33:00       23018  100007.0   \n",
       "4   21119 2162-05-21 13:37:00 2162-05-16 15:56:00         533  100009.0   \n",
       "\n",
       "                                                TEXT  LOS_label  \\\n",
       "0  Admission Date:  [**2117-9-11**]              ...          1   \n",
       "1  Admission Date:  [**2150-4-17**]              ...          1   \n",
       "2  Admission Date:  [**2108-4-6**]       Discharg...          2   \n",
       "3  Admission Date:  [**2145-3-31**]              ...          2   \n",
       "4  Admission Date:  [**2162-5-16**]              ...          1   \n",
       "\n",
       "                ETHNICITY INSURANCE           RELIGION MARITAL_STATUS GENDER  \\\n",
       "0                   WHITE   Private  PROTESTANT QUAKER       DIVORCED      F   \n",
       "1                   WHITE   Private      NOT SPECIFIED         SINGLE      M   \n",
       "2  BLACK/AFRICAN AMERICAN   Private      NOT SPECIFIED         SINGLE      F   \n",
       "3                   WHITE   Private             JEWISH        MARRIED      F   \n",
       "4                   WHITE   Private           CATHOLIC        MARRIED      M   \n",
       "\n",
       "         DOB  age       New_RELIGION New_MARITAL_STATUS ethnicity_group  \\\n",
       "0 2082-03-21   35  PROTESTANT QUAKER           DIVORCED           White   \n",
       "1 2090-05-19   60            Unknown             SINGLE           White   \n",
       "2 2059-05-07   49            Unknown             SINGLE           Black   \n",
       "3 2071-06-04   74             JEWISH            MARRIED           White   \n",
       "4 2101-07-30   61           CATHOLIC            MARRIED           White   \n",
       "\n",
       "   age_median_gp     age_bins  \n",
       "0              0        Adult  \n",
       "1              0  Middle-aged  \n",
       "2              0        Adult  \n",
       "3              1       Senior  \n",
       "4              0  Middle-aged  "
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bins = [0, 18, 35, 50, 65, 80, 100]  # Adding 'inf' for ages above 100\n",
    "labels = [\"Child\", \"Young Adult\", \"Adult\", \"Middle-aged\", \"Senior\", \"Elderly\"]\n",
    "\n",
    "notes_adm_df['age_bins'] = pd.cut(notes_adm_df['age'], bins=bins, labels=labels, right=False)\n",
    "\n",
    "notes_adm_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "id": "daa732c1-6e9f-40e0-94c3-369a5242f1ce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age_bins\n",
       "Senior         13582\n",
       "Middle-aged    12385\n",
       "Elderly         6586\n",
       "Adult           6080\n",
       "Young Adult     2947\n",
       "Child            230\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 126,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "notes_adm_df['age_bins'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "id": "8f573263-8591-4992-992c-0885d2556152",
   "metadata": {},
   "outputs": [],
   "source": [
    "mimic_utils.save_mimic_split_patient_wise(notes_adm_df,\n",
    "                                          label_column='LOS_label',\n",
    "                                          save_dir=save_dir,\n",
    "                                          task_name=task_name,\n",
    "                                          seed=123,\n",
    "                                         column_list = [\"ID\",\"TEXT\",\"SUBJECT_ID\", 'LOS_label','GENDER','ethnicity_group','ETHNICITY','INSURANCE','RELIGION','New_RELIGION','MARITAL_STATUS','New_MARITAL_STATUS','age','age_median_gp','age_bins'])\n",
    "\n",
    "#if __name__ == \"__main__\":\n",
    "#    args = mimic_utils.parse_args()\n",
    "#    los_mimic(args.mimic_dir, args.save_dir, args.seed, args.admission_only)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f8eaf7e-4d8d-4225-a55d-b9b385725e5f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
