{
    "cells": [
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "# MIMIC 4 data - dataset construction labevents"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "Code taken from GRU-ODE-Bayes preprocessing; simplified and adapted for MIMIC 4 1.0"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 1,
            "metadata": {},
            "outputs": [],
            "source": [
                "import os\n",
                "import pathlib\n",
                "\n",
                "p_project = str(pathlib.Path(os.getcwd()).parents[1])"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 2,
            "metadata": {},
            "outputs": [],
            "source": [
                "import pandas as pd\n",
                "import numpy as np\n",
                "from sklearn.model_selection import train_test_split"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 3,
            "metadata": {},
            "outputs": [],
            "source": [
                "pd.set_option('display.max_rows', 50)\n",
                "pd.set_option('display.max_columns', 300)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 4,
            "metadata": {},
            "outputs": [],
            "source": [
                "path_data = p_project + '/data/original/mimic4'\n",
                "path_temp = p_project + '/data/mimic4'"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 5,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/html": [
                            "<div>\n",
                            "<style scoped>\n",
                            "    .dataframe tbody tr th:only-of-type {\n",
                            "        vertical-align: middle;\n",
                            "    }\n",
                            "\n",
                            "    .dataframe tbody tr th {\n",
                            "        vertical-align: top;\n",
                            "    }\n",
                            "\n",
                            "    .dataframe thead th {\n",
                            "        text-align: right;\n",
                            "    }\n",
                            "</style>\n",
                            "<table border=\"1\" class=\"dataframe\">\n",
                            "  <thead>\n",
                            "    <tr style=\"text-align: right;\">\n",
                            "      <th></th>\n",
                            "      <th>Unnamed: 0</th>\n",
                            "      <th>hadm_id</th>\n",
                            "      <th>icu_los</th>\n",
                            "      <th>subject_id</th>\n",
                            "      <th>anchor_age</th>\n",
                            "      <th>admittime</th>\n",
                            "      <th>dischtime</th>\n",
                            "      <th>deathtime</th>\n",
                            "      <th>admission_type</th>\n",
                            "      <th>admission_location</th>\n",
                            "      <th>discharge_location</th>\n",
                            "      <th>insurance</th>\n",
                            "      <th>language</th>\n",
                            "      <th>marital_status</th>\n",
                            "      <th>ethnicity</th>\n",
                            "      <th>edregtime</th>\n",
                            "      <th>edouttime</th>\n",
                            "      <th>hospital_expire_flag</th>\n",
                            "    </tr>\n",
                            "  </thead>\n",
                            "  <tbody>\n",
                            "    <tr>\n",
                            "      <th>0</th>\n",
                            "      <td>0</td>\n",
                            "      <td>24528534</td>\n",
                            "      <td>1.587454</td>\n",
                            "      <td>17867402</td>\n",
                            "      <td>25</td>\n",
                            "      <td>2154-03-03 03:09:00</td>\n",
                            "      <td>2154-03-04 16:30:00</td>\n",
                            "      <td>NaN</td>\n",
                            "      <td>EW EMER.</td>\n",
                            "      <td>EMERGENCY ROOM</td>\n",
                            "      <td>HOME</td>\n",
                            "      <td>Medicaid</td>\n",
                            "      <td>ENGLISH</td>\n",
                            "      <td>NaN</td>\n",
                            "      <td>UNABLE TO OBTAIN</td>\n",
                            "      <td>2154-03-03 02:49:00</td>\n",
                            "      <td>2154-03-03 04:11:00</td>\n",
                            "      <td>0</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>1</th>\n",
                            "      <td>1</td>\n",
                            "      <td>28960964</td>\n",
                            "      <td>3.025625</td>\n",
                            "      <td>14435996</td>\n",
                            "      <td>42</td>\n",
                            "      <td>2150-06-19 13:07:00</td>\n",
                            "      <td>2150-06-25 15:36:00</td>\n",
                            "      <td>NaN</td>\n",
                            "      <td>OBSERVATION ADMIT</td>\n",
                            "      <td>TRANSFER FROM HOSPITAL</td>\n",
                            "      <td>HOME</td>\n",
                            "      <td>Medicaid</td>\n",
                            "      <td>ENGLISH</td>\n",
                            "      <td>SINGLE</td>\n",
                            "      <td>WHITE</td>\n",
                            "      <td>2150-06-19 11:54:00</td>\n",
                            "      <td>2150-06-19 17:57:00</td>\n",
                            "      <td>0</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>2</th>\n",
                            "      <td>2</td>\n",
                            "      <td>27385897</td>\n",
                            "      <td>9.741725</td>\n",
                            "      <td>17609946</td>\n",
                            "      <td>70</td>\n",
                            "      <td>2138-02-05 17:42:00</td>\n",
                            "      <td>2138-02-15 11:00:00</td>\n",
                            "      <td>2138-02-15 11:00:00</td>\n",
                            "      <td>OBSERVATION ADMIT</td>\n",
                            "      <td>EMERGENCY ROOM</td>\n",
                            "      <td>DIED</td>\n",
                            "      <td>Medicare</td>\n",
                            "      <td>ENGLISH</td>\n",
                            "      <td>SINGLE</td>\n",
                            "      <td>WHITE</td>\n",
                            "      <td>2138-02-05 15:44:00</td>\n",
                            "      <td>2138-02-05 18:54:00</td>\n",
                            "      <td>1</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>3</th>\n",
                            "      <td>4</td>\n",
                            "      <td>20817525</td>\n",
                            "      <td>1.674769</td>\n",
                            "      <td>12776735</td>\n",
                            "      <td>72</td>\n",
                            "      <td>2200-07-11 22:46:00</td>\n",
                            "      <td>2200-07-19 12:00:00</td>\n",
                            "      <td>NaN</td>\n",
                            "      <td>OBSERVATION ADMIT</td>\n",
                            "      <td>EMERGENCY ROOM</td>\n",
                            "      <td>SKILLED NURSING FACILITY</td>\n",
                            "      <td>Medicare</td>\n",
                            "      <td>ENGLISH</td>\n",
                            "      <td>MARRIED</td>\n",
                            "      <td>OTHER</td>\n",
                            "      <td>2200-07-11 15:27:00</td>\n",
                            "      <td>2200-07-12 00:33:00</td>\n",
                            "      <td>0</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>4</th>\n",
                            "      <td>5</td>\n",
                            "      <td>24283593</td>\n",
                            "      <td>1.292697</td>\n",
                            "      <td>10215159</td>\n",
                            "      <td>67</td>\n",
                            "      <td>2124-09-20 15:04:00</td>\n",
                            "      <td>2124-09-26 14:30:00</td>\n",
                            "      <td>NaN</td>\n",
                            "      <td>EW EMER.</td>\n",
                            "      <td>EMERGENCY ROOM</td>\n",
                            "      <td>HOME</td>\n",
                            "      <td>Medicare</td>\n",
                            "      <td>ENGLISH</td>\n",
                            "      <td>DIVORCED</td>\n",
                            "      <td>WHITE</td>\n",
                            "      <td>2124-09-20 12:52:00</td>\n",
                            "      <td>2124-09-20 17:21:00</td>\n",
                            "      <td>0</td>\n",
                            "    </tr>\n",
                            "  </tbody>\n",
                            "</table>\n",
                            "</div>"
                        ],
                        "text/plain": [
                            "   Unnamed: 0   hadm_id   icu_los  subject_id  anchor_age  \\\n",
                            "0           0  24528534  1.587454    17867402          25   \n",
                            "1           1  28960964  3.025625    14435996          42   \n",
                            "2           2  27385897  9.741725    17609946          70   \n",
                            "3           4  20817525  1.674769    12776735          72   \n",
                            "4           5  24283593  1.292697    10215159          67   \n",
                            "\n",
                            "             admittime            dischtime            deathtime  \\\n",
                            "0  2154-03-03 03:09:00  2154-03-04 16:30:00                  NaN   \n",
                            "1  2150-06-19 13:07:00  2150-06-25 15:36:00                  NaN   \n",
                            "2  2138-02-05 17:42:00  2138-02-15 11:00:00  2138-02-15 11:00:00   \n",
                            "3  2200-07-11 22:46:00  2200-07-19 12:00:00                  NaN   \n",
                            "4  2124-09-20 15:04:00  2124-09-26 14:30:00                  NaN   \n",
                            "\n",
                            "      admission_type      admission_location        discharge_location  \\\n",
                            "0           EW EMER.          EMERGENCY ROOM                      HOME   \n",
                            "1  OBSERVATION ADMIT  TRANSFER FROM HOSPITAL                      HOME   \n",
                            "2  OBSERVATION ADMIT          EMERGENCY ROOM                      DIED   \n",
                            "3  OBSERVATION ADMIT          EMERGENCY ROOM  SKILLED NURSING FACILITY   \n",
                            "4           EW EMER.          EMERGENCY ROOM                      HOME   \n",
                            "\n",
                            "  insurance language marital_status         ethnicity            edregtime  \\\n",
                            "0  Medicaid  ENGLISH            NaN  UNABLE TO OBTAIN  2154-03-03 02:49:00   \n",
                            "1  Medicaid  ENGLISH         SINGLE             WHITE  2150-06-19 11:54:00   \n",
                            "2  Medicare  ENGLISH         SINGLE             WHITE  2138-02-05 15:44:00   \n",
                            "3  Medicare  ENGLISH        MARRIED             OTHER  2200-07-11 15:27:00   \n",
                            "4  Medicare  ENGLISH       DIVORCED             WHITE  2124-09-20 12:52:00   \n",
                            "\n",
                            "             edouttime  hospital_expire_flag  \n",
                            "0  2154-03-03 04:11:00                     0  \n",
                            "1  2150-06-19 17:57:00                     0  \n",
                            "2  2138-02-05 18:54:00                     1  \n",
                            "3  2200-07-12 00:33:00                     0  \n",
                            "4  2124-09-20 17:21:00                     0  "
                        ]
                    },
                    "execution_count": 5,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "adm = pd.read_csv(path_temp + '/processed/tables/admissions_processed.csv')\n",
                "adm.head()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 6,
            "metadata": {},
            "outputs": [
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py:3063: DtypeWarning: Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.\n",
                        "  interactivity=interactivity, compiler=compiler, result=result)\n",
                        "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py:3063: DtypeWarning: Columns (9) have mixed types. Specify dtype option on import or set low_memory=False.\n",
                        "  interactivity=interactivity, compiler=compiler, result=result)\n",
                        "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py:3063: DtypeWarning: Columns (9,12) have mixed types. Specify dtype option on import or set low_memory=False.\n",
                        "  interactivity=interactivity, compiler=compiler, result=result)\n",
                        "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py:3063: DtypeWarning: Columns (7,9,12) have mixed types. Specify dtype option on import or set low_memory=False.\n",
                        "  interactivity=interactivity, compiler=compiler, result=result)\n",
                        "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py:3063: DtypeWarning: Columns (7,12) have mixed types. Specify dtype option on import or set low_memory=False.\n",
                        "  interactivity=interactivity, compiler=compiler, result=result)\n"
                    ]
                }
            ],
            "source": [
                "df = pd.DataFrame()\n",
                "for chunk in pd.read_csv(path_data + '/hosp/labevents.csv.gz', chunksize=500000):\n",
                "    adm_ids=list(adm['hadm_id'])\n",
                "    chunk=chunk.loc[chunk['hadm_id'].isin(adm_ids)]\n",
                "    df = df.append(chunk[['subject_id','hadm_id','charttime','valuenum','itemid']])"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 7,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Number of patients remaining in the database: \n",
                        "43956\n"
                    ]
                }
            ],
            "source": [
                "# only choose previously selected admission ids.\n",
                "print('Number of patients remaining in the database: ')\n",
                "print(df['subject_id'].nunique())"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 8,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/html": [
                            "<div>\n",
                            "<style scoped>\n",
                            "    .dataframe tbody tr th:only-of-type {\n",
                            "        vertical-align: middle;\n",
                            "    }\n",
                            "\n",
                            "    .dataframe tbody tr th {\n",
                            "        vertical-align: top;\n",
                            "    }\n",
                            "\n",
                            "    .dataframe thead th {\n",
                            "        text-align: right;\n",
                            "    }\n",
                            "</style>\n",
                            "<table border=\"1\" class=\"dataframe\">\n",
                            "  <thead>\n",
                            "    <tr style=\"text-align: right;\">\n",
                            "      <th></th>\n",
                            "      <th>itemid</th>\n",
                            "      <th>label</th>\n",
                            "    </tr>\n",
                            "  </thead>\n",
                            "  <tbody>\n",
                            "    <tr>\n",
                            "      <th>0</th>\n",
                            "      <td>51905</td>\n",
                            "      <td></td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>1</th>\n",
                            "      <td>51532</td>\n",
                            "      <td>11-Deoxycorticosterone</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>2</th>\n",
                            "      <td>51957</td>\n",
                            "      <td>17-Hydroxycorticosteroids</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>3</th>\n",
                            "      <td>51958</td>\n",
                            "      <td>17-Ketosteroids, Urine</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>4</th>\n",
                            "      <td>52068</td>\n",
                            "      <td>24 Hr</td>\n",
                            "    </tr>\n",
                            "  </tbody>\n",
                            "</table>\n",
                            "</div>"
                        ],
                        "text/plain": [
                            "   itemid                      label\n",
                            "0   51905                           \n",
                            "1   51532     11-Deoxycorticosterone\n",
                            "2   51957  17-Hydroxycorticosteroids\n",
                            "3   51958     17-Ketosteroids, Urine\n",
                            "4   52068                      24 Hr"
                        ]
                    },
                    "execution_count": 8,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "# get item ids\n",
                "item_id=pd.read_csv(path_data + '/hosp/d_labitems.csv.gz')\n",
                "item_id_1=item_id[['itemid','label']]\n",
                "item_id_1.head()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 9,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Number of patients remaining in the database: \n",
                        "43956\n"
                    ]
                }
            ],
            "source": [
                "# get names of administered items\n",
                "lab2=pd.merge(df,item_id_1,on='itemid')\n",
                "lab2.head()\n",
                "print('Number of patients remaining in the database: ')\n",
                "print(lab2['subject_id'].nunique())"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 10,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Number of patients remaining in the database: \n",
                        "43955\n"
                    ]
                }
            ],
            "source": [
                "# get only top 150 most used tests\n",
                "n_best=150\n",
                "pat_for_item=lab2.groupby('label')['subject_id'].nunique()\n",
                "frequent_labels=pat_for_item.sort_values(ascending=False)[:n_best]\n",
                "lab3=lab2.loc[lab2['label'].isin(list(frequent_labels.index))].copy()\n",
                "\n",
                "print('Number of patients remaining in the database: ')\n",
                "print(lab3['subject_id'].nunique())"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 11,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": [
                            "label\n",
                            "Glucose                            43929\n",
                            "Sodium                             43920\n",
                            "Potassium                          43920\n",
                            "Chloride                           43919\n",
                            "Creatinine                         43918\n",
                            "Anion Gap                          43916\n",
                            "Urea Nitrogen                      43916\n",
                            "Bicarbonate                        43916\n",
                            "Hematocrit                         43907\n",
                            "Platelet Count                     43904\n",
                            "Hemoglobin                         43902\n",
                            "White Blood Cells                  43901\n",
                            "MCV                                43901\n",
                            "MCHC                               43901\n",
                            "MCH                                43901\n",
                            "RDW                                43901\n",
                            "Red Blood Cells                    43901\n",
                            "Magnesium                          43852\n",
                            "Phosphate                          43057\n",
                            "Calcium, Total                     43012\n",
                            "PT                                 41879\n",
                            "INR(PT)                            41879\n",
                            "PTT                                41754\n",
                            "pH                                 38303\n",
                            "Specimen Type                      34400\n",
                            "Lactate                            33439\n",
                            "Base Excess                        32903\n",
                            "Calculated Total CO2               32899\n",
                            "pCO2                               32898\n",
                            "pO2                                32898\n",
                            "Estimated GFR (MDRD equation)      32804\n",
                            "Alanine Aminotransferase (ALT)     30591\n",
                            "Asparate Aminotransferase (AST)    30565\n",
                            "Bilirubin, Total                   30420\n",
                            "Alkaline Phosphatase               30230\n",
                            "Lymphocytes                        29154\n",
                            "Monocytes                          29102\n",
                            "Eosinophils                        29002\n",
                            "Ketone                             28805\n",
                            "Nitrite                            28805\n",
                            "Leukocytes                         28805\n",
                            "Blood                              28805\n",
                            "Urobilinogen                       28805\n",
                            "Bilirubin                          28805\n",
                            "Urine Appearance                   28805\n",
                            "Specific Gravity                   28805\n",
                            "Protein                            28805\n",
                            "Basophils                          28657\n",
                            "Neutrophils                        28624\n",
                            "Free Calcium                       27070\n",
                            "Name: subject_id, dtype: int64"
                        ]
                    },
                    "execution_count": 11,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "frequent_labels.head(50)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 12,
            "metadata": {},
            "outputs": [],
            "source": [
                "# only select the subset that was used in the paper (only missing is INR(PT))\n",
                "subset=['Albumin','Alanine Aminotransferase (ALT)','Alkaline Phosphatase','Anion Gap','Asparate Aminotransferase (AST)','Base Excess','Basophils','Bicarbonate','Bilirubin, Total','Calcium, Total','Calculated Total CO2','Chloride','Creatinine','Eosinophils','Glucose','Hematocrit','Hemoglobin',\n",
                "'Lactate','Lymphocytes','MCH','MCV','Magnesium','Monocytes','Neutrophils','PT','PTT','Phosphate','Platelet Count','Potassium','RDW','Red Blood Cells','Sodium','Specific Gravity','Urea Nitrogen','White Blood Cells','pCO2','pH','pO2']\n",
                "\n",
                "lab3=lab3.loc[lab3['label'].isin(subset)].copy()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 13,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": [
                            "array(['PT', 'PTT', 'Basophils', 'Eosinophils', 'Hematocrit',\n",
                            "       'Hemoglobin', 'Lymphocytes', 'MCH', 'MCV', 'Monocytes',\n",
                            "       'Neutrophils', 'RDW', 'Red Blood Cells', 'White Blood Cells',\n",
                            "       'Anion Gap', 'Calcium, Total', 'Chloride', 'Creatinine',\n",
                            "       'Magnesium', 'Phosphate', 'Potassium', 'Urea Nitrogen',\n",
                            "       'Base Excess', 'Calculated Total CO2', 'pCO2', 'pO2', 'Lactate',\n",
                            "       'Alanine Aminotransferase (ALT)', 'Alkaline Phosphatase',\n",
                            "       'Asparate Aminotransferase (AST)', 'Bilirubin, Total',\n",
                            "       'Platelet Count', 'pH', 'Bicarbonate', 'Sodium', 'Albumin',\n",
                            "       'Specific Gravity', 'Glucose'], dtype=object)"
                        ]
                    },
                    "execution_count": 13,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "lab3['label'].unique()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 14,
            "metadata": {},
            "outputs": [],
            "source": [
                "lab3.to_csv(path_temp + '/processed/tables/lab_processed.csv')"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": []
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3 (ipykernel)",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.7.3"
        },
        "vscode": {
            "interpreter": {
                "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
            }
        }
    },
    "nbformat": 4,
    "nbformat_minor": 4
}