{
    "cells": [
        {
            "attachments": {},
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "# MIMIC 4 data - dataset construction chartevents"
            ]
        },
        {
            "attachments": {},
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "Code taken from GRU-ODE-Bayes preprocessing; simplified and adapted for MIMIC 4 1.0"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 1,
            "metadata": {},
            "outputs": [],
            "source": [
                "import os\n",
                "import pathlib\n",
                "\n",
                "p_project = str(pathlib.Path(os.getcwd()).parents[1])"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 2,
            "metadata": {},
            "outputs": [],
            "source": [
                "import pandas as pd\n",
                "from datetime import datetime\n",
                "from datetime import timedelta\n",
                "import numpy as np"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 3,
            "metadata": {},
            "outputs": [],
            "source": [
                "path_mimic = p_project + '/data/mimic4'"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 4,
            "metadata": {},
            "outputs": [],
            "source": [
                "adm = pd.read_csv(path_mimic + '/processed/admissions_processed.csv')"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 5,
            "metadata": {},
            "outputs": [],
            "source": [
                "# only choose previously selected admission ids\n",
                "adm_ids=list(adm['hadm_id'])\n",
                "charts = pd.DataFrame()\n",
                "for chunk in pd.read_csv(path_mimic + '/raw/icu/chartevents.csv', chunksize=10000000, low_memory=False):\n",
                "    chunk=chunk.loc[chunk['hadm_id'].isin(adm_ids)]\n",
                "    charts = pd.concat([charts, chunk[['subject_id', 'hadm_id', 'charttime', 'itemid', 'valuenum', 'valueuom']]])\n",
                "    \n",
                "charts = charts.reset_index(drop=True)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 6,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Number of patients remaining in the database: \n",
                        "44057\n"
                    ]
                }
            ],
            "source": [
                "# only choose previously selected admission ids.\n",
                "print('Number of patients remaining in the database: ')\n",
                "print(charts['subject_id'].nunique())"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 7,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/html": [
                            "<div>\n",
                            "<style scoped>\n",
                            "    .dataframe tbody tr th:only-of-type {\n",
                            "        vertical-align: middle;\n",
                            "    }\n",
                            "\n",
                            "    .dataframe tbody tr th {\n",
                            "        vertical-align: top;\n",
                            "    }\n",
                            "\n",
                            "    .dataframe thead th {\n",
                            "        text-align: right;\n",
                            "    }\n",
                            "</style>\n",
                            "<table border=\"1\" class=\"dataframe\">\n",
                            "  <thead>\n",
                            "    <tr style=\"text-align: right;\">\n",
                            "      <th></th>\n",
                            "      <th>subject_id</th>\n",
                            "      <th>hadm_id</th>\n",
                            "      <th>charttime</th>\n",
                            "      <th>itemid</th>\n",
                            "      <th>valuenum</th>\n",
                            "      <th>valueuom</th>\n",
                            "      <th>label</th>\n",
                            "    </tr>\n",
                            "  </thead>\n",
                            "  <tbody>\n",
                            "    <tr>\n",
                            "      <th>0</th>\n",
                            "      <td>10004235</td>\n",
                            "      <td>24181354</td>\n",
                            "      <td>2196-02-24 14:53:00</td>\n",
                            "      <td>220224</td>\n",
                            "      <td>125.0</td>\n",
                            "      <td>mmHg</td>\n",
                            "      <td>Arterial O2 pressure</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>1</th>\n",
                            "      <td>10004235</td>\n",
                            "      <td>24181354</td>\n",
                            "      <td>2196-02-24 16:25:00</td>\n",
                            "      <td>220224</td>\n",
                            "      <td>108.0</td>\n",
                            "      <td>mmHg</td>\n",
                            "      <td>Arterial O2 pressure</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>2</th>\n",
                            "      <td>10004235</td>\n",
                            "      <td>24181354</td>\n",
                            "      <td>2196-02-24 17:46:00</td>\n",
                            "      <td>220224</td>\n",
                            "      <td>98.0</td>\n",
                            "      <td>mmHg</td>\n",
                            "      <td>Arterial O2 pressure</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>3</th>\n",
                            "      <td>10004235</td>\n",
                            "      <td>24181354</td>\n",
                            "      <td>2196-02-24 19:10:00</td>\n",
                            "      <td>220224</td>\n",
                            "      <td>191.0</td>\n",
                            "      <td>mmHg</td>\n",
                            "      <td>Arterial O2 pressure</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>4</th>\n",
                            "      <td>10004235</td>\n",
                            "      <td>24181354</td>\n",
                            "      <td>2196-02-24 20:54:00</td>\n",
                            "      <td>220224</td>\n",
                            "      <td>165.0</td>\n",
                            "      <td>mmHg</td>\n",
                            "      <td>Arterial O2 pressure</td>\n",
                            "    </tr>\n",
                            "  </tbody>\n",
                            "</table>\n",
                            "</div>"
                        ],
                        "text/plain": [
                            "   subject_id   hadm_id            charttime  itemid  valuenum valueuom  \\\n",
                            "0    10004235  24181354  2196-02-24 14:53:00  220224     125.0     mmHg   \n",
                            "1    10004235  24181354  2196-02-24 16:25:00  220224     108.0     mmHg   \n",
                            "2    10004235  24181354  2196-02-24 17:46:00  220224      98.0     mmHg   \n",
                            "3    10004235  24181354  2196-02-24 19:10:00  220224     191.0     mmHg   \n",
                            "4    10004235  24181354  2196-02-24 20:54:00  220224     165.0     mmHg   \n",
                            "\n",
                            "                  label  \n",
                            "0  Arterial O2 pressure  \n",
                            "1  Arterial O2 pressure  \n",
                            "2  Arterial O2 pressure  \n",
                            "3  Arterial O2 pressure  \n",
                            "4  Arterial O2 pressure  "
                        ]
                    },
                    "execution_count": 7,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "# get item ids\n",
                "item_id=pd.read_csv(path_mimic + '/raw/icu/d_items.csv.gz')[['itemid','label']]\n",
                "\n",
                "# get names of administered items\n",
                "charts2=pd.merge(charts, item_id, on='itemid')\n",
                "charts2.head()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 8,
            "metadata": {},
            "outputs": [],
            "source": [
                "\n",
                "pat_for_item=charts2.groupby('label')['subject_id'].nunique().sort_values(ascending=False)\n",
                "\n",
                "label_counts=charts2['label'].value_counts()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 9,
            "metadata": {},
            "outputs": [],
            "source": [
                "# get only top 50 most used tests\n",
                "frequent_labels1=pat_for_item[:100]\n",
                "frequent_labels2=label_counts.head(200)\n",
                "\n",
                "fre_labels = frequent_labels1.loc[frequent_labels1.index.isin(frequent_labels2.index)]"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 10,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": [
                            "Index(['Heart Rate', 'O2 saturation pulseoxymetry', 'GCS - Eye Opening',\n",
                            "       'GCS - Verbal Response', 'GCS - Motor Response', 'Respiratory Rate',\n",
                            "       'Alarms On', 'Heart Rate Alarm - Low', 'Heart rate Alarm - High',\n",
                            "       'Head of Bed', 'O2 Saturation Pulseoxymetry Alarm - Low',\n",
                            "       'Skin Integrity', 'LUL Lung Sounds', 'RUL Lung Sounds',\n",
                            "       'Skin Temperature', 'RLL Lung Sounds', 'LLL Lung Sounds',\n",
                            "       'O2 Saturation Pulseoxymetry Alarm - High', 'Resp Alarm - High',\n",
                            "       'Skin Condition', 'Braden Sensory Perception', 'Braden Moisture',\n",
                            "       'Braden Mobility', 'Braden Activity', 'Resp Alarm - Low',\n",
                            "       'Braden Nutrition', 'Turn', 'Abdominal Assessment',\n",
                            "       'Braden Friction/Shear', 'Heart Rhythm', 'Parameters Checked',\n",
                            "       'Bowel Sounds', 'Skin Color', 'Oral Cavity', 'Activity Tolerance',\n",
                            "       'Temperature Site', 'SpO2 Desat Limit', 'Diet Type', 'Urine Source',\n",
                            "       'Pain Assessment Method', 'IV/Saline lock', 'Ambulatory aid',\n",
                            "       'Gait/Transferring', 'Secondary diagnosis', 'Potassium (serum)',\n",
                            "       'Mental status', 'Edema Location', 'O2 Delivery Device(s)',\n",
                            "       'History of falling (within 3 mnths)', 'Pain Present', 'Position',\n",
                            "       'Dorsal PedPulse R', 'Dorsal PedPulse L', 'Safety Measures',\n",
                            "       'Non Invasive Blood Pressure diastolic',\n",
                            "       'Non Invasive Blood Pressure systolic',\n",
                            "       'Non Invasive Blood Pressure mean', 'Therapeutic Bed',\n",
                            "       'Temperature Fahrenheit', 'Ectopy Type 1', 'Urine Color',\n",
                            "       'Daily Wake Up', 'Pain Location', 'Cough Effort', 'Urine Appearance',\n",
                            "       'Speech', 'Education Learner', 'Education Topic', 'Education Method',\n",
                            "       'Education Barrier', 'Education Response', 'PostTib. Pulses R',\n",
                            "       'PostTib. Pulses L', 'Pain Level Acceptable', 'Pain Level',\n",
                            "       'Support Systems', 'Non-Invasive Blood Pressure Alarm - Low',\n",
                            "       'Non-Invasive Blood Pressure Alarm - High', 'NBP Alarm Source',\n",
                            "       'Pupil Size Right', 'Pupil Size Left', 'Pupil Response Right',\n",
                            "       'Pupil Response Left', 'Pain Management', 'Side Rails'],\n",
                            "      dtype='object', name='label')"
                        ]
                    },
                    "execution_count": 10,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "fre_labels.index"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 11,
            "metadata": {},
            "outputs": [],
            "source": [
                "mask_df = charts2['label'].isin(fre_labels.index)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 12,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": [
                            "128331422"
                        ]
                    },
                    "execution_count": 12,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "mask_df.sum()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 13,
            "metadata": {},
            "outputs": [],
            "source": [
                "charts3=charts2.loc[mask_df]"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 14,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": [
                            "(128331422, 7)"
                        ]
                    },
                    "execution_count": 14,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "charts3.shape"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 15,
            "metadata": {},
            "outputs": [
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "/tmp/ipykernel_3896464/715067778.py:1: SettingWithCopyWarning: \n",
                        "A value is trying to be set on a copy of a slice from a DataFrame\n",
                        "\n",
                        "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
                        "  charts3.dropna(inplace=True)\n"
                    ]
                }
            ],
            "source": [
                "charts3.dropna(inplace=True)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 16,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": [
                            "(36835389, 7)"
                        ]
                    },
                    "execution_count": 16,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "charts3.shape"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 17,
            "metadata": {},
            "outputs": [],
            "source": [
                "charts3.to_csv(path_mimic + '/processed/tables/charts_processed.csv')"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 18,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": [
                            "array(['Heart Rate', 'Respiratory Rate', 'O2 saturation pulseoxymetry',\n",
                            "       'Heart rate Alarm - High', 'Heart Rate Alarm - Low',\n",
                            "       'O2 Saturation Pulseoxymetry Alarm - High',\n",
                            "       'O2 Saturation Pulseoxymetry Alarm - Low', 'Resp Alarm - High',\n",
                            "       'Resp Alarm - Low', 'SpO2 Desat Limit', 'Potassium (serum)',\n",
                            "       'Non Invasive Blood Pressure systolic',\n",
                            "       'Non Invasive Blood Pressure diastolic',\n",
                            "       'Non Invasive Blood Pressure mean', 'Temperature Fahrenheit',\n",
                            "       'Non-Invasive Blood Pressure Alarm - High',\n",
                            "       'Non-Invasive Blood Pressure Alarm - Low'], dtype=object)"
                        ]
                    },
                    "execution_count": 18,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "charts3['label'].unique()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 19,
            "metadata": {},
            "outputs": [],
            "source": [
                "itemid_valueuom = charts3.groupby([\"itemid\"])[\"valueuom\"].unique()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 23,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": [
                            "itemid\n",
                            "220045         [bpm]\n",
                            "220046         [bpm]\n",
                            "220047         [bpm]\n",
                            "220179        [mmHg]\n",
                            "220180        [mmHg]\n",
                            "220181        [mmHg]\n",
                            "220210    [insp/min]\n",
                            "220277           [%]\n",
                            "223751        [mmHg]\n",
                            "223752        [mmHg]\n",
                            "223761          [°F]\n",
                            "223769           [%]\n",
                            "223770           [%]\n",
                            "224161    [insp/min]\n",
                            "224162    [insp/min]\n",
                            "226253           [%]\n",
                            "227442       [mEq/L]\n",
                            "Name: valueuom, dtype: object"
                        ]
                    },
                    "execution_count": 23,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "itemid_valueuom"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": []
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": []
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3 (ipykernel)",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.10.8"
        },
        "vscode": {
            "interpreter": {
                "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
            }
        }
    },
    "nbformat": 4,
    "nbformat_minor": 4
}
