{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# autoreload\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "mimic_iv_notes_parent = \"/cis/home/charr165/Documents/physionet.org/files/mimic-iv-note/2.2/note\"\n",
    "mimic_iv_path = \"/cis/home/charr165/Documents/physionet.org/mimiciv/2.2\"\n",
    "\n",
    "rad_notes_f_path = os.path.join(mimic_iv_notes_parent, \"radiology.csv\")\n",
    "rad_notes_df = pd.read_csv(rad_notes_f_path, low_memory=False)\n",
    "rad_notes_df['charttime'] = pd.to_datetime(rad_notes_df['charttime'])\n",
    "rad_notes_df['storetime'] = pd.to_datetime(rad_notes_df['storetime'])\n",
    "\n",
    "icustays_df = pd.read_csv(os.path.join(mimic_iv_path, \"icu\", \"icustays.csv\"), low_memory=False)\n",
    "icustays_df['intime'] = pd.to_datetime(icustays_df['intime'])\n",
    "icustays_df['outtime'] = pd.to_datetime(icustays_df['outtime'])\n",
    "\n",
    "admissions_df = pd.read_csv(os.path.join(mimic_iv_path, \"hosp\", \"admissions.csv\"), low_memory=False)\n",
    "admissions_df['admittime'] = pd.to_datetime(admissions_df['admittime'])\n",
    "admissions_df['dischtime'] = pd.to_datetime(admissions_df['dischtime'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/2321355 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2321355/2321355 [42:41<00:00, 906.40it/s] \n"
     ]
    }
   ],
   "source": [
    "rad_notes_df['hadm_id'] = None\n",
    "rad_notes_df['stay_id'] = None\n",
    "rad_notes_df['icu_time_delta'] = None\n",
    "rad_notes_df['hosp_time_delta'] = None\n",
    "\n",
    "def calc_time_delta_hrs(icu_intime, charttime):\n",
    "    return (charttime - icu_intime).total_seconds() / 3600\n",
    "\n",
    "for index, row in tqdm(rad_notes_df.iterrows(), total=rad_notes_df.shape[0]):\n",
    "    curr_pts_icustays = icustays_df[icustays_df['subject_id'] == row['subject_id']]\n",
    "    \n",
    "    for icu_index, icu_row in curr_pts_icustays.iterrows():\n",
    "        if icu_row['intime'] <= row['charttime'] <= icu_row['outtime']:\n",
    "            rad_notes_df.loc[index, 'stay_id'] = icu_row['stay_id']\n",
    "            rad_notes_df.loc[index, 'icu_time_delta'] = calc_time_delta_hrs(icu_row['intime'], row['charttime'])\n",
    "    \n",
    "    curr_pts_admissions = admissions_df[admissions_df['subject_id'] == row['subject_id']]\n",
    "\n",
    "    for hosp_index, hosp_row in curr_pts_admissions.iterrows():\n",
    "        if hosp_row['admittime'] <= row['charttime'] <= hosp_row['dischtime']:\n",
    "            rad_notes_df.loc[index, 'hadm_id'] = hosp_row['hadm_id']\n",
    "            rad_notes_df.loc[index, 'hosp_time_delta'] = calc_time_delta_hrs(hosp_row['admittime'], row['charttime'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2321355, 11)\n"
     ]
    }
   ],
   "source": [
    "mm_dir = \"/cis/home/charr165/Documents/multimodal\"\n",
    "output_dir = os.path.join(mm_dir, \"preprocessing\")\n",
    "\n",
    "rad_notes_df.to_pickle(os.path.join(output_dir, \"notes_text.pkl\"))\n",
    "print(rad_notes_df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>note_id</th>\n",
       "      <th>subject_id</th>\n",
       "      <th>hadm_id</th>\n",
       "      <th>note_type</th>\n",
       "      <th>note_seq</th>\n",
       "      <th>charttime</th>\n",
       "      <th>storetime</th>\n",
       "      <th>text</th>\n",
       "      <th>stay_id</th>\n",
       "      <th>icu_time_delta</th>\n",
       "      <th>hosp_time_delta</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10000032-RR-14</td>\n",
       "      <td>10000032</td>\n",
       "      <td>None</td>\n",
       "      <td>RR</td>\n",
       "      <td>14</td>\n",
       "      <td>2180-05-06 21:19:00</td>\n",
       "      <td>2180-05-06 23:32:00</td>\n",
       "      <td>EXAMINATION:  CHEST (PA AND LAT)\\n\\nINDICATION...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10000032-RR-15</td>\n",
       "      <td>10000032</td>\n",
       "      <td>22595853</td>\n",
       "      <td>RR</td>\n",
       "      <td>15</td>\n",
       "      <td>2180-05-06 23:00:00</td>\n",
       "      <td>2180-05-06 23:26:00</td>\n",
       "      <td>EXAMINATION:  LIVER OR GALLBLADDER US (SINGLE ...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>0.616667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10000032-RR-16</td>\n",
       "      <td>10000032</td>\n",
       "      <td>22595853</td>\n",
       "      <td>RR</td>\n",
       "      <td>16</td>\n",
       "      <td>2180-05-07 09:55:00</td>\n",
       "      <td>2180-05-07 11:15:00</td>\n",
       "      <td>INDICATION:  ___ HCV cirrhosis c/b ascites, hi...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>11.533333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10000032-RR-18</td>\n",
       "      <td>10000032</td>\n",
       "      <td>None</td>\n",
       "      <td>RR</td>\n",
       "      <td>18</td>\n",
       "      <td>2180-06-03 12:46:00</td>\n",
       "      <td>2180-06-03 14:01:00</td>\n",
       "      <td>EXAMINATION:  Ultrasound-guided paracentesis.\\...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10000032-RR-20</td>\n",
       "      <td>10000032</td>\n",
       "      <td>None</td>\n",
       "      <td>RR</td>\n",
       "      <td>20</td>\n",
       "      <td>2180-07-08 13:18:00</td>\n",
       "      <td>2180-07-08 14:15:00</td>\n",
       "      <td>EXAMINATION:  Paracentesis\\n\\nINDICATION:  ___...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2321350</th>\n",
       "      <td>19999987-RR-17</td>\n",
       "      <td>19999987</td>\n",
       "      <td>23865745</td>\n",
       "      <td>RR</td>\n",
       "      <td>17</td>\n",
       "      <td>2145-11-02 22:37:00</td>\n",
       "      <td>2145-11-03 18:55:00</td>\n",
       "      <td>HISTORY:  ___, with left occipital bleeding.  ...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>0.983333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2321351</th>\n",
       "      <td>19999987-RR-18</td>\n",
       "      <td>19999987</td>\n",
       "      <td>23865745</td>\n",
       "      <td>RR</td>\n",
       "      <td>18</td>\n",
       "      <td>2145-11-03 04:35:00</td>\n",
       "      <td>2145-11-03 10:46:00</td>\n",
       "      <td>INDICATION:  ___ female intubated for head ble...</td>\n",
       "      <td>36195440</td>\n",
       "      <td>5.6</td>\n",
       "      <td>6.95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2321352</th>\n",
       "      <td>19999987-RR-19</td>\n",
       "      <td>19999987</td>\n",
       "      <td>23865745</td>\n",
       "      <td>RR</td>\n",
       "      <td>19</td>\n",
       "      <td>2145-11-03 16:40:00</td>\n",
       "      <td>2145-11-04 08:36:00</td>\n",
       "      <td>HISTORY:  ___ woman with left occipital hemorr...</td>\n",
       "      <td>36195440</td>\n",
       "      <td>17.683333</td>\n",
       "      <td>19.033333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2321353</th>\n",
       "      <td>19999987-RR-20</td>\n",
       "      <td>19999987</td>\n",
       "      <td>23865745</td>\n",
       "      <td>RR</td>\n",
       "      <td>20</td>\n",
       "      <td>2145-11-04 05:10:00</td>\n",
       "      <td>2145-11-04 08:58:00</td>\n",
       "      <td>PORTABLE CHEST OF ___\\n\\nCOMPARISON:  ___ radi...</td>\n",
       "      <td>36195440</td>\n",
       "      <td>30.183333</td>\n",
       "      <td>31.533333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2321354</th>\n",
       "      <td>19999987-RR-21</td>\n",
       "      <td>19999987</td>\n",
       "      <td>23865745</td>\n",
       "      <td>RR</td>\n",
       "      <td>21</td>\n",
       "      <td>2145-11-07 15:18:00</td>\n",
       "      <td>2145-11-08 16:44:00</td>\n",
       "      <td>DATE OF SERVICE:  ___.\\n\\nPRE-OPERATIVE DIAGNO...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>113.666667</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2321355 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                note_id  subject_id   hadm_id note_type  note_seq  \\\n",
       "0        10000032-RR-14    10000032      None        RR        14   \n",
       "1        10000032-RR-15    10000032  22595853        RR        15   \n",
       "2        10000032-RR-16    10000032  22595853        RR        16   \n",
       "3        10000032-RR-18    10000032      None        RR        18   \n",
       "4        10000032-RR-20    10000032      None        RR        20   \n",
       "...                 ...         ...       ...       ...       ...   \n",
       "2321350  19999987-RR-17    19999987  23865745        RR        17   \n",
       "2321351  19999987-RR-18    19999987  23865745        RR        18   \n",
       "2321352  19999987-RR-19    19999987  23865745        RR        19   \n",
       "2321353  19999987-RR-20    19999987  23865745        RR        20   \n",
       "2321354  19999987-RR-21    19999987  23865745        RR        21   \n",
       "\n",
       "                  charttime           storetime  \\\n",
       "0       2180-05-06 21:19:00 2180-05-06 23:32:00   \n",
       "1       2180-05-06 23:00:00 2180-05-06 23:26:00   \n",
       "2       2180-05-07 09:55:00 2180-05-07 11:15:00   \n",
       "3       2180-06-03 12:46:00 2180-06-03 14:01:00   \n",
       "4       2180-07-08 13:18:00 2180-07-08 14:15:00   \n",
       "...                     ...                 ...   \n",
       "2321350 2145-11-02 22:37:00 2145-11-03 18:55:00   \n",
       "2321351 2145-11-03 04:35:00 2145-11-03 10:46:00   \n",
       "2321352 2145-11-03 16:40:00 2145-11-04 08:36:00   \n",
       "2321353 2145-11-04 05:10:00 2145-11-04 08:58:00   \n",
       "2321354 2145-11-07 15:18:00 2145-11-08 16:44:00   \n",
       "\n",
       "                                                      text   stay_id  \\\n",
       "0        EXAMINATION:  CHEST (PA AND LAT)\\n\\nINDICATION...      None   \n",
       "1        EXAMINATION:  LIVER OR GALLBLADDER US (SINGLE ...      None   \n",
       "2        INDICATION:  ___ HCV cirrhosis c/b ascites, hi...      None   \n",
       "3        EXAMINATION:  Ultrasound-guided paracentesis.\\...      None   \n",
       "4        EXAMINATION:  Paracentesis\\n\\nINDICATION:  ___...      None   \n",
       "...                                                    ...       ...   \n",
       "2321350  HISTORY:  ___, with left occipital bleeding.  ...      None   \n",
       "2321351  INDICATION:  ___ female intubated for head ble...  36195440   \n",
       "2321352  HISTORY:  ___ woman with left occipital hemorr...  36195440   \n",
       "2321353  PORTABLE CHEST OF ___\\n\\nCOMPARISON:  ___ radi...  36195440   \n",
       "2321354  DATE OF SERVICE:  ___.\\n\\nPRE-OPERATIVE DIAGNO...      None   \n",
       "\n",
       "        icu_time_delta hosp_time_delta  \n",
       "0                 None            None  \n",
       "1                 None        0.616667  \n",
       "2                 None       11.533333  \n",
       "3                 None            None  \n",
       "4                 None            None  \n",
       "...                ...             ...  \n",
       "2321350           None        0.983333  \n",
       "2321351            5.6            6.95  \n",
       "2321352      17.683333       19.033333  \n",
       "2321353      30.183333       31.533333  \n",
       "2321354           None      113.666667  \n",
       "\n",
       "[2321355 rows x 11 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rad_notes_df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
