{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from collections import Counter\n",
    "import pickle\n",
    "\n",
    "from preprocessing_util import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### April 2, 2024\n",
    "### Preprocessing algorithm:\n",
    "1. Subset NOTEEVENTS.csv by choosing a \"HADM_ID\" for each \"SUBJECT_ID\" by selection on lowest value of \"CHARTDATE\" (\"NOTEEVENTS.csv\")\n",
    "2. Output A. Subset NOTEEVENTS.csv \"HADM_ID\" (drop everyone without notes, drop CATEGORY='Discharge summary'). Keep \"CATEGORY\" as a column\n",
    "3. Output B. Left = DIAGNOSES_ICD.csv, Right=\"D_ICD_DIAGNOSES.csv\" (just \"ICD9_CODE\",\"SHORT_TITLE\"). Key=\"ICD9_CODE\". left join\n",
    "4. Joining Output A & B.\n",
    "    1. Set A. Set of HADM_IDs from OutputA\n",
    "    2. Row by row in Output B. Check if \"HADM_ID\" is in Set A. If yes, add to Counter.\n",
    "    3. Counter/Dictionary: {\"ICD9_CODE\": Count of unique HADM_IDs from SetA with that code} (use Counter in Python)\n",
    "    4. Output: {\"58281\": 10, \"2762\": 100} -> {\"TB pneumonia-oth test: 10, ...} Sorted by value in dict\n",
    "    5. Eventually: Choose top 10 or so ICD9 and join w/ output A "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "notes = pd.read_csv('../../MIMIC-III/NOTEEVENTS.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "notes.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sort all of the notes by chart date so that we are only keeping the row of data with the smallest chart date value\n",
    "notes = notes.sort_values(by='CHARTDATE')\n",
    "notes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note: There are some rows of data where the HADM_ID is nan. I am going to ignore those by dropping rows of data with nan HADM_ID."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop rows of data where the hospital admission ID is nan\n",
    "notes = notes.dropna(subset=['HADM_ID'])\n",
    "notes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# find all the unique subjects\n",
    "unique_subjects = set(notes['SUBJECT_ID'])\n",
    "len(unique_subjects)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# find the hospital admission id's that correspond to the first appearance for each subject\n",
    "# the first appearance corresponds to the lowest value of CHARTDATE since we sorted it previously\n",
    "first_hadm_ids = []\n",
    "\n",
    "for subject in unique_subjects:\n",
    "    # only considering a unique subject in the dataframe\n",
    "    temp = notes[notes['SUBJECT_ID'] == subject]\n",
    "\n",
    "    # identify the first (also the earliest according to CHARTDATE because we have sorted by that already)\n",
    "    # HADM_ID for this subject\n",
    "    first_hadm_ids.append(temp.iloc[0]['HADM_ID'])\n",
    "\n",
    "first_hadm_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(first_hadm_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# subset notes to keep only the first hospital admission id for each patient based off of CHARTDATE\n",
    "notes_subset = notes[notes['HADM_ID'].isin(first_hadm_ids)]\n",
    "\n",
    "notes_subset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop all notes that are discharge summaries\n",
    "output_A = notes_subset[notes_subset['CATEGORY'] != 'Discharge summary']\n",
    "output_A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save output_A as a csv file\n",
    "output_A.to_csv('output_A.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "diagnoses_icd = pd.read_csv('../../MIMIC-III/DIAGNOSES_ICD.csv')\n",
    "diagnoses_icd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "diagnoses_icd_key = pd.read_csv('../../MIMIC-III/D_ICD_DIAGNOSES.csv')\n",
    "diagnoses_icd_key = diagnoses_icd_key.drop(columns=['LONG_TITLE', 'ROW_ID'])\n",
    "diagnoses_icd_key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# do a left join based on ICD9_CODE so that we append the short title to each row\n",
    "output_B = diagnoses_icd.merge(diagnoses_icd_key, how='left', on='ICD9_CODE')\n",
    "output_B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop nan values for the short title column\n",
    "output_B = output_B.dropna(subset=['SHORT_TITLE'])\n",
    "output_B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save output_B as a csv file\n",
    "output_B.to_csv('output_B.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# declare a counter so that we can keep track of which diagnoses appear the most often\n",
    "# in the output_A dataframe\n",
    "diagnosis_counter = Counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# go through each row in output_B; if the HADM_ID appears in first_hadm_ids, update the value of the counter\n",
    "for index, row in output_B.iterrows():\n",
    "    if row['HADM_ID'] in first_hadm_ids:\n",
    "        diagnosis_counter[row['SHORT_TITLE']] += 1\n",
    "\n",
    "diagnosis_counter.most_common(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "pickle.dump(diagnosis_counter, open('diagnosis_counter.p', 'wb'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Take those top 10 categories and add them as columns to a dataframe."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_diagnoses = diagnosis_counter.most_common(10)\n",
    "\n",
    "# create dataframe for recording whether each patient received a specific diagnosis\n",
    "diagnoses_df = output_A[['HADM_ID', 'SUBJECT_ID']].drop_duplicates()\n",
    "\n",
    "diagnoses_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# next 10 columns are whether they received a diagnosis for the top 10 most common diagnoses\n",
    "\n",
    "diagnosis_names = [tup[0] for tup in diagnosis_counter.most_common(10)]\n",
    "\n",
    "for diagnosis in diagnosis_names:\n",
    "    l = []\n",
    "    for hadm_id in diagnoses_df['HADM_ID']:\n",
    "        subset = output_B[output_B['HADM_ID'] == hadm_id]\n",
    "\n",
    "        subset = subset[subset['SHORT_TITLE'] == diagnosis]\n",
    "        \n",
    "        if len(subset) >= 1:\n",
    "            l.append(1)\n",
    "        else:\n",
    "            l.append(0)\n",
    "        \n",
    "    diagnoses_df[diagnosis] = l\n",
    "\n",
    "diagnoses_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "diagnoses_df.to_csv('diagnoses_df.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Baseline confounders gender and age\n",
    "\n",
    "Join on the gender and age of all of patients then drop all of the patients under 18 years of age. Update output_A as the csv with this additional information."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_A = pd.read_csv('output_A.csv')\n",
    "output_A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read the csv containing demographic information of the patient\n",
    "patients = pd.read_csv('../../MIMIC-III/PATIENTS.csv')\n",
    "patients"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# transform the F and M in GENDER to 0 and 1\n",
    "patients['gender'] = patients['GENDER'].map({'F': 0, 'M': 1})\n",
    "patients"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_A = output_A.merge(patients[['SUBJECT_ID', 'gender']], how='left', on='SUBJECT_ID')\n",
    "output_A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_A = output_A.merge(patients[['SUBJECT_ID', 'DOB']], how='left', on='SUBJECT_ID')\n",
    "output_A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_A['age'] = calculate_age(output_A['CHARTDATE'], output_A['DOB'])\n",
    "output_A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_A = pd.read_csv('output_A.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop all patients that are younger than 18 and greater than 100\n",
    "output_A = output_A[output_A['age'] > 18]\n",
    "output_A = output_A[output_A['age'] < 100]\n",
    "output_A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save the dataframe back into a csv file\n",
    "output_A.to_csv('output_A.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Notes Categories Investigation\n",
    "\n",
    "Investigate which categories we can try to subset for the output_A dataframe and still have a sufficiently large sample size."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_A = pd.read_csv('output_A.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "categories = set(output_A['CATEGORY'])\n",
    "categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pre-processing where we make a dictionary such that key is the hospital admission id and\n",
    "# value is all of the cateogires that they have\n",
    "hadm_categories = {}\n",
    "\n",
    "unique_hadms = list(set(output_A['HADM_ID']))\n",
    "\n",
    "for hadm in unique_hadms:\n",
    "    subset = output_A[output_A['HADM_ID'] == hadm]\n",
    "    hadm_categories[hadm] = list(set(subset['CATEGORY']))\n",
    "\n",
    "hadm_categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make a dictionary that saves the categories and resulting dataframe sizes\n",
    "save = {}\n",
    "\n",
    "categories = list(categories)\n",
    "\n",
    "for i in range(len(categories)):\n",
    "    for j in range(i+1, len(categories)):\n",
    "        category1 = categories[i]\n",
    "        category2 = categories[j]\n",
    "        cnt = 0\n",
    "        for hadm in hadm_categories:\n",
    "            if category1 in hadm_categories[hadm] and category2 in hadm_categories[hadm]:\n",
    "                cnt += 1\n",
    "\n",
    "        save[(category1, category2)] = cnt\n",
    "\n",
    "save = dict(sorted(save.items(), key=lambda item: item[1], reverse=True))\n",
    "\n",
    "save"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
