{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pickle\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Oct. 2, 2024\n",
    "\n",
    "This notebook iterates through all possible note pairings, subsets the data to patients who have clinical notes for both categories, and finds the most common words in the concatenated text data as well as in each category's text data separately. The results for every pairing are saved to `table_data.p`.\n",
    "\n",
    "Total folders:\n",
    "- ECG-Echo\n",
    "- ECG-Nursing\n",
    "- ECG-Radiology\n",
    "- Echo-Nursing\n",
    "- Echo-Radiology\n",
    "- Nursing-Radiology"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read ../diagnoses_df.csv into a dataframe and display it\n",
    "diagnoses_df = pd.read_csv(filepath_or_buffer=\"../diagnoses_df.csv\")\n",
    "diagnoses_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read ../output_A.csv into a dataframe (nothing is displayed by this cell)\n",
    "output_A = pd.read_csv(filepath_or_buffer=\"../output_A.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# keep only the HADM_ID, gender and age columns, with a single row per HADM_ID\n",
    "# (drop_duplicates keeps the first occurrence by default)\n",
    "output_A = (\n",
    "    output_A\n",
    "    .loc[:, ['HADM_ID', 'gender', 'age']]\n",
    "    .drop_duplicates(subset=['HADM_ID'])\n",
    ")\n",
    "output_A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# attach the columns of output_A (age and gender) to diagnoses_df; the inner join on\n",
    "# HADM_ID keeps only the rows whose HADM_ID appears in both dataframes\n",
    "UandC_data = pd.merge(diagnoses_df, output_A, how='inner', on='HADM_ID')\n",
    "UandC_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# every pairing of note categories that we want to compare\n",
    "pairs = [('ECG', 'Echo'), ('ECG', 'Nursing'), ('ECG', 'Radiology'), ('Echo', 'Nursing'), ('Echo', 'Radiology'), ('Nursing', 'Radiology')]\n",
    "\n",
    "# create a dictionary to store all of the data that we want to plot; it maps each\n",
    "# pair of note categories to a dictionary of results for that pair\n",
    "table_data = dict()\n",
    "\n",
    "for pair in pairs:\n",
    "    first_note, second_note = pair\n",
    "\n",
    "    # this dictionary will contain all of the data we want to plot for this pair of notes data\n",
    "    data = dict()\n",
    "\n",
    "    text_data_T_1 = pd.read_csv('../text_csv_files/text_data_' + first_note + '.csv')[['HADM_ID', first_note]]\n",
    "    text_data_T_2 = pd.read_csv('../text_csv_files/text_data_' + second_note + '.csv')[['HADM_ID', second_note]]\n",
    "\n",
    "    # combine the Us, Cs, and T_1 T_2 text data into one dataframe; the inner joins keep\n",
    "    # only the HADM_IDs that are present in UandC_data and in both text files\n",
    "    combined_data = UandC_data.merge(text_data_T_1, how='inner', on='HADM_ID')\n",
    "    combined_data = combined_data.merge(text_data_T_2, how='inner', on='HADM_ID')\n",
    "    n_docs = len(combined_data)\n",
    "\n",
    "    # use a tfidf vectorizer so that we can ignore words that appear in almost all of the\n",
    "    # notes or in very few of the notes. Integer min_df / max_df are absolute document\n",
    "    # counts, so a word is kept only if it occurs in at least ~10% and at most ~90% of the\n",
    "    # rows; max_features=5 then keeps the 5 most frequent of the remaining words.\n",
    "    # binary=False and use_idf=True by default\n",
    "    # min_df is clamped to 1 because int(0.1*n_docs) is 0 for fewer than 10 rows and an\n",
    "    # integer min_df of 0 is not a valid document count (recent scikit-learn versions\n",
    "    # reject it during parameter validation -- TODO confirm for the installed version)\n",
    "    vectorizer = TfidfVectorizer(max_features=5, min_df=max(1, int(0.1*n_docs)), max_df=int(0.9*n_docs))\n",
    "\n",
    "    # create a new column that represents the concatenation of the two columns of text data;\n",
    "    # the space separator stops the last word of the first note and the first word of the\n",
    "    # second note from being fused into a single token\n",
    "    combined_data[\"concated_text\"] = combined_data[first_note] + \" \" + combined_data[second_note]\n",
    "\n",
    "    # train the vectorizer on the concatenation of the two notes data\n",
    "    vectorizer.fit(combined_data[\"concated_text\"])\n",
    "    combined_words = vectorizer.get_feature_names_out()\n",
    "    print(first_note)\n",
    "    print(second_note)\n",
    "    print(\"combined text\", combined_words)\n",
    "    data[\"combined_text_words\"] = list(combined_words)\n",
    "\n",
    "    # create the bag of words matrices using the overlapping vocabulary: an entry is 1\n",
    "    # when the word occurs in the note and 0 otherwise\n",
    "    T_1 = (vectorizer.transform(combined_data[first_note]) > 0).astype(int)\n",
    "    T_2 = (vectorizer.transform(combined_data[second_note]) > 0).astype(int)\n",
    "\n",
    "    # print out the positivity ratio of each word, i.e. the fraction of notes that contain it\n",
    "    # (each matrix is densified and averaged only once)\n",
    "    T_1_column_averages = np.mean(T_1.toarray(), axis=0)\n",
    "    T_2_column_averages = np.mean(T_2.toarray(), axis=0)\n",
    "    print(\"T_1 column averages\", T_1_column_averages)\n",
    "    data[\"T_1_column_averages\"] = list(T_1_column_averages)\n",
    "    print(\"T_2 column averages\", T_2_column_averages)\n",
    "    data[\"T_2_column_averages\"] = list(T_2_column_averages)\n",
    "\n",
    "    # train the vectorizer on just the first notes data (refitting replaces the vocabulary)\n",
    "    vectorizer.fit(combined_data[first_note])\n",
    "    T_1_words = vectorizer.get_feature_names_out()\n",
    "    print(first_note, T_1_words)\n",
    "    data[\"T_1_words\"] = list(T_1_words)\n",
    "\n",
    "    # train the vectorizer on just the second notes data\n",
    "    vectorizer.fit(combined_data[second_note])\n",
    "    T_2_words = vectorizer.get_feature_names_out()\n",
    "    print(second_note, T_2_words)\n",
    "    data[\"T_2_words\"] = list(T_2_words)\n",
    "\n",
    "    table_data[pair] = data\n",
    "\n",
    "# save the results for all pairs; the with-block closes the file once the dump finishes\n",
    "with open(\"table_data.p\", \"wb\") as pickle_file:\n",
    "    pickle.dump(table_data, pickle_file)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "proximal_w_nlp",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
