{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import codecs\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "\n",
    "from constants import *\n",
    "import datasets\n",
    "import log_reg\n",
    "from dataproc import extract_wvs\n",
    "from dataproc import get_discharge_summaries\n",
    "from dataproc import concat_and_split\n",
    "from dataproc import build_vocab\n",
    "from dataproc import word_embeddings\n",
    "\n",
    "from nltk.tokenize import RegexpTokenizer\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "from collections import Counter, defaultdict\n",
    "import csv\n",
    "import operator\n",
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Filter to notes that are not effectively empty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Code lightly edited from that for the following paper:\n",
    "## \n",
    "## Adler Perotte, Rimma Pivovarov, Karthik Natarajan, Nicole Weiskopf, Frank Wood, Noemie Elhadad\n",
    "## Diagnosis Code Assignment: Models and Evaluation Metrics, JAMIA, 2013\n",
    "## \n",
    "## Columbia University\n",
    "## Biomedical Informatics\n",
    "## Author: Adler Perotte\n",
    "# Basically this removes some of the boilerplate parts of notes, and if nothing remains, exclude this note\n",
    "term_pattern = re.compile('[A-Za-z]+')\n",
    "# Bracketed spans ([...]) are blanked out before looking for real words\n",
    "bracket_pattern = re.compile(r'\\[[^\\]]+\\]')\n",
    "# Boilerplate labels (and, for the sign-off labels, everything after them up to the end of the\n",
    "# line) that should not count as note content. Compiled once and applied in this order.\n",
    "boilerplate_patterns = [re.compile(p, flags=re.I) for p in (\n",
    "    r'admission date:',\n",
    "    r'discharge date:',\n",
    "    r'date of birth:',\n",
    "    r'sex:',\n",
    "    r'service:',\n",
    "    r'dictated by:.*$',\n",
    "    r'completed by:.*$',\n",
    "    r'signed electronically by:.*$',\n",
    ")]\n",
    "\n",
    "# The tokenization cell below reads MIMIC_FILTERED_DSUMS as latin-1, so use the same single-byte\n",
    "# encoding for both files here: every byte round-trips unchanged, and the cell no longer depends\n",
    "# on (or crashes under) the locale's default encoding.\n",
    "with open('%s/MIMIC_RAW_DSUMS' % (MIMIC_2_DIR), 'r', encoding='latin-1') as f:\n",
    "    with open('%s/MIMIC_FILTERED_DSUMS' % (MIMIC_2_DIR), 'w', encoding='latin-1') as f2:\n",
    "        for line in f:\n",
    "            # the note text is the 7th pipe-delimited field\n",
    "            raw_dsum = bracket_pattern.sub(' ', line.split('|')[6])\n",
    "            for pattern in boilerplate_patterns:\n",
    "                raw_dsum = pattern.sub(' ', raw_dsum)\n",
    "\n",
    "            # Determine if this DSUM should stay: keep it only if at least one alphabetic token\n",
    "            # longer than one character remains, and write the original line unchanged\n",
    "            if any(len(token) > 1 for token in term_pattern.findall(raw_dsum)):\n",
    "                f2.write(line)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenize/preprocess raw text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# A token is a maximal run of word characters; everything else acts as a separator\n",
    "tokenizer = RegexpTokenizer(pattern=r'\\w+')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "22815it [00:31, 733.09it/s]\n"
     ]
    }
   ],
   "source": [
    "# Turn each filtered note into one csv row of lower-cased, space-joined tokens.\n",
    "with codecs.open('%s/MIMIC_FILTERED_DSUMS' % MIMIC_2_DIR, 'r', encoding='latin-1') as f:\n",
    "    # The csv module expects files it writes to be opened with newline=''; without it,\n",
    "    # platforms that translate line endings add a stray carriage return to every row.\n",
    "    with open('%s/proc_dsums.csv' % MIMIC_2_DIR, 'w', newline='') as of:\n",
    "        r = csv.reader(f, delimiter='|')\n",
    "        #header\n",
    "        next(r)\n",
    "        w = csv.writer(of)\n",
    "        w.writerow(['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'TEXT', 'LABELS'])\n",
    "        for row in tqdm(r):\n",
    "            # the raw note encodes line breaks as the literal marker [NEWLINE]\n",
    "            note = row[6].replace('[NEWLINE]', '\\n')\n",
    "            # lower-case every token and drop the purely numeric ones\n",
    "            tokens = [t.lower() for t in tokenizer.tokenize(note) if not t.isnumeric()]\n",
    "            text = ' '.join(tokens)\n",
    "            # codes arrive comma-separated; store them semicolon-separated\n",
    "            codes = row[5].replace(',', ';')\n",
    "            w.writerow([row[0], row[1], row[2], text, codes])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Split data using the given IDs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def _read_indices(path):\n",
    "    \"\"\"Return the set of integers listed one per line in the file at `path`; blank lines are skipped.\"\"\"\n",
    "    with open(path) as f:\n",
    "        # int() tolerates surrounding whitespace, so only empty lines need filtering\n",
    "        return {int(line) for line in f if line.strip()}\n",
    "\n",
    "train_ids = _read_indices('%s/training_indices.data' % MIMIC_2_DIR)\n",
    "test_ids = _read_indices('%s/testing_indices.data' % MIMIC_2_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Route every processed note to the train or test file according to its row position.\n",
    "# All three files are handed to the csv module, which expects them opened with newline=''.\n",
    "with open('%s/proc_dsums.csv' % MIMIC_2_DIR, 'r', newline='') as nf:\n",
    "    with open('%s/test_dsums.csv' % MIMIC_2_DIR, 'w', newline='') as test_f:\n",
    "        with open('%s/train_dsums.csv' % MIMIC_2_DIR, 'w', newline='') as train_f:\n",
    "            r = csv.reader(nf, delimiter=',')\n",
    "            test_w = csv.writer(test_f)\n",
    "            train_w = csv.writer(train_f)\n",
    "            #header\n",
    "            header = next(r)\n",
    "            #don't need chart time\n",
    "            del header[2]\n",
    "            test_w.writerow(header)\n",
    "            train_w.writerow(header)\n",
    "            # i is the zero-based position of the row among the data rows (header excluded)\n",
    "            for i, row in enumerate(r):\n",
    "                #don't need chart time\n",
    "                del row[2]\n",
    "                if i in train_ids:\n",
    "                    train_w.writerow(row)\n",
    "                elif i in test_ids:\n",
    "                    test_w.writerow(row)\n",
    "                # rows whose position is in neither id set are not written anywhere"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create vocabulary from training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "reading in data...\n",
      "removing rare terms\n",
      "30688 terms qualify out of 77895 total\n",
      "writing output\n"
     ]
    }
   ],
   "source": [
    "# The vocabulary is built from the training split only\n",
    "train_dsums_path = '%s/train_dsums.csv' % MIMIC_2_DIR\n",
    "vocab_path = '%s/vocab.csv' % MIMIC_2_DIR\n",
    "# NOTE(review): the leading 3 is presumably the minimum occurrence count for a term to be kept\n",
    "# (the log says rare terms are removed) -- confirm in dataproc/build_vocab\n",
    "vfile = build_vocab.build_vocab(3, train_dsums_path, vocab_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sort by length to get final data ready for models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# For each split: count the whitespace-separated tokens of every note, drop notes with at most\n",
    "# one token, and write the rest ordered by length (the 'length' column is kept in the output).\n",
    "for split in ('train', 'test'):\n",
    "    df = pd.read_csv('%s/%s_dsums.csv' % (MIMIC_2_DIR, split))\n",
    "    # Address the text column by name instead of by position inside a row-wise apply:\n",
    "    # integer keys on a string-labelled row rely on a deprecated positional fallback, and the\n",
    "    # vectorised form also works when the frame is empty. Missing text counts as length 0.\n",
    "    df['length'] = df['TEXT'].fillna('').astype(str).str.split().str.len()\n",
    "    df = df[df['length'] > 1]\n",
    "    df = df.sort_values(['length'])\n",
    "    df.to_csv('%s/%s.csv' % (MIMIC_2_DIR, split), index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pre-train word embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "building word2vec vocab on /nethome/jmullenbach3/replication/cnn-medical-text/mimicdata/mimic2//proc_dsums.csv...\n",
      "training...\n",
      "writing embeddings to /nethome/jmullenbach3/replication/cnn-medical-text/mimicdata/mimic2//processed_full.w2v\n"
     ]
    }
   ],
   "source": [
    "# Embeddings are trained on every processed note (proc_dsums.csv), not only the training split\n",
    "proc_dsums_path = '%s/proc_dsums.csv' % MIMIC_2_DIR\n",
    "# NOTE(review): 100, 3 and 5 are passed positionally; see dataproc/word_embeddings for what\n",
    "# each of them controls before changing them\n",
    "w2v_file = word_embeddings.word_embeddings('full', proc_dsums_path, 100, 3, 5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save embeddings to be read in and used to initialize embedding layers later"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import gensim\n",
    "# Load the saved word2vec model from the data directory\n",
    "w2v_model_path = '%s/processed_full.w2v' % MIMIC_2_DIR\n",
    "model = gensim.models.Word2Vec.load(w2v_model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Only the word vectors are used from here on, so drop the reference to the full model\n",
    "wv = model.wv\n",
    "del model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "# vocab.csv is read as one term per line (trailing whitespace stripped)\n",
    "with open('%s/vocab.csv' % MIMIC_2_DIR, 'r') as f:\n",
    "    vocab = {line.rstrip() for line in f}\n",
    "# Indices are assigned in sorted term order and start at 1, so 0 is never used for a term\n",
    "ind2w = {i+1:w for i,w in enumerate(sorted(vocab))}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 30688/30688 [01:02<00:00, 491.23it/s]\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): build_matrix lives in dataproc/extract_wvs (not shown here); presumably W holds\n",
    "# one embedding row per entry of `words`, in the same order -- confirm there\n",
    "W, words = extract_wvs.build_matrix(ind2w, wv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "embed_path = '%s/processed_full.embed' % MIMIC_2_DIR\n",
    "with open(embed_path, 'w') as f:\n",
    "    # one line per word: the word itself, then the components of row idx of W, space-separated\n",
    "    for idx, word in enumerate(words):\n",
    "        fields = [word] + [str(d) for d in W[idx]]\n",
    "        f.write(\" \".join(fields) + \"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write description vectors with vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# code -> description lookup; codes that were never stored default to the empty string\n",
    "desc_dict = defaultdict(str)\n",
    "with open('%s/MIMIC_ICD9_mapping' % MIMIC_2_DIR, 'r') as f:\n",
    "    mapping_rows = csv.reader(f)\n",
    "    # skip the header row\n",
    "    next(mapping_rows)\n",
    "    for fields in mapping_rows:\n",
    "        # column 1 is used as the code and column 2 as its description\n",
    "        code, description = fields[1], fields[2]\n",
    "        desc_dict[str(code)] = str(description)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Invert ind2w so that each word maps back to its index\n",
    "w2ind = dict((word, idx) for idx, word in ind2w.items())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 7042/7042 [00:00<00:00, 50081.62it/s]\n"
     ]
    }
   ],
   "source": [
    "# Write each code followed by the vocabulary indices of the tokens in its description.\n",
    "# The csv module expects files it writes to be opened with newline=''.\n",
    "with open('%s/description_vectors.vocab' % MIMIC_2_DIR, 'w', newline='') as of:\n",
    "    w = csv.writer(of, delimiter=' ')\n",
    "    w.writerow([\"CODE\", \"VECTOR\"])\n",
    "    # every out-of-vocabulary token maps to the single index just past the vocabulary\n",
    "    unk_ind = len(w2ind) + 1\n",
    "    for code, desc in tqdm(desc_dict.items()):\n",
    "        # same normalisation as the notes: lower-case tokens, purely numeric ones dropped\n",
    "        tokens = [t.lower() for t in tokenizer.tokenize(desc) if not t.isnumeric()]\n",
    "        inds = [w2ind.get(t, unk_ind) for t in tokens]\n",
    "        w.writerow([code] + [str(i) for i in inds])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
