{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[This is a sentence.,  Dr. ___ thought this., upper lobe 2.5 cm -- ?, aspiration., Talked to at 9:35 a.m. 1., abc., 2., def.]\n"
     ]
    }
   ],
   "source": [
    "from spacy.lang.en import English\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "# Blank English pipeline with only the rule-based sentencizer attached;\n",
    "# no trained model is loaded here.\n",
    "nlp = English()\n",
    "nlp.add_pipe(\"sentencizer\")\n",
    "\n",
    "# Smoke-test the sentence splitter on a short report-like sample.\n",
    "sample_text = \"This is a sentence.  Dr. ___ thought this. upper lobe 2.5 cm -- ? aspiration. Talked to at 9:35 a.m. 1. abc. 2. def.\"\n",
    "doc = nlp(sample_text)\n",
    "print([sent for sent in doc.sents])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import pandas\n",
    "# df = pandas.read_csv(\"../../mimic_unique.csv\", delimiter=\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print(df.head())\n",
    "# print(len(df))\n",
    "# # print(df.count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split(val):\n",
    "    \"\"\"Split one report into its usable sentences.\n",
    "\n",
    "    Args:\n",
    "        val: Positionally indexed ``(filename, report_text)`` record, e.g. a\n",
    "            tuple or a Spark ``Row``. ``report_text`` may be ``None`` or empty.\n",
    "\n",
    "    Returns:\n",
    "        ``(filename, sentences)`` where ``sentences`` is a list of ``str``.\n",
    "        Sentences that contain an underscore or are at most 5 characters\n",
    "        long are dropped; double quotes are removed from the kept ones.\n",
    "    \"\"\"\n",
    "    fname, rep = val[0], val[1]\n",
    "    # A missing or empty report has no sentences. Returning early keeps None\n",
    "    # away from the pipeline, which would otherwise raise inside the mapper.\n",
    "    if not rep:\n",
    "        return fname, []\n",
    "    # `nlp` is the module-level spaCy pipeline built in the first cell.\n",
    "    sentences = [str(sent) for sent in nlp(rep).sents]\n",
    "    pruned = [\n",
    "        sent.replace('\"', '')\n",
    "        for sent in sentences\n",
    "        # NOTE(review): '_' presumably marks de-identified blanks ('___') -- confirm.\n",
    "        if '_' not in sent and len(sent) > 5\n",
    "    ]\n",
    "    return fname, pruned"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# newdf = pandas.DataFrame(columns=[\"Old Index\", \"filename\", \"Report Impression Sentence\"])\n",
    "# unique_sents = set()\n",
    "# counter = 0\n",
    "# for i in tqdm(df.itertuples(), total=len(df)):\n",
    "#     idx, old_idx, fname, rep = i[0], i[1], i[2], i[3]\n",
    "#     sentences = list(nlp(rep).sents)\n",
    "#     for sent in sentences:\n",
    "#         unique_sents.add(str(sent))\n",
    "#         row = {'Old Index': old_idx, 'filename': fname, 'Report Impression Sentence': str(sent)}\n",
    "#         newdf = newdf.append(row, ignore_index = True)\n",
    "#     if counter % 10000 == 0:\n",
    "#         newdf.to_csv(\"sentences.csv\", index=False)\n",
    "#     counter += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from pyspark import SparkConf, SparkContext\n",
    "from pyspark.sql import SparkSession\n",
    "# getOrCreate() reuses an already-running context, so re-running this cell\n",
    "# does not fail with \"Cannot run multiple SparkContexts at once\".\n",
    "sc = SparkContext.getOrCreate(conf=SparkConf())\n",
    "spark = (\n",
    "    SparkSession.builder\n",
    "    .appName(\"sentence segmentation\")\n",
    "    .getOrCreate()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Row(filename='s57146986.txt', impression='   As compared to the previous radiograph, there is no relevant change.  Normal size of the cardiac silhouette.  No pulmonary edema.  No pleural effusions.  No focal parenchymal opacity suggesting pneumonia.  Normal appearance of the hilar and mediastinal structures.  IMPRESSION:   No acute cardiopulmonary process.')\n",
      "320027\n"
     ]
    }
   ],
   "source": [
    "# Read the reports CSV (its first line is the header) into a Spark DataFrame.\n",
    "findings_csv = \"/deep/group/data/med-data/mimic_train_findings.csv\"\n",
    "sdf = spark.read.csv(findings_csv, header=True)\n",
    "\n",
    "# Sanity check: show the first row and the total number of rows.\n",
    "first_row = sdf.head()\n",
    "print(first_row)\n",
    "print(sdf.count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import explode\n",
    "# Run `split` on every row, giving (filename, [sentence, ...]) pairs.\n",
    "sents = sdf.rdd.map(split)\n",
    "sentence_lists_df = spark.createDataFrame(sents, ['filename', 'sentence'])\n",
    "# explode() emits one output row per list element; the exploded column gets\n",
    "# Spark's default name `col`.\n",
    "new_sent_df = sentence_lists_df.select(\n",
    "    sentence_lists_df.filename,\n",
    "    explode(sentence_lists_df.sentence),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Row(filename='s57146986.txt', col='   As compared to the previous radiograph, there is no relevant change.'), Row(filename='s57146986.txt', col=' Normal size of the cardiac silhouette.'), Row(filename='s57146986.txt', col=' No pulmonary edema.'), Row(filename='s57146986.txt', col=' No pleural effusions.'), Row(filename='s57146986.txt', col=' No focal parenchymal opacity suggesting pneumonia.'), Row(filename='s57146986.txt', col=' Normal appearance of the hilar and mediastinal structures.'), Row(filename='s57146986.txt', col=' IMPRESSION:   No acute cardiopulmonary process.'), Row(filename='s57146986.txt', col='   As compared to the previous radiograph, there is no relevant change.'), Row(filename='s57146986.txt', col=' Normal size of the cardiac silhouette.'), Row(filename='s57146986.txt', col=' No pulmonary edema.')]\n"
     ]
    }
   ],
   "source": [
    "# Peek at the first ten exploded (filename, sentence) rows.\n",
    "preview_rows = new_sent_df.take(10)\n",
    "print(preview_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse to a single partition before writing under \"sentence_dir\".\n",
    "# NOTE(review): no save mode is set, so Spark's default applies -- confirm\n",
    "# what should happen when the directory already exists.\n",
    "single_partition_df = new_sent_df.coalesce(1)\n",
    "single_partition_df.write.csv(\"sentence_dir\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Row(filename='s57146986.txt', col='   As compared to the previous radiograph, there is no relevant change.'), Row(filename='s57146986.txt', col=' Normal size of the cardiac silhouette.'), Row(filename='s57146986.txt', col=' No pulmonary edema.'), Row(filename='s57146986.txt', col=' No pleural effusions.'), Row(filename='s57146986.txt', col=' No focal parenchymal opacity suggesting pneumonia.'), Row(filename='s57146986.txt', col=' Normal appearance of the hilar and mediastinal structures.'), Row(filename='s57146986.txt', col=' IMPRESSION:   No acute cardiopulmonary process.'), Row(filename='s57146986.txt', col='   As compared to the previous radiograph, there is no relevant change.'), Row(filename='s57146986.txt', col=' Normal size of the cardiac silhouette.'), Row(filename='s57146986.txt', col=' No pulmonary edema.')]\n"
     ]
    }
   ],
   "source": [
    "# Re-display the first ten rows after the write.\n",
    "rows_after_write = new_sent_df.take(10)\n",
    "print(rows_after_write)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "522c9f18647d414aa4618b88cb8884c0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=187673.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "264000\n"
     ]
    }
   ],
   "source": [
    "# Materialise the per-report sentence lists on the driver. `sents` is the RDD\n",
    "# of (filename, sentences) pairs produced by `split` in an earlier cell.\n",
    "tempo_df = sents.collect()\n",
    "\n",
    "unique_sents = set()\n",
    "for _fname, sentences in tqdm(tempo_df, total=len(tempo_df)):\n",
    "    # In-place update avoids rebuilding the whole set for every report.\n",
    "    unique_sents.update(sentences)\n",
    "print(len(unique_sents))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['s', 'S', '?', '.', 'I', 'A', '6', 'He', 'PE', 'No', 'An', 'hh', 'Or', '3.', 'It', 'AM', '1.', '4.', 'pm', 'MD', '2.', 'Dx', '6.', \"'s\", 'am', '2)', 'NG', 'E.', '5.', '7.', 'Is', 'Ill', 'CHF', 'At.', 'The', 'It.', 'All', 'New', '\"1.', 'And', 'No.', 'He.', 'am.', 'pm.', '___', 'End', 'Are', 'Ref.', 'One.', 'The.', '2. ?', 'ETT.', 'CHF.', '3. ?', '4) ?', '___,', 'Mild', 'What', 'A.m.', 'Item', 'COPD', '___.', '2) ?', 'Were', 'P.m.', 'Stop', '3) ?', 'Left', '4.___', 'COPD.', 'UZRD.', 'None.', 'bulla', 'P ___', 'Mass.', 'Whole', 'Zones', 'Ileus', 'Sixth', 'sixth', 'Edema.', 'Ileus.', 'Goiter', 'Above.', 'Insert', 'Would.', 'Drain.', '2. ___', 'Essed.', '1. ___', 'ASCVD.', 'Venous', 'Per CT', 'Slight', '12:31.', 'Signed', 'GOITER', 'No TB.', 'Should', '4. ___', 'Normal', 'Number', '3. ___', 'Little', 'Errant', 'Its is', 'COPD 2.', 'Stomach', 'Stable.', 'MD, MPH', '3-4 cm.', 'However', 'ET tube', 'Normal.', 'on ___.', 'No left', '2) CHF.', 'No CHF.', '2. ___%', 'All all', 'Film 2.', 'COPD 3.', 'Either.', 'Mid SVC', 'No ptx.', 'MD, PhD', 'Sign of', 'This is', '1. ___%', 'DR. 
___', 'residual', 'AS ABOVE', 'Since at', 'Sign rib', 'Limited.', 'ET tube.', 'Are new.', 'Normal .', 'ETT low.', 'No chain', 'on left.', '6 6 spot', '1) COPD.', 'hyperinf', 'As above', 'Ascites.', 'No other', 'No CHF .', 'Failure.', 'HISTORY.', 'The left', 'unlikely.', 'Persists.', 'It is ___', 'caused by', 'Yet still', 'by phone.', 'At 10:11.', 'No scars.', 'Mild CHF.', 'NO EDEMA.', 'Were were', 'No edema.', 'NO CHANGE', 'The left.', 'Pneumonia', 'Recently.', 'Otherwise', 'LUL mass.', 'scarring.', 'intubated', 'Vertebral', 'AS ABOVE.', 'No change', 'As above.', 'Unchanged', 'Emphysema', 'The right', 'pm on ___', 'emphesema', 'ANCHORS 2', 'By dr ___', 'by Dr ___', 'artifact.', 'Continued', 'MD ___=___', 'Is stable.', 'Scoliosis.', 'Pneumonia.', 'was paged.', 'criteria).', 'MD CC: ___', 'one week).', 'See above.', 'Increased.', 'Effusions.', 'ET tube in', 'The large.', 'Status quo', 'Since ___.', 'Emphysema.', 'No change.', 'The report', '2) No CHF.', 'Post CABG.', 'Doubt CHF.', 'Mild COPD.', 'Is normal.', 'AS ABOVE..', 'technical.', 'Heart size', 'A page for', 'As above..', 'Is images.', 'Gallstone.', 'unchanged.', 'Unchanged.', 'No pleural', 'EMPHYSEMA.', 'Lung clear.', 'AICD noted.', 'Gallstones.', 'MD ___=W___', 'No failure.', 'Osteopenia.', 'Mild edema.', 'Atelectasis', 'No tension.', 'Sternotomy.', 'Left lower.', 'No effusion', 'Are stable.', 'No empyema.', 'aspiration.', 'TAVR noted.', 'WET READ !!', 'No edema. .', 'COPD noted.', 'Post TEVAR.', 'Is chronic.', 'Impression.', 'Severe COPD', 'Left clear.', 'CT advised.', 'Known COPD.', 'Extubation.', 'No evidence', 'No fracture', '___ images.', 'No nodules.', 'Lungs clear', 'ETT is low.', 'Status quo.', 'No pleural.', 'atelectasis', 'Is present.', 'Stable exam', 'Mild CHF. 
.', 'TIPS noted.', 'were paged.', 'Sign report', '___ to ___.', 'Clear lungs', 'Bronchitis.', 'Prior CABG.', 'New trachea', 'CT pending.', 'Minimal CHF.', 'Likely COPD.', 'Stable COPD.', 'Low ET tube.', 'a few weeks.', 'Normal heart', 'Study of ___', 'T AVR noted.', 'Emphysema 2.', 'Was removed.', 'Status post.', 'Normal chest', 'No free air.', 'No scarring.', 'Nopneumonia.', 'Recommend CT', 'New wall new', 'atelectasis.', 'On the left.', 'Severe ileus', 'CTA advised.', 'MVR is noted', 'CHECK PRIORS', 'Cardiomegaly', 'am by Dr ___', 'abnormality.', 'Consider CT.', 'positioning.', 'Is in place.', 'Normal lungs', 'No pnemonia.', 'was not ___.', 'Lung nodule.', 'Lungs clear.', 'Is constant.', 'Nasogastric.', 'Clear chest.', '3) Emphysema', 'Atelectasis.', 'No effusion.', 'Through ___.', 'pm by Dr ___', 'Marked COPD.', 'It is small.', 'Emphysema 3.', 'No pneumonia', 'No fracture.', 'Chest clear.', 'Pneumobilia.', 'Stable exam.', 'Severe COPD.', 'Also stable.', 'LUNGS CLEAR.', 'Is a stable.', 'Clear lungs.', 'Normal exam.', 'Improvement.', 'd/w Dr. 
___.', 'No fibrosis.', '3) Pneumonia.', 'Lungs grossly', 'worsened CHF.', 'Similar size.', 'Chest CT ___.', 'NGT be knees.', 'ETT not seen.', 'Normal chest.', 'CARDIOMEGALY.', 'increased CHF', 'Splenomegaly.', 'Normal study.', 'Pneumothorax.', 'No emphysema.', 'Cardiomegaly.', 'ETT too high.', 'This opacity.', 'Demonstrated.', 'Nipple rings.', 'Probably COPD', 'Chest normal.', 'And severity.', 'Tracheostomy.', 'Is unchanged.', 'NO PNEUMONIA.', 'no infiltrate', 'Improved CHF.', 'No tamponade.', 'No effusions.', 'No infiltrate', 'No PICC seen.', 'Little change', 'New mild CHF.', 'No overt CHF.', 'No pneumonia,', 'Resolved CHF.', 'Stable chest.', 'VIA TELEPHONE', 'No lung mass.', 'Has resolved.', '___ into ___.', 'As above peer', 'Right jugular', 'Tips in situ.', 'Limited exam.', 'decreased CHF', 'Increased CHF', 'The replaced.', 'AV is images.', 'No penumonia.', 'Worsened CHF.', 'Sign spur and', 'Recent T AVR.', 'Newly placed.', 'CABG changes.', 'A sign report', 'No fractures.', 'Than earlier.', 'Hiatal hernia', 'cardiomegaly.', 'NGT not seen.', 'Normal x-ray.', 'IABP in-situ.', 'New PICC line', 'Mild failure.', 'Unremarkable.', 'The left lung', 'No pneumonia.', 'Heart normal.', 'No Pneumonia.', 'Probable CHF.', 'And bibasilar', 'Lung volumes.', 'TUBE MIDLINE.', 'was paged for', 'Residual CHF.', 'POSSIBLE COPD.', 'As above. ___,', 'There is again', 'Rib fractures.', 'No hemothorax.', 'LLL pneumonia.', 'Cause unknown.', 'Increased CHF.', 'AICD in place.', 'Cleared edema.', 'Probable ileus', '___ in to ___.', 'No chest mass.', 'Resolving CHF.', 'New pacemaker.', 'Cardiomegaly .', 'Has developed.', 'Sternal wires.', 'No evidence of', 'fluid overload', 'Team informed.', 'Hyperinflation', 'Signed reports', 'Now with small', 'Limited study.', 'Decreased CHF.', 'TAVR is noted.', '6 new 6 dx 8 2', 'Some clearing.', 'No new masses.', 'TAVR in place.', 'Hiatal hernia.', 'TAVR and seen.', 'Probably COPD.', 'No aspiration.', 'Dr. 
___ paged.', 'No infiltrate.', 'Little change.', 'Old healed rib', '3) ICD device.', 'LVAD in place.', 'Worsening CHF.', 'No infiltrates', 'Dr. ___ aware.', 'No comparison.', 'In the larynx.', 'TIPS is noted.', 'are again seen', 'Moderate COPD.', 'Study if draft', 'Improving CHF.', 'PICC in place.', 'No adenopathy.', 'Fluid overload', 'Left effusion.', 'The radiograph', '3) IVC filter.', 'Fluid overlad.', 'No free air up', 'improving CHF.', 'Has increased.', 'new since ___.', 'Right PIC line', 'Left is clear.', 'Little overall', 'Continued CHF.', 'At expiration.', 'Tracheomegaly.', 'Possible COPD.', 'Hiatus hernia.', 'enteric tube .', 'Clips in situ.', 'NO COMPARISON.', 'MD CC: DR. ___', 'Probable COPD.', 'HETEROGENEOUS.', 'Dobbhoff tube.', 'Rest as above.', 'In appearance.', 'Are unchanged.', 'Stable extent.', 'Heart failure.', 'Mild scarring.', 'Bronchiectasis.', 'Dextrascoliosis', 'Azygos fissure.', 'pm with Dr. ___', 'No injury seen.', 'Cholelithiasis.', 'T AVR in place.', 'Unchanged exam.', 'Left pneumonia.', 'Lungs are clear', 'No new opacity.', 'The nasogastric', 'Trachea normal.', 'No frank edema.', 'No overt edema.']\n"
     ]
    }
   ],
   "source": [
    "# Order the distinct sentences from shortest to longest and show the 500\n",
    "# shortest ones.\n",
    "temp = sorted(unique_sents, key=len)\n",
    "shortest = temp[:500]\n",
    "print(shortest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
