{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9c0a836",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "import json\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm\n",
    "import os\n",
    "\n",
    "tqdm.pandas()\n",
    "\n",
    "report_path = Path('/data/FOLDER/FOLDER/data/mimic-cxr-reports/files/')\n",
    "root_dir = Path('/data/FOLDER/FOLDER/data/MIMIC-CXR-JPG')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b51285ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "metadata = os.path.join(root_dir, 'mimic-cxr-2.0.0-metadata.csv.gz')\n",
    "label_path =os.path.join(root_dir, 'mimic-cxr-2.0.0-chexpert.csv.gz')\n",
    "split_path = os.path.join(root_dir, 'mimic-cxr-2.0.0-split.csv.gz')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "caa6025a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# df = pd.read_csv(metadata)\n",
    "df_split = pd.read_csv(split_path)\n",
    "df_label = pd.read_csv(label_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8bf93764",
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = ['Atelectasis', 'Cardiomegaly',\n",
    "       'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture',\n",
    "       'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion',\n",
    "       'Pleural Other', 'Pneumonia', 'Pneumothorax']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a9dd6ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_label_set(x):\n",
    "    certain_yes = x[labels][x[labels] == 1].index\n",
    "    certain_no = x[labels][x[labels] == -1].index\n",
    "    uncertain = x[labels][x[labels] == 0].index\n",
    "    return [f'certain_yes_{i}' for i in certain_yes ] +  [f'certain_no_{i}'   for i in certain_no ] +  [ f'uncertain_{i}' for i in uncertain ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d8bf6e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_label['cat_labels_text'] = df_label.progress_apply(lambda x: get_label_set(x), axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1d91768",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_labels = [i + j for i in ['certain_yes_', 'certain_no_', 'uncertain_'] for j in labels]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4dc0d3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "mapping = {\n",
    "    j:i for i, j in enumerate(all_labels)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df2e40d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_label['cat_labels'] = df_label['cat_labels_text'].progress_apply(lambda x: [mapping[i] for i in x])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0450e625",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_label[\"reportfilename\"] = df_label.progress_apply(lambda row: os.path.join(report_path,\n",
    "                    'p{}/'.format(str(row.subject_id)[:2])+'p{}/'.format(row.subject_id) + 's{}.txt'.format(row.study_id)), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7fb9369",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_label['sentence_long'] = df_label.progress_apply(lambda x: Path(x['reportfilename']).open('r').read().strip(), axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c34b151",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_label = df_label.merge(df_split, on = ['subject_id', 'study_id'], how = 'inner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a3c405c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_label.loc[df_label.split == 'validate', 'split'] = 'val'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9b8bccf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import section_parser as sp\n",
    "custom_section_names, custom_indices = sp.custom_mimic_cxr_rules()\n",
    "\n",
    "def list_rindex(l, s):\n",
    "    \"\"\"Helper function: *last* matching element in a list\"\"\"\n",
    "    return len(l) - l[-1::-1].index(s) - 1\n",
    "\n",
    "def parse(study_id, text):\n",
    "    if study_id in custom_indices:\n",
    "        idx = custom_indices[study_id]\n",
    "        return 'IMPRESSION: \\n'+  text[idx[0]:idx[1]]+ '\\n'\n",
    "    \n",
    "    sections, section_names, section_idx = sp.section_text(\n",
    "        text\n",
    "    )\n",
    "    \n",
    "    if study_id in custom_section_names:\n",
    "        sn = custom_section_names[study_id]\n",
    "        idx = list_rindex(section_names, sn)\n",
    "        return 'IMPRESSION: \\n'+ sections[idx].strip()+ '\\n'\n",
    "\n",
    "    study_sections = {}\n",
    "    for sn in ('impression', 'findings',\n",
    "               'last_paragraph', 'comparison'):\n",
    "        if sn in section_names:\n",
    "            idx = list_rindex(section_names, sn)\n",
    "            study_sections[sn] = sections[idx].strip()\n",
    "    \n",
    "    ret_text = ''\n",
    "    if 'findings' in study_sections:\n",
    "        ret_text += 'FINDINGS:\\n' +  study_sections['findings'] + '\\n\\n'\n",
    "        \n",
    "    if 'impression' in study_sections:\n",
    "        ret_text += 'IMPRESSION:\\n' +  study_sections['impression'] + '\\n'\n",
    "        \n",
    "    if 'impression' not in study_sections and 'last_paragraph' in study_sections:\n",
    "        ret_text += 'IMPRESSION:\\n' +  study_sections['last_paragraph'] + '\\n'\n",
    "        \n",
    "    return ret_text   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e6ba176",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_label['sentence'] = df_label.progress_apply(lambda x: parse('s'+ str(x['study_id']), x['sentence_long']), axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5ba6476c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_label.to_pickle(root_dir/'multimodal_mislabel_split.pkl')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
