{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "import os\n",
    "#set the openai env variable here"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "2845ea355413d4cf"
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [],
   "source": [
    "api_version = \"2024-05-01-preview\"\n",
    "from langchain_openai import AzureChatOpenAI\n",
    "\n",
    "model_kwargs = dict(\n",
    "    model=\"gpt-4-128k\",\n",
    "    azure_endpoint=\"endpoint here\",\n",
    "    api_key=\"key here\",\n",
    "    api_version=api_version,\n",
    "    temperature=0.0,\n",
    ")\n",
    "gpt_model = AzureChatOpenAI(**model_kwargs, cache=False)"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "ab0df757-960d-47d7-8349-7b377f7e9422"
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a69b418c-53b2-4661-9e34-ff7d9179295e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Any, Dict, Iterator, List, Optional, Sequence, Union"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b08e5a3-1448-4331-b164-f434c85b8332",
   "metadata": {},
   "source": [
    "### Close-ended"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "60752342-bb44-4d4a-9d14-26341e4eb622",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_msg_close = \"\"\" You are provided with the clinical report about a medical image. You task is to synthesize a set of new close-ended QA pairs according to the requirements. Unfortunately, you don’t have access to the actual image. Below are requirements for generating the questions and answers in the conversation.\n",
    "- Do not use phrases like \"mentioned\", \"report\", \"context\" in the conversation. Instead, refer to the information as being \"in the image.\"\n",
    "- Answer responsibly within the given information and knowledge context, avoiding information not included in the given context.\n",
    "- Make sure your generation contains a key \"type\" for each QA pair, indicating its question type (as detailed in the following part).\n",
    "- Make the questions more diverse.\n",
    "- The ground truth type can be \"yes or no\" or one choice from multi-choice (you need to synthesize several choices in this case), condition on the question and available materials.\n",
    "\n",
    "Here are a set of question types for your generation, you must assign each new QA pair a question type.\n",
    "type_1: Anatomical Hallucination. Example questions: \"Which part of the body does this image belong to?\" \"Does the picture contain liver?\"\n",
    "type_2: Measurement Hallucination like location, size. Example questions: \"Where is the liver?\"\n",
    "type_3: Symptom-Based Hallucination. Example questions: \"Is the lung healthy?\" \"Is there evidence of a pneumothorax?\" \"Is there a fracture?\"\n",
    "type_4: Technique Hallucination. Example questions:  \"What modality is used to take this image? \"\n",
    "\n",
    "Instructions:\n",
    "- Add one key to each QA pair, key= \"ground_truth_type\", value= \"binary\" if the type is \"yes or no\" else \"multi-choice\"\n",
    "- When you see diagnosis information of a disease (e.g. lung cancer) in QA pairs, you should generate new QA pair by asking the symptoms of the disease \n",
    "- For \"multi-choice\" type QA, you must include one key \"choices\" of string type.\n",
    "- Avoid the question that you can not generate a ground truth, for example avoid the answer \"The image does not provide information\". \n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "37e43079-0622-413f-9845-1e3f0bf00cd4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.output_parsers import PydanticOutputParser\n",
    "from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate\n",
    "from pydantic import BaseModel, Field\n",
    "import json\n",
    "from typing import Any, Dict, Iterator, List, Optional, Sequence, Union\n",
    "\n",
    "class Closed_QA(BaseModel):\n",
    "    Q: str = Field(description=\"The question (Q)\")\n",
    "    A: str = Field(description=\"The answer (A)\")\n",
    "    type: str = Field(description=\"The QA type\")\n",
    "    choices: str = Field(default=\"\", description=\"The QA choices for multi-choice question type\")\n",
    "    ground_truth_type: str = Field(description=\"The ground_truth type\")\n",
    "    \n",
    "\n",
    "class Closed_QueryAnswer(BaseModel):\n",
    "    qa_pairs: List[Closed_QA] = Field(description=\"The QA pairs list generated.\")\n",
    "#Here we use the classic parser in LangChain: Pydantic, to ensure a strict parsing process\n",
    "closed_parser = PydanticOutputParser(pydantic_object=Closed_QueryAnswer)\n",
    "\n",
    "colsed_prompt = ChatPromptTemplate.from_messages([\n",
    "    SystemMessagePromptTemplate.from_template(system_msg_close),\n",
    "    HumanMessagePromptTemplate(\n",
    "        prompt=PromptTemplate(\n",
    "            template=\"The medical report of the image: {report}. generate high-quality questions for which the correct answers can be inferred solely from the provided report. \",\n",
    "            input_variables=[\"report\"],\n",
    "            partial_variables={\"format_instructions\": closed_parser.get_format_instructions()},\n",
    "        )\n",
    "    ),\n",
    "])\n",
    "\n",
    "closed_chain = (\n",
    "    colsed_prompt \n",
    "    | gpt_model \n",
    "    | closed_parser\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "id": "730dcc83-9638-47ea-9ac2-73171cc7f181",
   "metadata": {},
   "outputs": [],
   "source": [
    "#batch process\n",
    "mimic_report_path = r\"mimic reports folder\"\n",
    "sampled_files_path = r\"mimic_cxr/sampled_paths.json\" # the sampled images from the MIMIC-CXR\n",
    "\n",
    "with open(sampled_files_path, 'r', encoding='utf8') as file:\n",
    "    mimic_files = json.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 185,
   "id": "096c3a26-0c7c-47ce-8892-c8feac50df93",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2000"
      ]
     },
     "execution_count": 185,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(mimic_files) # 2000 samples from the MIMIC-CXR, will use 500"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "id": "8bcc9196-e674-4c4c-8ec7-f89d8c6a8fb6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'p19/p19454978/s57331547/7d047120-d24a497e-fc26ea7e-6c3acc0c-ce5bc190.jpg'"
      ]
     },
     "execution_count": 186,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mimic_files[0] # path sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "44d311e5-5b24-48e9-a08b-468b04c76ac4",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-19T18:22:00.702576600Z",
     "start_time": "2024-08-19T18:22:00.685218400Z"
    }
   },
   "outputs": [],
   "source": [
    "def to_report(file_path): #get the report of a medical image\n",
    "    path = \"/\".join(file_path.split(\"/\")[:3]) + \".txt\"\n",
    "    with open(os.path.join(mimic_report_path, path), \"r\") as f:\n",
    "        report = f.read()\n",
    "    return path, file_path, report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "id": "7de673ae-7462-4c68-9d5d-339384bb0fb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "sampled_close_paths = mimic_files[:500] #sample 500 for close-ended"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "id": "57d036df-f32f-4906-bfed-a394e3a7609f",
   "metadata": {},
   "outputs": [],
   "source": [
    "mimic_close_qas = []\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ae923979-3735-4b38-b905-6978d0d27677",
   "metadata": {
    "scrolled": true,
    "ExecuteTime": {
     "end_time": "2024-08-19T18:14:18.076239Z",
     "start_time": "2024-08-19T18:14:18.071146200Z"
    }
   },
   "outputs": [],
   "source": [
    "for _id in tqdm(sampled_close_paths):\n",
    "    report_path, img_path, report = to_report(_id)\n",
    "    ans = closed_chain.invoke(dict(report=report))\n",
    "    final_new_qa = {\"img_path\":img_path, \"new_qa_pairs\": ans}\n",
    "    mimic_close_qas.append(final_new_qa)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "id": "b0e35d73-f0c7-43f4-8d9d-af26aee24be4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "500"
      ]
     },
     "execution_count": 224,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(mimic_close_qas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 196,
   "id": "4703ac3a-f117-496e-be8e-16961500420c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'img_path': 'p11/p11924226/s56990167/dc00203a-4168ce8c-d79d47d2-eef8780b-d3fe037a.jpg',\n",
       " 'new_qa_pairs': Closed_QueryAnswer(qa_pairs=[Closed_QA(Q='Is the heart size within normal limits in the image?', A='Yes', type='type_3', choices='', omission_type=1, ground_truth_type='binary'), Closed_QA(Q='Does the image show any abnormalities in the lung fields?', A='No', type='type_3', choices='', omission_type=0, ground_truth_type='binary'), Closed_QA(Q='What part of the mediastinum appears altered in the image?', A='The superior mediastinum', type='type_2', choices='A. The superior mediastinum, B. The inferior mediastinum, C. The anterior mediastinum, D. The posterior mediastinum', omission_type=-1, ground_truth_type='multi-choice'), Closed_QA(Q=\"Is the patient's position affecting the clarity of the image?\", A='Yes', type='type_3', choices='', omission_type=1, ground_truth_type='binary'), Closed_QA(Q='Are follow-up films suggested for the patient?', A='Yes', type='type_3', choices='', omission_type=1, ground_truth_type='binary'), Closed_QA(Q='What is the recommended interval for the follow-up films?', A='Four to six weeks', type='type_2', choices='A. Two to three weeks, B. Four to six weeks, C. Six to eight weeks, D. Eight to ten weeks', omission_type=-1, ground_truth_type='multi-choice'), Closed_QA(Q='Is the mediastinum definitively widened?', A='No', type='type_3', choices='', omission_type=0, ground_truth_type='binary'), Closed_QA(Q='What imaging modality was used to take the image?', A='CXR', type='type_4', choices='A. MRI, B. CT, C. CXR, D. Ultrasound', omission_type=-1, ground_truth_type='multi-choice'), Closed_QA(Q='Is there a comparison with previous examinations in the image?', A='No', type='type_3', choices='', omission_type=0, ground_truth_type='binary')])}"
      ]
     },
     "execution_count": 196,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mimic_close_qas[1] #data sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "id": "7cd47076-59ca-42f3-a9b6-f2107dd2ba08",
   "metadata": {},
   "outputs": [],
   "source": [
    "#next step: filter out low-quality data\n",
    "\n",
    "sys_msg_filter = \"\"\"\n",
    "You are provided with a set of QA pairs in a json format. Your task is to improve or filter out low quality QA pairs.\n",
    "Here are some standards for the measurement of low-quality:\n",
    "- improve the answer format if possible.\n",
    "- for \"imaging modality\" type question, ensure that the ground truth is X-ray since all the data are chest x-ray, remove the choice that are not common radiology modality.\n",
    "- for the qa pair with type='type_2', ensure that the question is asking the attribute (position, color, size, etc) of an **organ**, not other things such as the **patient**, delete it if it is not about an organ.\n",
    "- The answer is vague when the question type is multi-choice, try to make the answer clear, which choice is correct.\n",
    "- The answer is like \"xxx is not specified in the image.\" or \"The image does not provide information\", which means that this qa pair should be filtered out becuase there is no ground truth.\n",
    "- The questions that can not response only from the context like \"Are there any changes in the xxx compared to the **previous study**?\"\n",
    "- ensure the answer of multi-choice question is in this type of format: \"A.XXXX, B.XXXX, C.XXXX, ....\"\n",
    "- add more choices if the multi-choice QA with only one choice like only \"A.XXXX\"!\n",
    "You can also judge from other common standards.\n",
    "Then please format the qa pairs into a high-quality format.\n",
    "Notice: jusr remove the qa_pair if it is of low-quality.\n",
    "\"\"\"\n",
    "filter_prompt = ChatPromptTemplate.from_messages([\n",
    "    SystemMessagePromptTemplate.from_template(sys_msg_filter),\n",
    "    HumanMessagePromptTemplate(\n",
    "        prompt=PromptTemplate(\n",
    "            template=\"The generated QA pairs in a json format: {qa_pairs}. Try to improve or filter out low-quality QA pairs, return in the original json format, but output the QA pair list as the value of additional key 'qa_pairs'. No other items, ensure the complete json format for parser!\",\n",
    "            input_variables=[\"qa_pairs\"],\n",
    "            partial_variables={\"format_instructions\": closed_parser.get_format_instructions()},\n",
    "        )\n",
    "    ),\n",
    "])\n",
    "\n",
    "from operator import itemgetter, attrgetter\n",
    "\n",
    "filter_chain = (\n",
    "    filter_prompt\n",
    "    | gpt_model\n",
    "    | closed_parser\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 221,
   "id": "a7f5e3e3-5948-48dc-88e4-aa707b870d7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "filtered_mimic_close_qas = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5412aaae-df5a-4b2d-b23f-198f1d48ac6a",
   "metadata": {
    "scrolled": true,
    "ExecuteTime": {
     "end_time": "2024-08-19T18:14:28.008706200Z",
     "start_time": "2024-08-19T18:14:27.984434Z"
    }
   },
   "outputs": [],
   "source": [
    "# for qa_pairs in tqdm(mimic_close_qas[]):\n",
    "for qa_pairs in tqdm(mimic_close_qas):\n",
    "    ans = filter_chain.invoke(dict(qa_pairs=qa_pairs['new_qa_pairs'].dict()))\n",
    "    final_new_qa = {\"img_path\":qa_pairs['img_path'], \"new_qa_pairs\": ans}\n",
    "    filtered_mimic_close_qas.append(final_new_qa)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "id": "4d36b9bd-3cef-49ea-8bf3-05b1c334c196",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "500"
      ]
     },
     "execution_count": 236,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(filtered_mimic_close_qas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 237,
   "id": "d711036c-2166-45bc-9ee8-00723c8779bb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "({'img_path': 'p16/p16043637/s52793175/1b3d4f71-68977c5e-a070ff6b-29584c84-b70bf667.jpg',\n",
       "  'new_qa_pairs': Closed_QueryAnswer(qa_pairs=[Closed_QA(Q='Does the image show any signs of pneumonia?', A='No', type='type_3', choices='', omission_type=0, ground_truth_type='binary'), Closed_QA(Q='Is there a pacemaker visible in the image?', A='Yes', type='type_1', choices='', omission_type=1, ground_truth_type='binary'), Closed_QA(Q='Can you identify the presence of pleural effusion in the image?', A='No', type='type_3', choices='', omission_type=0, ground_truth_type='binary'), Closed_QA(Q='What type of medical device is noted on the left side in the image?', A='A: A left-sided pacemaker', type='type_4', choices='A: A left-sided pacemaker, B: A stent, C: An infusion pump, D: A defibrillator', omission_type=-1, ground_truth_type='multi-choice'), Closed_QA(Q='Is there evidence of a pneumothorax in the image?', A='No', type='type_3', choices='', omission_type=0, ground_truth_type='binary'), Closed_QA(Q='What anatomical structure has been replaced, as seen in the image?', A='A: Aortic valve', type='type_1', choices='A: Aortic valve, B: Mitral valve, C: Pulmonary valve, D: Tricuspid valve', omission_type=-1, ground_truth_type='multi-choice'), Closed_QA(Q='Are the cardiomediastinal and hilar contours normal?', A='Yes', type='type_3', choices='', omission_type=1, ground_truth_type='binary'), Closed_QA(Q='What imaging modality was used to take the image?', A='C: Chest radiograph', type='type_4', choices='A: MRI, B: CT scan, C: Chest radiograph, D: Ultrasound', omission_type=-1, ground_truth_type='multi-choice'), Closed_QA(Q='Are there any signs of pulmonary edema in the image?', A='No', type='type_3', choices='', omission_type=0, ground_truth_type='binary'), Closed_QA(Q='Is there any new focal consolidation observed?', A='No', type='type_3', choices='', omission_type=0, ground_truth_type='binary')])},\n",
       " 500)"
      ]
     },
     "execution_count": 237,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filtered_mimic_close_qas[-1], len(filtered_mimic_close_qas) # data sample after the filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 235,
   "id": "19485c4e-47bc-48e5-bd47-5870f3b03756",
   "metadata": {},
   "outputs": [],
   "source": [
    "# save file\n",
    "def trans_file(qas_list):\n",
    "    new_list = []\n",
    "    id_ = 0\n",
    "    for i in qas_list:\n",
    "        img_id = i['img_path']\n",
    "        new_qas = i['new_qa_pairs'].dict()['qa_pairs']\n",
    "        # old_qa_ids = img_data_ids[img_id]\n",
    "        # old_qa_pair = test_data_id_map[old_qa_ids[0]]\n",
    "        for j in new_qas:\n",
    "            new_dict = dict()\n",
    "            new_dict['img_name'] = img_id\n",
    "            new_dict['question'] = j['Q']\n",
    "            new_dict['answer'] = j['A']\n",
    "            if \"image\" in j['A'].lower() and \"not\" in j['A'] and \"provide\" in j['A']:\n",
    "                continue\n",
    "            if j['ground_truth_type'] == \"multi-choice\":\n",
    "                if \"B\" not in j['choices']:\n",
    "                    continue\n",
    "                \n",
    "            if j['ground_truth_type'] == \"binary\":\n",
    "                if \"yes\" not in j['A'].lower() and \"no\" not in j['A'].lower():\n",
    "                    print(j['A'].lower())\n",
    "                    continue\n",
    "            new_dict['hallucination_type'] = j['type']\n",
    "            new_dict['question_type'] = j['ground_truth_type']\n",
    "            new_dict['choices'] = j['choices']\n",
    "            new_dict['qid'] = id_\n",
    "            new_dict['img_id'] = img_id\n",
    "            new_dict['location'] = None\n",
    "            new_dict['modality'] = None\n",
    "            new_list.append(new_dict)\n",
    "            id_ += 1\n",
    "    return new_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "cf830563-c27f-47ec-be26-3b43e08ea212",
   "metadata": {
    "scrolled": true,
    "ExecuteTime": {
     "end_time": "2024-08-19T18:23:47.589229700Z",
     "start_time": "2024-08-19T18:23:47.569903900Z"
    }
   },
   "outputs": [],
   "source": [
    "mimic_new_qas_json = trans_file(filtered_mimic_close_qas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 240,
   "id": "46dd6cae-366e-40e1-a8a6-342e9357172e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data saved!\n"
     ]
    }
   ],
   "source": [
    "with open(\"saved json file\", 'w') as json_file:\n",
    "    json.dump(mimic_new_qas_json, json_file, indent=4)\n",
    "\n",
    "print(f\"Data saved!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0d8241eb-5047-422c-b25d-2a11d542925d",
   "metadata": {},
   "source": [
    "### Open-ended data generation for Visual Factual Hallucination"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "id": "19f531d0-41f3-4e37-8b20-84564d1345a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_msg_open = \"\"\" You are provided with the clinical report about a medical image. You task is to synthesize a set of open-ended QA pairs according to the requirements. Unfortunately, you don’t have access to the actual image. Below are requirements for generating the questions and answers in the conversation.\n",
    "\n",
    "Do not use phrases like \"mentioned\", \"report\", \"context\" in the conversation. Instead, refer to the information as being \"in the image.“\n",
    "Answer responsibly within the given report, avoiding information not included in the given context. \n",
    "\n",
    "You need to generate the question to query three types of components: (1)anatomical structure; (2)anatomical measurement(organ location, size); (3)symptoms(such as normal or abnormal symptoms, not direct diagnosis). \n",
    "Here is an example question: \"List your findings of anatomical structures and measurements in detail, as well as the possible symptoms, abnormal findings on these structures.\"\n",
    "\n",
    "Instructions:\n",
    "Additionally return a structured answer with clear classification of three components, which means simply classify the answer into this form:\n",
    "       {\"Sturctured_Answer\": {\"anatomy\": List[str], \"measurement\": List[str], \"symptom\": List[str]} }\n",
    "Make sure the classification is precise and accurate. If you are not sure about the category, do not include it in the structured result.\n",
    "The structured output of \"measurement\" should be the measurements of organs or important structures, it could be an empty list if there is no important measurements.\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "id": "9714f83f-1da5-4119-9c77-99816aa2ce8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "class StructureAns(BaseModel):\n",
    "    anatomy: List[str] = Field(description=\"The structured answer about anatomical structure\")\n",
    "    measurement: List[str] = Field(description=\"The structured answer about anatomical structure measurement\")\n",
    "    symptom: List[str] = Field(description=\"The structured answer about possible symptoms\")\n",
    "\n",
    "class Type2_QA(BaseModel):\n",
    "    Question: str = Field(description=\"The question\")\n",
    "    Answer: str = Field(description=\"The answer\")\n",
    "    Sturctured_Answer: StructureAns = Field(description=\"The structured answer\")\n",
    "\n",
    "class OpenQA(BaseModel):\n",
    "    type_2: Type2_QA = Field(description=\"The QA generated.\")\n",
    "\n",
    "\n",
    "open_parser = PydanticOutputParser(pydantic_object=OpenQA)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "id": "9bb93172-ac91-4c20-a839-9c884a42a51e",
   "metadata": {},
   "outputs": [],
   "source": [
    "output_format: \"\"\"\n",
    "{\n",
    "\"type_1\": {\"Question\": str, \"Answer\":str, \"Sturctured_Answer\":{\"anatomy\": list[str], \"measurement\": list[str], \"symptom\": list[str]}}\n",
    "}\n",
    "\n",
    "\"\"\"\n",
    "from langchain_core.messages import HumanMessage, SystemMessage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "id": "a18f10ff-de5b-4deb-9dc4-1b787d15fec8",
   "metadata": {},
   "outputs": [],
   "source": [
    "rag_prompt_wo_knowledge = ChatPromptTemplate.from_messages([\n",
    "    # SystemMessagePromptTemplate.from_messages(system_msg_open_wo_knowledge),\n",
    "    SystemMessage(content=system_msg_open),\n",
    "    HumanMessagePromptTemplate(\n",
    "        prompt=PromptTemplate(\n",
    "            template=\"\"\"\n",
    "            The medical imaging report: {report}, ensure that you follow the output format as detailed in the system format. \n",
    "            \"\"\",\n",
    "            input_variables=[\"report\"],\n",
    "            partial_variables={\"format_instructions\": open_parser.get_format_instructions()},\n",
    "        )\n",
    "    ),\n",
    "])\n",
    "\n",
    "rag_chain_wo_knowledge = (\n",
    "    rag_prompt_wo_knowledge \n",
    "    | gpt_model \n",
    "    | open_parser\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "id": "fd7cd757-9f53-40d2-8d95-e2636601de3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "open_qas = []\n",
    "sampled_open_paths = mimic_files[500:1000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "048dea27-f38e-4ebe-bcee-0cb75a95ce3f",
   "metadata": {
    "scrolled": true,
    "ExecuteTime": {
     "end_time": "2024-08-19T18:39:52.076907700Z",
     "start_time": "2024-08-19T18:39:52.061611200Z"
    }
   },
   "outputs": [],
   "source": [
    "#batch process\n",
    "# for qa_pairs in tqdm(mimic_close_qas[]):\n",
    "for _id in tqdm(sampled_open_paths):\n",
    "    report_path, img_path, report = to_report(_id)\n",
    "    ans = rag_chain_wo_knowledge.invoke(dict(report=report))\n",
    "    final_new_qa = {\"img_path\":img_path, \"qa_pairs\": ans}\n",
    "    open_qas.append(final_new_qa)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 231,
   "id": "f144b1a3-4d14-462b-b546-9a8bdd2a0adf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(500,\n",
       " {'type_1': {'Question': 'What does the image show in a few words?',\n",
       "   'Answer': 'The image is a portable AP chest X-ray showing severe cardiomegaly, mild vascular congestion, retrocardiac opacities, and a possible small left effusion.'},\n",
       "  'type_2': {'Question': 'List your findings of anatomical structures and measurements in detail, as well as the possible symptoms, abnormal findings on these structures.',\n",
       "   'Answer': 'The chest X-ray reveals a severely enlarged heart, consistent with cardiomegaly. The vascular markings are mildly congested. There are retrocardiac opacities that have shown improvement, suggesting resolving atelectasis. Additionally, there may be a small effusion on the left side. No evidence of pneumothorax is present.',\n",
       "   'Sturctured_Answer': {'anatomy': ['heart',\n",
       "     'vascular markings',\n",
       "     'retrocardiac area',\n",
       "     'left pleural space'],\n",
       "    'measurement': [],\n",
       "    'symptom': ['severe cardiomegaly',\n",
       "     'mild vascular congestion',\n",
       "     'improved retrocardiac opacities',\n",
       "     'possible small left effusion',\n",
       "     'no pneumothorax']}}})"
      ]
     },
     "execution_count": 231,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(open_qas), open_qas[1][\"qa_pairs\"].dict() # data example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 232,
   "id": "b5862e84-d94a-4b3b-9e98-50e70cc2cfd3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# save file\n",
    "def trans_file_open(qas_list):\n",
    "    new_list = []\n",
    "    id_ = 0\n",
    "    for i in qas_list:\n",
    "        img_id = i['img_path']\n",
    "        new_qas = i['qa_pairs'].dict()\n",
    "        new_dict = dict()\n",
    "        \n",
    "        j = new_qas\n",
    "        new_dict['img_name'] = img_id\n",
    "        new_dict['question'] = j['type_1'][\"Question\"]\n",
    "        new_dict['answer'] = j['type_1'][\"Answer\"]\n",
    "        new_dict['question_type'] = 'type_1'\n",
    "        new_dict['qid'] = id_\n",
    "        new_dict['img_id'] = img_id\n",
    "\n",
    "        new_list.append(new_dict)\n",
    "        id_ += 1\n",
    "        new_list.append(new_dict)\n",
    "\n",
    "    return new_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "51ad9f39-eba9-47f5-95f9-f51ccd19d597",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-19T18:41:08.744994Z",
     "start_time": "2024-08-19T18:41:08.730370700Z"
    }
   },
   "outputs": [],
   "source": [
    "mimic_open_json = trans_file_open(open_qas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 234,
   "id": "3e85c2e9-d86f-4a4c-8b2a-ecd2a12871c2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data saved!\n"
     ]
    }
   ],
   "source": [
    "with open(\"saved json file\", 'w') as json_file:\n",
    "    json.dump(mimic_open_json, json_file, indent=4)\n",
    "\n",
    "print(f\"Data saved!\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "language": "python",
   "display_name": "Python 3 (ipykernel)"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
