import os
import argparse
from tqdm import tqdm
import pandas as pd
import re
import csv
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import ast
import math
import numpy as np


def parse_args():
    """Parse the command-line options of the labelling script.

    Returns:
        argparse.Namespace with two string attributes:
        ``split_id`` (defaults to ``'1'``) and ``model_name_or_path``
        (defaults to ``None``).
    """
    cli = argparse.ArgumentParser()
    # Both options are plain strings; only their defaults differ.
    for flag, fallback in (('--split_id', '1'), ('--model_name_or_path', None)):
        cli.add_argument(flag, type=str, default=fallback)
    return cli.parse_args()


def adjust_negative_label(input_string):
    """Append a ``-`` marker to every comma-separated label in a string.

    The special values ``'25'`` and ``'X'`` are returned untouched.  Any
    other input is split on ``','`` and each piece (whitespace included)
    gets a trailing ``'-'`` before the pieces are joined with ``','`` again.

    Args:
        input_string: Label string such as ``'15,2,3'``.

    Returns:
        The marked label string, e.g. ``'15-,2-,3-'``.
    """
    if input_string in ('25', 'X'):
        return input_string
    return ','.join(f'{piece}-' for piece in input_string.split(','))

def adjust_positive_label(n):
    """Translate a numeric label (1-26) into its positive label string.

    The table is: 1..16 -> ``'<n>+'``, 17 -> ``'15+'``, 18..25 ->
    ``'<n-1>+'`` and 26 -> ``'25'`` (no ``+`` suffix).
    NOTE(review): this looks like a conversion from the 26-label numbering
    of ``prompt_2_data`` to the 25-label numbering of ``prompt_2_pro_data``
    (both consolidation labels collapse onto 15) -- confirm with the author.

    Args:
        n: Label number to look up.

    Returns:
        The mapped label string, or the fallback message when ``n`` is not
        a key of the table.
    """
    table = {number: f'{number}+' for number in range(1, 17)}
    table[17] = '15+'
    # From 18 onwards every label is shifted down by one.
    table.update({number: f'{number - 1}+' for number in range(18, 26)})
    table[26] = '25'
    return table.get(n, "输入超出范围")

def prompt_1_data(sent):
    """Build the chat messages for the sentence-rewriting prompt.

    The system message asks the model to rewrite a report sentence in a
    clear, simple form, or to answer ``X`` when the sentence is unrelated
    to any specific medical part or symptom.

    Args:
        sent: The report sentence placed in the user message.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system
        prompt followed by the user sentence.
    """
    system_prompt = """
Assume you are an experienced radiologist. Please help me process the following medical diagnostic report sentences, adhering to the following requirements:

1.Rewriting Requirement: If the sentence content is related to specific medical parts or symptoms, please rewrite the sentence to ensure:
    (1)Semantic Clarity: The meaning of the sentence is clear and unambiguous.
    (2)Avoid Complex Negative Sentences: The structure of negative sentences is simplified to avoid confusion.
    (3)Avoid Double Negations: Double negatives are not used, as they can complicate understanding.
    (4)Use Active Voice: The active voice is employed whenever possible to enhance directness and clarity.
    (5)Avoid Inversion: Inverted sentence structures are not used to prevent complexity in understanding, rewrite format:'No xxx is present/show' to format:'The patient shows no signs of xxx'.
    (6)Accuracy of Professional Terminology: Precise and consistent medical terminology is used to maintain accuracy.
2.Marking Requirement: If the sentence content is unrelated to any specific medical parts or symptoms, such as when mentioning comparisons or general descriptions (e.g., "Compared to chest radiographs since the most recent one"), please output the letter "X" to mark this situation.

Examples:

Input: The patient exhibits signs of acute appendicitis with localized tenderness in the lower right quadrant.
Output: The patient shows symptoms of acute appendicitis, with localized tenderness observed in the lower right abdominal quadrant.

Input: No consolidation or bone fracture is present.
Output: The patient shows no signs of consolidation or bone fracture.

Input: No acute cardio pulmonary process.
Output: The patient shows no signs of acute cardiac or pulmonary issues.

Input: Compared to chest radiographs since the most recent one, there are no significant changes noted.
Output: X

Please process the input medical diagnostic report sentences according to the above prompt.
Notice: you only need to output the rewrite sentence or "X" , do not output any other thing!
"""
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": sent},
    ]
def prompt_2_data(sent):
    """Build the chat messages for single-label classification (26 labels).

    The system message lists 26 numbered condition labels (26 being
    "No Finding"), gives examples, and asks for exactly one number.

    Args:
        sent: The report sentence placed in the user message.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system
        prompt followed by the user sentence.
    """
    system_prompt = """
Assume you are an experienced radiologist. Help me identify the correct medical condition label for the given radiology report sentence. Below are the medical condition labels with corresponding numbers and its medical description:

1.Atelectasis: Lung tissue exhibits signs of partial or complete atelectasis, with decreased lung volume and increased localized radiographic density.
2.Pleural Effusion: There is an abnormal accumulation of fluid within the pleural cavity, which is evident on X-ray imaging as a blunted costophrenic angle or the presence of a fluid level.
3.Pneumothorax: There is evidence of free air within the thoracic cavity, resulting in partial or complete atelectasis. This is characterized by a retracted lung edge and an area devoid of lung markings.
4.Cardiomegaly: The cardiac silhouette is enlarged, indicating a size beyond the normal range.
5.Lung Opacity: A lung region demonstrates increased radiopacity, potentially suggestive of inflammatory changes, a tumor, or hemorrhage.
6.Pneumonia: Patchy to diffuse lung infiltrates are observed, frequently associated with air bronchograms.
7.Pulmonary Mass: A lung mass, either well-circumscribed or poorly defined, is present, typically measuring over 3 cm in diameter.
8.Edema: Interstitial or alveolar fluid accumulation in the lungs is evident, characterized by increased and indistinct lung markings, a common finding in cardiogenic pulmonary edema.
9.Lung Nodule: A round or oval opacity within the lung, measuring less than 3 cm in diameter.
10.Lung Infiltration: Patchy or reticular opacities are noted within the lung tissue, suggestive of inflammatory processes or other infiltrative conditions.
11.Pulmonary Fibrosis: Interstitial lung thickening and fibrosis are present, exhibiting a reticular pattern and a honeycomb-like appearance on imaging studies.
12.Pulmonary Emphysema: Overinflation of the lungs with alveolar destruction is observed, manifesting as increased lung lucency and expanded lung fields on the chest X-ray.
13.Pleural Thickening: Pleural layer thickening is evident, appearing as broadened pleural shadows on imaging, often attributed to chronic inflammation or fibrosis.
14.Hernia: Internal organ protrusion through either normal or abnormal openings is observed; in the case of a diaphragmatic hernia, this may present as an abnormal position and contour of the diaphragm on X-ray imaging.
15.Pulmonary Consolidation: The lung alveoli are opacified with fluid or solid material, manifesting as regions of increased density with indistinct borders on imaging.
16.Bone Fracture: A discontinuity within the bone structure is observed on X-ray, characterized by a disrupted cortical bone and the presence of fracture lines.
17.Consolidation: Similar to pulmonary consolidation, the lung tissue exhibits a solidified appearance, a common finding in cases of pneumonia.
18.Enlarged Cardiomediastinum: An enlargement of the mediastinal shadow is noted, which may indicate the presence of mediastinal tumors, hematoma, or other mediastinal abnormalities.
19.Pleural Other: Other pleural abnormalities, such as pleural calcifications or plaques, are present, exhibiting specific imaging features that are indicative of the underlying condition.
20.Lung Lesion: An encompassing term for a variety of abnormal imaging findings within the lung, encompassing nodules, masses, infiltrates, and other anomalies.
21.Support Devices: Visualized on imaging are various medical devices, including catheters, stents, prosthetic heart valves, and other implanted or inserted medical apparatus.
22.Abnormal Lesion: A non-specific term denoting any abnormal imaging findings within the lungs, which may encompass a range of presentations such as nodules, masses, opacities, or other anomalies.
23.Lung Granuloma: A small pulmonary nodule, usually measuring less than 3 cm in diameter, frequently exhibiting calcification.
24.Calcified Granuloma: A calcified granuloma is observed, manifesting as a high-density nodule on imaging studies.
25.Tissue Calcification: Calcifications within soft tissue are noted, appearing as areas of increased density on imaging, indicative of calcified spots or plaques.
26.No Finding: The specified anatomical areas appear normal, and the reported symptoms are negative or absent.

Examples:

Sentence: there is no focal consolidation pleural effusion or pneumothorax.
Label: 26

Sentence: the cardiac mediastinal and hilar contours are normal.
Label: 26

Sentence: bilateral nodular opacities that most likely represent nipple shadows.
Label: 9

Sentence: the cardiomediastinal silhouette is normal.
Label: 26

Sentence: chronic deformity of the posterior left sixth and seventh ribs are noted.
Label: 16

Sentence: The patient shows no signs of free air below the right hemidiaphragm.
Label: 26

Special Note:
If the sentence describes normal anatomical regions or uses phrases like "no," "not present," "negative" or start with "The patient shows no signs of" for symptoms, the label should be "No Finding" (26).
If the sentence describes a specific abnormality or lesion, use the appropriate label.

In case you forgot, let me repeat these labels:
1. Atelectasis
2. Pleural Effusion
3. Pneumothorax
4. Cardiomegaly
5. Lung Opacity
6. Pneumonia
7. Pulmonary Mass
8. Edema
9. Lung Nodule
10. Lung Infiltration
11. Pulmonary Fibrosis
12. Pulmonary Emphysema
13. Pleural Thickening
14. Hernia
15. Pulmonary Consolidation
16. Bone Fracture
17. Consolidation
18. Enlarged Cardiomediastinum
19. Pleural Other
20. Lung Lesion
21. Support Devices
22. Abnormal Lesion
23. Lung Granuloma
24. Calcified Granuloma
25. Tissue Calcification
26. No Finding

Now, please select the most appropriate label for the following sentence and output only the corresponding number. 
Notice: you only need to output label(pure numbers), do not output any other thing!
"""
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": sent},
    ]

def prompt_2_pro_data(sent):
    """Build the chat messages for multi-label classification (25 labels).

    The system message lists 25 numbered condition labels (25 being
    "No Mention"), asks the model to name every condition the sentence
    mentions -- positively or negatively -- and to separate multiple
    numbers with commas.

    Args:
        sent: The report sentence placed in the user message.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system
        prompt followed by the user sentence.
    """
    system_prompt = """
Assume you are an experienced radiologist. Help me identify the correct medical condition label for the given radiology report sentence. Below are the medical condition labels with corresponding numbers and their medical descriptions:

1.Atelectasis: Lung tissue exhibits signs of partial or complete atelectasis, with decreased lung volume and increased localized radiographic density.
2.Pleural Effusion: There is an abnormal accumulation of fluid within the pleural cavity, which is evident on X-ray imaging as a blunted costophrenic angle or the presence of a fluid level.
3.Pneumothorax: There is evidence of free air within the thoracic cavity, resulting in partial or complete atelectasis. This is characterized by a retracted lung edge and an area devoid of lung markings.
4.Cardiomegaly: The cardiac silhouette is enlarged, indicating a size beyond the normal range.
5.Opacity: A lung region demonstrates increased radiopacity, potentially suggestive of inflammatory changes, a tumor, or hemorrhage.
6.Pneumonia: Patchy to diffuse lung infiltrates are observed, frequently associated with air bronchograms.
7.Pulmonary Mass: A lung mass, either well-circumscribed or poorly defined, is present, typically measuring over 3 cm in diameter.
8.Edema: Interstitial or alveolar fluid accumulation in the lungs is evident, characterized by increased and indistinct lung markings, a common finding in cardiogenic pulmonary edema.
9.Lung Nodule: A round or oval opacity within the lung, measuring less than 3 cm in diameter.
10.Lung Infiltration: Patchy or reticular opacities are noted within the lung tissue, suggestive of inflammatory processes or other infiltrative conditions.
11.Fibrosis: Interstitial lung thickening and fibrosis are present, exhibiting a reticular pattern and a honeycomb-like appearance on imaging studies.
12.Emphysema: Overinflation of the lungs with alveolar destruction is observed, manifesting as increased lung lucency and expanded lung fields on the chest X-ray.
13.Pleural Thickening: Pleural layer thickening is evident, appearing as broadened pleural shadows on imaging, often attributed to chronic inflammation or fibrosis.
14.Hernia: Internal organ protrusion through either normal or abnormal openings is observed; in the case of a diaphragmatic hernia, this may present as an abnormal position and contour of the diaphragm on X-ray imaging.
15.Consolidation: The lung alveoli are opacified with fluid or solid material, manifesting as regions of increased density with indistinct borders on imaging. the lung tissue exhibits a solidified appearance, a common finding in cases of pneumonia.
16.Bone Fracture: A discontinuity within the bone structure is observed on X-ray, characterized by a disrupted cortical bone and the presence of fracture lines.
17.Enlarged Cardiomediastinum: An enlargement of the mediastinal shadow or cardiomediastinal silhouette is noted.
18.Pleural Other: Other pleural abnormalities, such as pleural calcifications or plaques, are present, exhibiting specific imaging features that are indicative of the underlying condition.
19.Lung Lesion: An encompassing term for a variety of abnormal imaging findings within the lung, encompassing nodules, masses, infiltrates, and other anomalies.
20.Support Devices: Visualized on imaging are various medical devices, including catheters, stents, prosthetic heart valves, and other implanted or inserted medical apparatus.
21.Abnormal Lesion: A non-specific term denoting any abnormal imaging findings within the lungs, which may encompass a range of presentations such as nodules, masses, opacities, or other anomalies.
22.Lung Granuloma: A small pulmonary nodule, usually measuring less than 3 cm in diameter, frequently exhibiting calcification.
23.Calcified Granuloma: A calcified granuloma is observed, manifesting as a high-density nodule on imaging studies.
24.Tissue Calcification: Calcifications within soft tissue are noted, appearing as areas of increased density on imaging, indicative of calcified spots or plaques.
25.No Mention: None of the above symptoms are mentioned or related, cannot exist with any other labels at the same time.

Examples:

Sentence: there is no focal consolidation pleural effusion or pneumothorax.
Label: 15, 2, 3

Sentence: bilateral nodular opacities that most likely represent nipple shadows.
Label: 9

Sentence: chronic deformity of the posterior left sixth and seventh ribs are noted.
Label: 16

Sentence: the patient shows no signs of free air below the right hemidiaphragm.
Label: 3

Sentence: the imaged upper abdomen shows no remarkable findings.
Label: 25

Sentence: the patient's overall condition is normal.
Label: 25

Special Note:
If the sentence mentions a condition (whether positive or negative), use the corresponding label.
If the sentence describes multiple conditions, except 25(No Mention), output multiple labels, separated by commas ",".
Remember, 25(No Mention) and other labels cannot exist at the same time.

In case you forgot, let me repeat these labels:
1. Atelectasis
2. Pleural Effusion
3. Pneumothorax
4. Cardiomegaly
5. Opacity
6. Pneumonia
7. Pulmonary Mass
8. Edema
9. Lung Nodule
10. Lung Infiltration
11. Fibrosis
12. Emphysema
13. Pleural Thickening
14. Hernia
15. Consolidation
16. Bone Fracture
17. Enlarged Cardiomediastinum
18. Pleural Other
19. Lung Lesion
20. Support Devices
21. Abnormal Lesion
22. Lung Granuloma
23. Calcified Granuloma
24. Tissue Calcification
25. No Mention

Now, please select the most appropriate label for the following sentence and output only the corresponding number(s).
Notice: you only need to output the label (pure numbers), do not output anything else!
"""
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": sent},
    ]

def prompt_3_data(sent):
    """Build the chat messages for the 16 / 17 / 25 / 26 follow-up prompt.

    The system message describes four labels (16: bony structures,
    17: mediastinal or hilar outline, 25: patient looks normal, 26: other
    symptoms) and asks for exactly one number.

    Args:
        sent: The report sentence placed in the user message.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system
        prompt followed by the user sentence.
    """
    system_prompt = """
Assume you are an experienced radiologist. Help me determine whether the patient has Cardiomegaly symptoms. Below are the medical condition labels with their descriptions:

16: mention the patient's designated site: bony structures or bone fracture.
17: mention the patient's designated site: mediastinal shadow or mediastinal outline or mediastinal or hilar outline.
25: the patient looks like normal.
26: there is other symptoms present, such as vascular congestion, thoracic kyphosis, aorta tortuous and so on.

Examples:

Sentence: The patient's bony structures appear normal.
Label: 16

Sentence: The mediastinal and hilar contours appear normal.
Label: 17

Sentence: The patient's pulmonary vascular congestion is stable and mild.
Label: 26

Sentence: The patient exhibits persistent thoracic kyphosis, with mild wedging observed in a mid-thoracic vertebral body.
Label: 26

Sentence: The patient shows mild vascular engorgement.
Label: 26

Sentence: The patient had no acute intrathoracic symptoms.
Label: 25

Sentence: The aorta is tortuous.
Label: 26

Sentence: The aorta shows slight tortuosity.
Label: 26

Special Note:
If the sentence does not mention neither [bony structures, bone fracture] or [mediastinal shadow, mediastinal outline, mediastinal, hilar outline], while the patient is not healthy and present some symptoms, choose 26.

In case you forgot, let me repeat these labels:
16: mention the patient's designated site: bony structures or bone fracture.
17: mention the patient's designated site: mediastinal shadow or mediastinal outline or mediastinal or hilar outline.
25: the patient looks like normal.
26: there is other symptoms present.

Now, please select the most appropriate label for the following sentence and output only the corresponding number. 
Notice: you only need to output label(pure numbers), do not output any other thing!
"""
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": sent},
    ]

def prompt_3_pro_data(sent):
    """Build the chat messages for the "diagnostic information?" prompt.

    The system message asks the model to answer ``25`` when the sentence
    contains diagnostic information and ``X`` when it does not.

    Args:
        sent: The report sentence placed in the user message.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system
        prompt followed by the user sentence.
    """
    system_prompt = """
Assume you are an experienced radiologist. Help me identify whether the given radiology report sentence contains diagnostic information.
If the sentence contains diagnostic information related to a medical condition, output "25".
If the sentence does not contain any diagnostic information, output "X".

Examples:

Sentence: The patient would need to be repositioned, with the tube pulled back approximately 10 cm and then redirected towards the lower abdomen.
Label: X

Sentence: The patient had no acute intrathoracic symptoms.
Label: 25

Now, please select the most appropriate label for the following sentence and output only the corresponding number. 
Notice: you only need to output label(pure numbers), do not output any other thing!
"""
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": sent},
    ]

def prompt_4_data(sent):
    """Build the chat messages for the 17-versus-25 prompt.

    The system message asks the model to answer ``17`` when the sentence
    refers to the mediastinal / hilar outline and ``25`` otherwise.

    The rule text previously named "bony structures or bone fracture" as
    the trigger for 17.  That contradicted both in-prompt examples (a
    cardiomediastinal-silhouette sentence labelled 17 and a vertebral-body
    sentence labelled 25), so the rule now uses the mediastinal site
    description that ``prompt_3_data`` gives for label 17.

    Args:
        sent: The report sentence placed in the user message.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system
        prompt followed by the user sentence.
    """
    prompt = """
Assume you are an experienced radiologist. Help me identify whether the given radiology report sentence contains specific diagnostic information.

If the sentence mention the patient's designated site: mediastinal shadow or mediastinal outline or mediastinal or hilar outline, output "17".
else, output "25".

Examples:

Sentence: The patient's cardiomediastinal silhouette appears normal.
Label: 17

Sentence: The patient exhibits persistent thoracic kyphosis, with mild wedging observed in a mid-thoracic vertebral body.
Label: 25

Now, please select the most appropriate label for the following sentence and output only the corresponding number. 
Notice: you only need to output label(pure numbers), do not output any other thing!
"""
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": sent}
    ]
    return messages

def prompt_5_data(sent):
    """Build the chat messages for the Cardiomegaly-versus-mediastinum prompt.

    The system message asks the model to answer ``4`` (Cardiomegaly) or
    ``17`` (Enlarged Cardiomediastinum) for the given sentence.

    Args:
        sent: The report sentence placed in the user message.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system
        prompt followed by the user sentence.
    """
    system_prompt = """
Assume you are an experienced radiologist. Help me identify whether the given radiology report sentence describes "Cardiomegaly" or "Enlarged Cardiomediastinum."
Use the following labels for classification:

4.Cardiomegaly: The cardiac silhouette is enlarged, indicating an increased heart size beyond the normal range. This is specifically identified by an increased cardiothoracic ratio on imaging.
17.Enlarged Cardiomediastinum: An enlargement of the mediastinal shadow or cardiomediastinal silhouette is noted, which may include enlarged mediastinal or hilar contours. This can indicate the presence of mediastinal tumors, hematoma, or other abnormalities within the mediastinum or involving the heart and surrounding structures.

Examples:

Input: The cardiac silhouette is enlarged, indicating cardiomegaly.
Output: 4

Input: There is enlargement of the mediastinal shadow.
Output: 17

Input: The mediastinum appears widened on the X-ray.
Output: 17

Input: No evidence of cardiomegaly is observed.
Output: 4

Now, please select the most appropriate label for the following sentence.
Notice: output only the corresponding number ("4" or "17").
"""
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": sent},
    ]

def inference(messages, model, tokenizer):
    """Run one greedy chat completion and return the generated text.

    Args:
        messages: Chat messages (list of ``{"role", "content"}`` dicts)
            passed to ``tokenizer.apply_chat_template``.
        model: Object exposing ``device`` and ``generate``.
        tokenizer: Object exposing ``apply_chat_template``,
            ``eos_token_id``, ``convert_tokens_to_ids`` and ``decode``.

    Returns:
        The decoded continuation only (prompt tokens are sliced off),
        with special tokens skipped.
    """
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # Stop on the tokenizer's EOS token and on "<|eot_id|>" (presumably the
    # end-of-turn marker of the chat template -- confirm per model family).
    # A tokenizer that does not know the token may return None, so unusable
    # and duplicate ids are dropped.
    terminators = []
    for token_id in (
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ):
        if token_id is not None and token_id not in terminators:
            terminators.append(token_id)

    # Decoding is greedy (do_sample=False); the sampling-only knobs
    # temperature/top_p are intentionally not passed because they have no
    # effect without sampling and contradict the greedy setting.
    outputs = model.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=terminators or None,
        do_sample=False,
    )
    # Keep only the tokens generated after the prompt.
    generated = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)


def split_csv(data, output_prefix, chunk_size=20000):
    """Write ``data`` to consecutive CSV files of at most ``chunk_size`` rows.

    Files are named ``{output_prefix}_part_{k}.csv`` with ``k`` starting
    at 1 and are written without index and without header row.  Exactly
    ``ceil(len(data) / chunk_size)`` files are produced, so no empty
    trailing file is written when the row count is a multiple of
    ``chunk_size`` and nothing is written for empty input.

    Args:
        data: pandas object supporting ``len``, ``.iloc`` and ``.to_csv``.
        output_prefix: Path prefix of the generated files.
        chunk_size: Maximum number of rows per file; must be positive.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    total_rows = len(data)
    # Stepping through the start offsets yields one iteration per
    # non-empty chunk, which avoids the off-by-one of "n // size + 1".
    for part, start_row in enumerate(range(0, total_rows, chunk_size), start=1):
        chunk = data.iloc[start_row:start_row + chunk_size]
        output_file = f"{output_prefix}_part_{part}.csv"
        chunk.to_csv(output_file, index=False, header=False)
        print(f"Saved {output_file}")

def append_to_csv(file_path, row):
    """Append one row to a CSV file, creating the file if necessary.

    The file is written as UTF-8 so that its content does not depend on
    the platform's locale encoding and matches the default encoding
    ``pandas.read_csv`` uses when the file is read back.

    Args:
        file_path: Path of the CSV file to append to.
        row: Iterable of field values forming one CSV record.
    """
    # newline='' lets the csv module control line endings itself.
    with open(file_path, mode='a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow(row)

def parse_string_to_list(string):
    """Parse the textual form of a Python list back into a list.

    The string is first evaluated as-is with ``ast.literal_eval``.  Only
    when that fails are single-quoted items that directly follow ``[`` or
    ``,`` re-quoted with double quotes (which rescues items containing an
    unescaped apostrophe) and the result parsed again.  Parsing the
    original text first keeps already-valid literals -- e.g. items that
    contain double quotes -- from being corrupted by the re-quoting step.

    Args:
        string: Text such as ``"['a.', 'b.']"``.

    Returns:
        The parsed list, or ``None`` (after printing the reason) when the
        input is not a string, cannot be parsed, or is not a list.
    """
    if not isinstance(string, str):
        print(f"Error parsing string: expected str, got {type(string).__name__}")
        return None
    try:
        try:
            result = ast.literal_eval(string)
        except (SyntaxError, ValueError):
            # Fallback: normalise 'item' -> "item" so apostrophes inside an
            # item no longer terminate the literal early.
            normalized = re.sub(
                r"(?<=\[|,)\s*'(.*?)'\s*(?=,|\])", r'"\1"', string
            )
            result = ast.literal_eval(normalized)
        if not isinstance(result, list):
            raise ValueError("The parsed result is not a list.")
    except (SyntaxError, ValueError) as e:
        print(f"Error parsing string: {e}")
        return None
    return result

def split_sentences(text):
    """Split report text into sentences, each terminated by ``'.'``.

    Newlines are replaced by spaces first.  The text is then split on the
    full-width marks ``。！？；`` and on ``'.'`` -- except when the period
    sits between two digits, so decimal values such as ``3.5 cm`` stay
    inside one sentence instead of being cut in half.

    Args:
        text: Raw report text.

    Returns:
        List of stripped, non-empty sentences, each with ``'.'`` appended.
    """
    text = text.replace('\n', ' ')
    # A '.' is a separator unless it has a digit on BOTH sides.
    pieces = re.split(r'(?<!\d)\.|\.(?!\d)|[。！？；]', text)
    return [piece.strip() + '.' for piece in pieces if piece.strip()]

# Every output row is padded with '0' up to this many columns.
_LABEL_ROW_WIDTH = 59

# Whole-word "no" in any capitalisation ("No effusion", "there is no ...").
_NEGATION_RE = re.compile(r'\bno\b', re.IGNORECASE)


def _count_csv_rows(path):
    """Return the number of non-empty records already stored in ``path``."""
    if not os.path.exists(path):
        return 0
    # csv.reader tolerates rows of differing length and an empty file.
    with open(path, newline='', encoding='utf-8', errors='replace') as handle:
        return sum(1 for record in csv.reader(handle) if record)


def _format_labels(raw_output, symbol):
    """Attach ``symbol`` to every comma-separated label in the model output."""
    tokens = [token.strip() for token in raw_output.split(',')]
    # Fall back to the stripped raw text if nothing usable remains.
    tokens = [token for token in tokens if token] or [raw_output.strip()]
    return ', '.join(token + symbol for token in tokens)


def generate():
    """Label every sentence of one MIMIC report split with the LLM.

    Steps:
      1. Load the causal LM and tokenizer named by ``--model_name_or_path``
         (fp16, moved to CUDA).
      2. Read ``mimic_report_split_{split_id}.csv``, split each
         ``'Report Impression'`` into sentences and save the stringified
         sentence lists to ``cut_report_part_{split_id}.csv``.
      3. For every report, query the model with ``prompt_2_pro_data`` per
         sentence, suffix each returned label with ``-`` when the sentence
         contains the word "no" (any case) and ``+`` otherwise, pad the
         row with ``'0'`` to 59 columns and append it to
         ``sent_label_part_{split_id}.csv``.

    Reports already present in the label file (counted by rows) are
    skipped, so an interrupted run can be resumed.

    Raises:
        ValueError: If ``--model_name_or_path`` was not supplied.
    """
    args = parse_args()
    if not args.model_name_or_path:
        raise ValueError("--model_name_or_path must be provided")

    model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, trust_remote_code=True, torch_dtype=torch.float16)
    model.cuda()
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path,
        model_max_length=1500,
        use_fast=False,
        trust_remote_code=True
    )

    report_path = f'/mnt/nvme_share/wuwl/project/CARZero-main/Dataset/MIMIC/mimic_report_split_{args.split_id}.csv'
    mimic_report = pd.read_csv(report_path)
    impressions = mimic_report['Report Impression'].values.tolist()

    csv_rows = []              # one single-cell row per report for the CSV dump
    sentences_per_report = []  # the actual sentence lists used for labelling
    for impression in impressions:
        # Missing impressions are read by pandas as NaN (a float).
        if isinstance(impression, str):
            sentences = split_sentences(impression)
            csv_rows.append([str(sentences)])
            sentences_per_report.append(sentences)
        else:
            csv_rows.append([])
            sentences_per_report.append([])

    cut_path = f'/mnt/nvme_share/wuwl/project/CARZero-main/Dataset/MIMIC_new/cut_report_part_{args.split_id}.csv'
    df = pd.DataFrame(csv_rows, columns=['sent'])
    df.to_csv(cut_path, index=False, header=False)

    file_path = f'/mnt/nvme_share/wuwl/project/CARZero-main/Dataset/MIMIC_new/sent_label_part_{args.split_id}.csv'
    # Number of reports finished by a previous run.
    cache = _count_csv_rows(file_path)

    for idx, sentences in enumerate(tqdm(sentences_per_report)):
        if idx < cache:
            continue
        label_sent = []
        for sentence in sentences:
            symbol = '-' if _NEGATION_RE.search(sentence) else '+'
            raw_output = inference(prompt_2_pro_data(sentence), model, tokenizer)
            label_sent.append(_format_labels(raw_output, symbol))
        # Pad short rows; a non-positive count leaves the row unchanged.
        label_sent.extend(['0'] * (_LABEL_ROW_WIDTH - len(label_sent)))
        append_to_csv(file_path, label_sent)


# Script entry point: run the labelling pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    generate()
