import pandas as pd
import numpy as np
import os
import re
import ast
import tqdm

def merge_csv_files(directory, input_file, output_file, num_parts=12):
    """Concatenate numbered CSV parts into a single header-less CSV file.

    Looks for ``{input_file}_1.csv`` through ``{input_file}_{num_parts}.csv``
    inside ``directory``. Every part that exists is read without a header row;
    the parts are stacked in ascending part number and written to
    ``directory/output_file`` without index or header. Missing parts are
    reported on stdout and skipped.

    Args:
        directory: Folder that holds the parts; the merged file is written
            into the same folder.
        input_file: Common file-name prefix of the parts.
        output_file: File name of the merged CSV.
        num_parts: Highest part number to look for (parts are numbered from 1).
    """
    frames = []

    # Walk the numbered part files in order.
    for i in range(1, num_parts + 1):
        file_name = f"{input_file}_{i}.csv"
        file_path = os.path.join(directory, file_name)

        if os.path.exists(file_path):
            frames.append(pd.read_csv(file_path, header=None))
        else:
            print(f"文件 {file_path} 不存在，跳过合并。")

    # Concatenate once instead of growing a DataFrame inside the loop: this
    # avoids re-copying the accumulated rows on every iteration and avoids
    # concatenating with an empty placeholder frame. pd.concat rejects an
    # empty list, so fall back to an empty frame when no part was found.
    if frames:
        merged_df = pd.concat(frames, ignore_index=True)
    else:
        merged_df = pd.DataFrame()

    # Save the merged rows as a new CSV file.
    file_path_output = os.path.join(directory, output_file)
    merged_df.to_csv(file_path_output, index=False, header=False)
    print(f"合并后的文件已保存为 {file_path_output}")

def parse_string_to_list(string):
    """Parse the textual form of a Python list into a real list.

    Single-quoted items are first rewritten to double-quoted ones so that an
    item containing an apostrophe (``['the patient's heart']``) still parses.
    If that rewritten text cannot be evaluated, the untouched text is tried as
    well, which covers items that contain double quotes themselves.

    Args:
        string: Text expected to hold a list literal.

    Returns:
        The parsed list, or ``None`` (after printing a diagnostic) when the
        input is not a string, cannot be parsed, or does not evaluate to a
        list.
    """
    # Non-string cells (e.g. the NaN pandas produces for an empty CSV cell)
    # would make re.sub raise TypeError; report them like any other failure.
    if not isinstance(string, str):
        print(f"Error parsing string: expected str, got {type(string).__name__}")
        return None

    normalized = re.sub(r"(?<=\[|,)\s*'(.*?)'\s*(?=,|\])", r'"\1"', string)

    # Try the normalized text first; the raw text is only a fallback, so any
    # input that parsed before still yields the same result.
    candidates = [normalized]
    if string != normalized:
        candidates.append(string)

    first_error = None
    for candidate in candidates:
        try:
            result = ast.literal_eval(candidate)
        except (SyntaxError, ValueError) as e:
            error = e
        else:
            if isinstance(result, list):
                return result
            error = ValueError("The parsed result is not a list.")
        if first_error is None:
            first_error = error

    print(f"Error parsing string: {first_error}")
    return None


# merge_csv_files('/mnt/nvme_share/wuwl/project/CARZero-main/Dataset/MIMIC_new/', 'LLM_cut_report_part', 'LLM_cut_report_temp.csv')
# merge_csv_files('/mnt/nvme_share/wuwl/project/CARZero-main/Dataset/MIMIC_new/', 'LLM_sent_label_part', 'LLM_sent_label_temp.csv')

# Only this many leading columns of each original label row are read.
_ORIGINAL_LABEL_LIMIT = 50
# Every merged label row is right-padded with '0' up to this length.
_PADDED_LABEL_WIDTH = 100


def _read_rows(path):
    """Read a header-less CSV and return its rows as a list of lists."""
    return pd.read_csv(path, header=None).reset_index(drop=True).values.tolist()


def _parse_sentence_rows(rows, desc):
    """Parse the first cell of every row with parse_string_to_list (None on failure)."""
    return [parse_string_to_list(row[0]) for row in tqdm.tqdm(rows, desc=desc)]


def _strip_zero_labels(rows, desc, limit=None):
    """Drop the 0 / '0' entries from the first ``limit`` cells of every row (all cells if None)."""
    return [
        [element for element in row[:limit] if element != '0' and element != 0]
        for row in tqdm.tqdm(rows, desc=desc)
    ]


# --- Original sentences and labels ---
sent_original_path = '/mnt/nvme_share/wuwl/project/CARZero-main/Dataset/MIMIC_temp/cut_report_part_final.csv'
label_original_path = '/mnt/nvme_share/wuwl/project/CARZero-main/Dataset/MIMIC_temp/sent_label_part_final.csv'
sent_original = _read_rows(sent_original_path)
label_original = _read_rows(label_original_path)
sent_original_list = _parse_sentence_rows(sent_original, 'Loading chopped sentences')
label_original_list = _strip_zero_labels(
    label_original, 'Loading sentences labels', limit=_ORIGINAL_LABEL_LIMIT
)


# --- LLM-generated sentences and labels ---
sent_llm_path = '/mnt/nvme_share/wuwl/project/CARZero-main/Dataset/MIMIC_new/LLM_cut_report_temp.csv'
label_llm_path = '/mnt/nvme_share/wuwl/project/CARZero-main/Dataset/MIMIC_new/LLM_sent_label_temp.csv'
sent_llm = _read_rows(sent_llm_path)
label_llm = _read_rows(label_llm_path)
sent_llm_list = _parse_sentence_rows(sent_llm, 'Loading chopped sentences')
label_llm_list = _strip_zero_labels(label_llm, 'Loading sentences labels')


# Rows are matched by position, so the lists that are indexed for every row
# must be at least as long as the original sentence list. Fail up front with
# a clear message instead of an IndexError in the middle of the merge loop.
row_count = len(sent_original_list)
if len(label_original_list) < row_count or len(sent_llm_list) < row_count:
    raise ValueError(
        f"Row count mismatch: {row_count} original sentence rows, "
        f"{len(label_original_list)} original label rows, "
        f"{len(sent_llm_list)} LLM sentence rows."
    )

# --- Merge row by row ---
sent_all_list = []
label_all_list = []
for i in tqdm.tqdm(range(row_count)):
    sent_temp = sent_original_list[i]
    # Copy so that padding below does not mutate the loaded label rows.
    label_temp = list(label_original_list[i])
    # The LLM part is appended only when its sentence cell parsed to a list;
    # otherwise the row keeps the original sentences and labels alone.
    if isinstance(sent_llm_list[i], list):
        # NOTE(review): if the original sentence cell failed to parse
        # (sent_temp is None) this concatenation raises TypeError — confirm
        # whether such rows can occur and how they should be handled.
        sent_temp = sent_temp + sent_llm_list[i]
        label_temp += label_llm_list[i]
    sent_all_list.append(str(sent_temp))
    # Right-pad with '0'; a non-positive count extends by nothing, so longer
    # rows are left untouched.
    label_temp.extend(['0'] * (_PADDED_LABEL_WIDTH - len(label_temp)))
    label_all_list.append(label_temp)

pd.DataFrame(sent_all_list).to_csv('/mnt/nvme_share/wuwl/project/CARZero-main/Dataset/MIMIC_temp/LLM_cut_report_final.csv', index=False, header=False)
pd.DataFrame(label_all_list).to_csv('/mnt/nvme_share/wuwl/project/CARZero-main/Dataset/MIMIC_temp/LLM_sent_label_final.csv', index=False, header=False)
