# Locate, for every QA question under the new_verify_data folder, its (sample_idx, question_idx) position in the original QA files.

import json
import os


def read_json_file(file_path):
    """Load a JSON file and return the parsed Python object.

    The file is opened with an explicit UTF-8 encoding so that non-ASCII
    text is decoded the same way regardless of the platform's default
    locale encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` (dict, list, str, number, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def find_mapping_indices(verify_questions, original_qa_list, dataset_name):
    """Locate each verification question inside the original QA collection.

    Args:
        verify_questions: Sequence of items, each read through its
            ``"Question"`` key (the questions to look up).
        original_qa_list: The original QA data. For ``"tcelongbench"`` it is
            iterated with ``.items()`` (sample key -> list of QA entries);
            for any other dataset it is iterated as a sequence of samples,
            where a sample is either a list of QA entries or a mapping with
            a ``"questions"`` list.
        dataset_name: Dataset identifier; only ``"tcelongbench"`` selects the
            mapping-style traversal.

    Returns:
        dict: Maps the position of a question in ``verify_questions`` to the
        ``(sample_idx, question_idx)`` tuple of its first match. Questions
        without a match are left out. For ``"tcelongbench"`` the sample index
        is the mapping key exactly as stored; otherwise it is the integer
        position of the sample.
    """

    def locate(target):
        # Return the (sample, question) position of the first entry whose
        # question text equals `target`, or None when nothing matches.
        if dataset_name == "tcelongbench":
            for sample_key, entries in original_qa_list.items():
                for position, entry in enumerate(entries):
                    if "Question" in entry and entry["Question"] == target:
                        # The sample key is returned untouched (no conversion).
                        return sample_key, position
            return None

        for sample_pos, sample in enumerate(original_qa_list):
            if isinstance(sample, list):
                # List-shaped sample: entries carry the text under "Question".
                entries, text_key = sample, "Question"
            elif "questions" in sample:
                # Mapping-shaped sample: entries carry the text under "text".
                entries, text_key = sample["questions"], "text"
            else:
                # Unknown sample shape: nothing to search in here.
                continue

            for position, entry in enumerate(entries):
                if text_key in entry and entry[text_key] == target:
                    return sample_pos, position
        return None

    mapping = {}
    for verify_idx, verify_item in enumerate(verify_questions):
        found = locate(verify_item["Question"])
        if found is not None:
            mapping[verify_idx] = found
    return mapping


def process_dataset(dataset_name, sample_indices, base_dir="/home/weishaohang/workspace/Omni-Temp"):
    """Build and save the verify-to-original index mapping for one dataset.

    Every ``*.json`` file in
    ``<base_dir>/new_verify_data/extracted_QAs/<dataset_name>`` is paired with
    the file of the same name in ``<base_dir>/QAs_<dataset_name>``. The pair is
    passed to ``find_mapping_indices`` and the non-empty results are collected
    and written to
    ``<base_dir>/TIME-Lite/mapping_idx_from_TIME/<dataset_name>_verify_to_original_mapping.json``.

    Args:
        dataset_name: Dataset identifier used to build the directory names
            and forwarded to ``find_mapping_indices``.
        sample_indices: Not used by this function; kept so existing callers
            keep working.
        base_dir: Root directory containing the input and output folders.

    Returns:
        dict: Maps each verification filename to the mapping returned by
        ``find_mapping_indices``. Files without a same-named original QA file,
        or whose mapping is empty, are omitted (a warning is printed instead).

    Raises:
        OSError: If the verification directory cannot be listed or a file
            cannot be read/written.
    """
    verify_path = os.path.join(base_dir, "new_verify_data/extracted_QAs", dataset_name)
    qa_path = os.path.join(base_dir, f"QAs_{dataset_name}")

    mappings = {}
    # Sorted so the key order of the saved mapping is reproducible;
    # os.listdir returns entries in arbitrary order.
    for verify_filename in sorted(os.listdir(verify_path)):
        if not verify_filename.endswith(".json"):
            continue

        # Only the original QA file with the same name is relevant, so look it
        # up directly instead of loading every file of the QA directory.
        qa_file_path = os.path.join(qa_path, verify_filename)
        if not os.path.isfile(qa_file_path):
            # Report the skipped file rather than dropping it silently.
            print(f"警告: 在{qa_path}中没有找到与{verify_filename}同名的原始QA文件")
            continue

        verify_data = read_json_file(os.path.join(verify_path, verify_filename))
        qa_data = read_json_file(qa_file_path)

        mapping = find_mapping_indices(verify_data, qa_data, dataset_name)
        if mapping:  # keep only non-empty mappings
            mappings[verify_filename] = mapping
        else:
            print(f"警告: 在处理{dataset_name}的{verify_filename}时没有找到匹配项")

    # Persist the result; create the output directory if it is missing so the
    # computed mapping is not lost to a FileNotFoundError.
    output_path = os.path.join(base_dir, f"TIME-Lite/mapping_idx_from_TIME/{dataset_name}_verify_to_original_mapping.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(mappings, f, indent=4)

    print(f"已处理完 {dataset_name} 数据集，映射结果保存至 {output_path}")

    return mappings


def main():
    """Run the mapping for all three datasets and save a combined result.

    Steps:
      1. Read the ``<dataset>_sample_indices.json`` file of each dataset.
      2. Call ``process_dataset`` for wikidata, tcelongbench and long_dialog
         (in that order); each returned index is a
         ``(sample_idx, question_idx)`` pair.
      3. Write the per-dataset mappings, keyed by dataset name, to
         ``all_verify_to_original_mappings.json``.
    """
    index_dir = "/home/weishaohang/workspace/Omni-Temp/TIME-Lite/mapping_idx_from_TIME"
    dataset_names = ("wikidata", "tcelongbench", "long_dialog")

    # Load all sample-index files up front, before any dataset is processed.
    sample_indices = {
        name: read_json_file(f"{index_dir}/{name}_sample_indices.json")
        for name in dataset_names
    }

    # Process the datasets one by one, collecting results under their names.
    all_mappings = {}
    for name in dataset_names:
        print(f"处理 {name} 数据集...")
        all_mappings[name] = process_dataset(name, sample_indices[name])

    # Save the combined mapping of all datasets.
    output_path = f"{index_dir}/all_verify_to_original_mappings.json"
    with open(output_path, 'w') as out_file:
        json.dump(all_mappings, out_file, indent=4)

    print(f"所有映射结果已保存至 {output_path}")


# Run the mapping pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()