import os 
import json
import calendar  # 添加calendar模块导入
import torch
import tiktoken
from model import *
from utils import *
from tqdm import tqdm
import yaml
import argparse
from pathlib import Path
import logging
import datetime
import sys
import random
import numpy as np
import re
# 添加RAG所需的库
from rank_bm25 import BM25Okapi
import openai

# Retrieval back-ends that can be selected for the RAG setting.
RETRIEVER_TYPE_OPTIONS = ["bm25", "vector", "hybrid"]
# BM25 is the retriever used when no other choice is made.
DEFAULT_RETRIEVER_TYPE = "bm25"
# Module-level selection; main() rebinds it through a ``global`` statement.
retriever_type = DEFAULT_RETRIEVER_TYPE

# Locations of the context data, all below one shared root directory.
_CONTEXTS_ROOT = "/home/weishaohang/workspace/Omni-Temp/TIME-Lite/contexts"
TCELONGBENCH_CHUNKS_BASE_DIR = f"{_CONTEXTS_ROOT}/tcelongbench"
LONG_DIALOG_CONTEXT_PATH = f"{_CONTEXTS_ROOT}/long_dialog/long_dialog_sample_context_list.json"
WIKIDATA_CONTEXT_PATH = f"{_CONTEXTS_ROOT}/wikidata/wikidata_sample_context_list.json"

######################## logging setup ########################
def setup_logger(model_name, setting, data_source):
    """Configure the root logger for one evaluation run and return it.

    A timestamped log file named after the model, setting and data source is
    created in the project's ``logs`` directory. Records of level INFO and
    above go to that file; only WARNING and above are echoed to stdout.

    Args:
        model_name: Model identifier used in the log file name and banner.
        setting: Evaluation setting used in the log file name and banner.
        data_source: Data source used in the log file name and banner.

    Returns:
        The configured root ``logging.Logger``.
    """
    # Make sure the log directory exists.
    logs_dir = Path("/home/weishaohang/workspace/Omni-Temp/logs")
    os.makedirs(logs_dir, exist_ok=True)

    # Timestamped file name. Path separators inside any component (for
    # example a model name such as "org/model") would otherwise point into a
    # sub-directory that does not exist, so they are flattened to "_".
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    log_filename = f"{model_name}_{setting}_{data_source}_{timestamp}.log"
    log_filename = log_filename.replace("/", "_").replace(os.sep, "_")
    log_filepath = logs_dir / log_filename

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Detach handlers from any earlier configuration and close them so that
    # repeated calls neither duplicate output nor leak open log files.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

    # File handler: full INFO-level record of the run.
    file_handler = logging.FileHandler(log_filepath, encoding='utf-8')
    file_handler.setLevel(logging.INFO)

    # Console handler: WARNING and above only, so INFO records stay out of
    # the terminal.
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.WARNING)

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    # Opening banner of the log file.
    logging.info(f"开始评估 - 模型: {model_name}, 设定: {setting}, 数据源: {data_source}")
    logging.info(f"日志文件保存在: {log_filepath}")

    return logger

######################## preliminaries and prompt paths ########################
# Aliases for the prompt-template families that occur repeatedly below.
_SINGLE_CHOICE = "single_choice_qa"
_MULTI_CHOICE = "multi_choice_qa"
_FREE_FORM = "free_form_qa"
_TIME_EXPRESSION = "free_form_qa_for_time_expression"
_REFUSAL = "free_form_qa_with_refusal"
_SINGLE_CHOICE_FALSE_PREMISE = "single_choice_qa_for_false_premise"

# data source -> question type -> prompt-template family.
# Entries tagged "TODO" keep an open marker from the original table; what is
# still to be done for them is not recorded in this file.
mapping_source_and_qtype_to_prompt_type = {
    "wikidata": {
        "L1_1": _MULTI_CHOICE,
        "L1_2": _TIME_EXPRESSION,
        "L1_3": _FREE_FORM,  # TODO
        "L1_4": _SINGLE_CHOICE,
        "L1_5": _SINGLE_CHOICE,
        "L2_1": _FREE_FORM,  # TODO
        "L2_2": _FREE_FORM,  # TODO
        "L2_3": _REFUSAL,
        "L3_1": _REFUSAL,
        "L3_2": _FREE_FORM,  # TODO
        "L3_3": "single_choice_qa_for_forecast",
        "L3_4": "free_form_qa_for_false_premise",
    },
    "tcelongbench": {
        "L1_2": _TIME_EXPRESSION,  # TODO
        "L1_3": _FREE_FORM,  # TODO
        "L1_4": _SINGLE_CHOICE,
        "L1_5": _SINGLE_CHOICE,
        "L2_1_multi_choice": _SINGLE_CHOICE,
        "L2_2_multi_choice": _SINGLE_CHOICE,
        "L2_3_multi_choice": _SINGLE_CHOICE,
        "L3_1_multi_choice": _SINGLE_CHOICE,
        "L3_2": _FREE_FORM,  # TODO
        "L3_4_multi_choice": _SINGLE_CHOICE_FALSE_PREMISE,
    },
    "long_dialog": {
        "L1_1": _MULTI_CHOICE,
        "L1_2": _TIME_EXPRESSION,  # TODO
        "L1_3": _FREE_FORM,  # TODO
        "L1_4": _SINGLE_CHOICE,
        "L1_5": _SINGLE_CHOICE,
        "L2_1": _SINGLE_CHOICE,
        "L2_2_multi_choice": _SINGLE_CHOICE,
        "L2_3_multi_choice": _SINGLE_CHOICE,
        "L3_1_multi_choice": _SINGLE_CHOICE,
        "L3_2": _FREE_FORM,  # TODO
        "L3_4_multi_choice": _SINGLE_CHOICE_FALSE_PREMISE,
    },
}

# Prompt templates are stored as <root>/<setting directory>/<prompt type>.txt.
_PROMPT_ROOT = "/home/weishaohang/workspace/Omni-Temp/prompts/evaluation"
_PROMPT_DIR_BY_SETTING = {
    "base": "with_context_zero_shot",  # base setting: context provided, zero-shot
    "RAG": "RAG",
}
_PROMPT_TYPES = (
    "multi_choice_qa",
    "free_form_qa",
    "free_form_qa_with_refusal",
    "free_form_qa_for_time_expression",
    "multi_choice_qa_for_forecast",
    "free_form_qa_for_false_premise",
    "multi_choice_qa_for_false_premise",
    "single_choice_qa",
    "single_choice_qa_for_false_premise",
    "single_choice_qa_for_forecast",
)

# setting -> prompt type -> template file path.
mapping_setting_prompt_type_to_prompt_path = {
    setting_name: {
        prompt_type: f"{_PROMPT_ROOT}/{setting_dir}/{prompt_type}.txt"
        for prompt_type in _PROMPT_TYPES
    }
    for setting_name, setting_dir in _PROMPT_DIR_BY_SETTING.items()
}

# QA files are stored as <root>/<data source>/<level>_QAs[_multi_choice].json.
_QA_ROOT = "/home/weishaohang/workspace/Omni-Temp/TIME-Lite/QAs"
_MULTI_CHOICE_TAG = "_multi_choice"


def _qa_json_path(source, qtype):
    """Return the QA JSON path of ``qtype`` for ``source``."""
    if qtype.endswith(_MULTI_CHOICE_TAG):
        # "L2_1_multi_choice" is stored as "L2_1_QAs_multi_choice.json".
        level = qtype[: -len(_MULTI_CHOICE_TAG)]
        return f"{_QA_ROOT}/{source}/{level}_QAs{_MULTI_CHOICE_TAG}.json"
    return f"{_QA_ROOT}/{source}/{qtype}_QAs.json"


# Question types registered per data source, in table order.
_QA_QTYPES_BY_SOURCE = {
    "wikidata": (
        "L1_1", "L1_2", "L1_3", "L1_4", "L1_5",
        "L2_1", "L2_2", "L2_3",
        "L3_1", "L3_2", "L3_3", "L3_4",
    ),
    "tcelongbench": (
        "L1_2", "L1_3", "L1_4", "L1_5",
        "L3_2",
        "L2_1_multi_choice", "L2_2_multi_choice", "L2_3_multi_choice",
        "L3_1_multi_choice", "L3_4_multi_choice",
    ),
    "long_dialog": (
        "L1_1", "L1_2", "L1_3", "L1_4", "L1_5",
        "L2_1",
        "L3_2",
        "L2_2_multi_choice", "L2_3_multi_choice",
        "L3_1_multi_choice", "L3_4_multi_choice",
    ),
}

# data source -> question type -> QA JSON file path.
mapping_source_and_qtype_to_json_path = {
    source: {qtype: _qa_json_path(source, qtype) for qtype in qtypes}
    for source, qtypes in _QA_QTYPES_BY_SOURCE.items()
}

# Data sources and evaluation settings accepted by the script.
data_source_list = ["wikidata", "tcelongbench", "long_dialog"]

setting_list = ["base", "RAG"]

# Question types evaluated for each data source, in evaluation order.
mapping_source_to_qtype_list = {
    "wikidata": [
        "L1_1", "L1_2", "L1_3", "L1_4", "L1_5",
        "L2_1", "L2_2", "L2_3",
        "L3_1", "L3_2", "L3_3", "L3_4",
    ],
    "tcelongbench": [
        "L1_2", "L1_3", "L1_4", "L1_5",
        "L2_1_multi_choice", "L2_2_multi_choice", "L2_3_multi_choice",
        "L3_1_multi_choice", "L3_2", "L3_4_multi_choice",
    ],
    "long_dialog": [
        "L1_1", "L1_2", "L1_3", "L1_4", "L1_5",
        "L2_1", "L2_2_multi_choice", "L2_3_multi_choice",
        "L3_1_multi_choice", "L3_2", "L3_4_multi_choice",
    ],
}

# Every data source is registered with both settings; each one gets its own
# list object.
mapping_source_to_setting_list = {
    source: list(setting_list) for source in data_source_list
}

######################## load and dump data ########################
def load_prompt(setting, data_source, qtype):
    """Return the prompt template text for a setting / data source / question type.

    The question type is first mapped to a prompt type, which together with
    ``setting`` selects the template file that is read as UTF-8 text.
    """
    prompt_path = mapping_setting_prompt_type_to_prompt_path[setting][
        mapping_source_and_qtype_to_prompt_type[data_source][qtype]
    ]
    template_text = Path(prompt_path).read_text(encoding="utf-8")
    logging.debug(f"加载提示模板: {prompt_path}")
    return template_text

def load_qa_data(data_source, qtype):
    """Load the QA records registered for ``data_source`` / ``qtype``.

    Args:
        data_source: Key of ``mapping_source_and_qtype_to_json_path``.
        qtype: Question-type key registered for that data source.

    Returns:
        The parsed content of the QA JSON file.

    Raises:
        KeyError: If the data source or question type is not registered.
        OSError: If the QA file cannot be opened.
    """
    file_path = mapping_source_and_qtype_to_json_path[data_source][qtype]
    logging.info(f"加载QA数据: {file_path}")
    # Use a context manager so the file handle is closed right after parsing.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

def check_if_results_exist(model, setting, data_source, qtype):
    """Report whether a result file already exists for this model / setting / source / qtype.

    For the "RAG" setting the path carries an extra sub-directory named after
    the module-level ``retriever_type``; every other setting uses the plain
    layout.
    """
    path = (
        f"/home/weishaohang/workspace/Omni-Temp/results_time_lite/{model.model_name}/{setting}/{retriever_type}/{data_source}/{qtype}.json"
        if setting == "RAG"
        else f"/home/weishaohang/workspace/Omni-Temp/results_time_lite/{model.model_name}/{setting}/{data_source}/{qtype}.json"
    )

    logging.info(f"检查结果是否存在: {path}")
    return os.path.exists(path)

def dump_qa_data(results, model, setting, data_source, qtype):
    """Write ``results`` as indented UTF-8 JSON to the result file of this run.

    The target path follows the same layout as the existence check: "RAG"
    results go into an extra sub-directory named after the module-level
    ``retriever_type``. Missing parent directories are created.
    """
    logging.info(f"保存 {data_source} {qtype} {setting} {model.model_name} 的QA结果...")

    path = (
        f"/home/weishaohang/workspace/Omni-Temp/results_time_lite/{model.model_name}/{setting}/{retriever_type}/{data_source}/{qtype}.json"
        if setting == "RAG"
        else f"/home/weishaohang/workspace/Omni-Temp/results_time_lite/{model.model_name}/{setting}/{data_source}/{qtype}.json"
    )

    # Create the (possibly nested) target directory before writing.
    target_dir = os.path.dirname(path)
    os.makedirs(target_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

    logging.info(f"结果已保存到: {path}")
    print(f"Dumped {data_source} {qtype} {setting} {model.model_name} qa data to {path}")

def make_prompt(setting, data_source, qtype, qa_idx, question):
    """Build the prompt for one question.

    The template is loaded first, then the context for ``qa_idx`` is fetched,
    and both the context and ``question`` are substituted into the template's
    ``{context}`` and ``{question}`` fields.
    """
    template = load_prompt(setting, data_source, qtype)
    return template.format(
        context=get_context(setting, data_source, qtype, qa_idx),
        question=question,
    )

# Parsed JSON files keyed by path. get_context() is called once per question,
# so without this cache the same QA / context / chunk files would be re-read
# and re-parsed for every single question.
_CONTEXT_JSON_CACHE = {}


def _load_json_once(path):
    """Return the parsed JSON content of ``path``, reading the file at most once."""
    if path not in _CONTEXT_JSON_CACHE:
        with open(path, "r", encoding="utf-8") as f:
            _CONTEXT_JSON_CACHE[path] = json.load(f)
    return _CONTEXT_JSON_CACHE[path]


def get_context(setting, data_source, qtype, qa_idx):
    """Return the context to insert into the prompt for one question.

    For "tcelongbench" the context is built from the retrieved chunks stored
    under ``TCELONGBENCH_CHUNKS_BASE_DIR/<retriever_type>/`` in a file with
    the same base name as the QA file; each chunk is rendered with its
    title, date and content. For every other data source the QA record's
    "Index" field selects one entry from the long-dialog or wikidata
    context file.

    Args:
        setting: Evaluation setting; must be "RAG" for tcelongbench and
            "base" for every other data source.
        data_source: Name of the data source.
        qtype: Question-type key used to locate the QA file.
        qa_idx: Index (or key) of the question inside the QA / chunk file.

    Returns:
        The context value to pass to the prompt template.

    Raises:
        ValueError: If ``setting`` is not the one supported for ``data_source``.
    """
    if data_source == "tcelongbench":
        if setting != "RAG":
            raise ValueError("TCELongBench的context仅在RAG设定下有效")
        question_path = mapping_source_and_qtype_to_json_path[data_source][qtype]
        # The chunk file shares the QA file's base name.
        chunks_path = os.path.join(
            TCELONGBENCH_CHUNKS_BASE_DIR, retriever_type, os.path.basename(question_path)
        )
        chunk_list = _load_json_once(chunks_path)[qa_idx]
        return "\n\n".join(
            f"[{i}] Title: {chunk['title']}, Day: {chunk['date']}\nContent: {chunk['content']}"
            for i, chunk in enumerate(chunk_list, start=1)
        )

    if setting != "base":
        raise ValueError(f"{data_source}的context仅在base设定下有效")
    question_path = mapping_source_and_qtype_to_json_path[data_source][qtype]
    context_index = _load_json_once(question_path)[qa_idx]["Index"]
    context_path = LONG_DIALOG_CONTEXT_PATH if 'long_dialog' in data_source else WIKIDATA_CONTEXT_PATH
    return _load_json_once(context_path)[context_index]


######################## eval ########################
# NOTE 评估tcelongbench时，需要剔除最后一天的articles

def gen_pred_results(model, setting, data_source, qtype):
    """Generate and save predictions for every QA item of one question type.

    All prompts are built first, then handed to ``model.generate`` in a
    single call. Each prediction is stored together with its question and
    gold answer, and the collected results are written with ``dump_qa_data``.

    Returns:
        The list of prompts sent to the model, in QA order (the caller uses
        it for token statistics).
    """
    logging.info(f"开始生成 {data_source} {qtype} {setting} {model.model_name} 的预测结果...")
    print(f"\n\nGenerating predictions for {data_source} {qtype} {setting} {model.model_name} qa data...")

    QAs = load_qa_data(data_source, qtype)

    # QA files may be keyed (dict) or positional (list).
    if isinstance(QAs, dict):
        idx_list = list(QAs.keys())
    else:
        idx_list = range(len(QAs))

    # Parallel lists: one prompt / question / gold answer per QA item.
    prompts, questions, gold_answers = [], [], []
    progress = tqdm(idx_list, desc=f"Collecting prompts for {data_source}/{qtype}/{setting}/{model.model_name}", total=len(idx_list))
    for qa_idx in progress:
        record = QAs[qa_idx]
        question = record["Question"]
        gold_answer = record["Gold Answer"]

        prompts.append(make_prompt(setting, data_source, qtype, qa_idx, question))
        questions.append(question)
        gold_answers.append(gold_answer)

    # One call for the whole batch of prompts.
    predictions = model.generate(prompts)

    # One result slot per QA item; each slot receives a single record.
    results = [[] for _ in idx_list]
    for slot, pred_answer, question, gold_answer in zip(results, predictions, questions, gold_answers):
        slot.append({
            "Question": question,
            "Gold Answer": gold_answer,
            "Pred Answer": pred_answer,
        })

    dump_qa_data(results, model, setting, data_source, qtype)
    logging.info(f"完成 {data_source} {qtype} {setting} {model.model_name} 的预测结果生成")

    return prompts


# 生成所有QAs的预测结果
def gen_pred_results_for_all_qa_data(model, setting, data_source):
    """Generate predictions for every question type of ``data_source``.

    Question types whose result file already exists are skipped. The prompts
    of all newly evaluated question types are collected and their total
    token count (``cl100k_base`` encoding) is logged and printed.
    """
    logging.info(f"开始为 {data_source} {setting} {model.model_name} 生成所有QA类型的预测结果...")

    all_prompts_combined = []

    for qtype in mapping_source_to_qtype_list[data_source]:
        if check_if_results_exist(model, setting, data_source, qtype):
            logging.info(f"结果已存在，跳过评估: {data_source} {qtype} {setting} {model.model_name}")
            continue

        # Empty the CUDA cache before each question type.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        all_prompts_combined += gen_pred_results(model, setting, data_source, qtype)

    # Token statistics are only produced when something was actually run.
    if all_prompts_combined:
        joined_prompts = "\n".join(all_prompts_combined)
        tokenizer = tiktoken.get_encoding("cl100k_base")
        token_count = len(tokenizer.encode(joined_prompts))

        logging.info(f"所有prompts的总token数量: {token_count}")
        print(f"\n总计统计了 {len(all_prompts_combined)} 个prompts，总token数量: {token_count}")

    logging.info(f"完成 {data_source} {setting} {model.model_name} 所有QA类型的预测结果生成")


def main():
    """Parse the command line, resolve the run configuration and run the evaluation.

    Steps: load the YAML config, apply command-line overrides, resolve the
    retriever type (command line first, then config file, then the module
    default), validate setting and data source, configure logging, create
    the result directory, save the effective config there, and generate
    predictions for every question type of the data source.

    Raises:
        FileNotFoundError: If the config file does not exist.
        ValueError: If required config keys are missing or the setting /
            data source combination is invalid.
    """
    global retriever_type
    global model

    parser = argparse.ArgumentParser(description='评估模型在不同数据集上的表现')
    parser.add_argument('--config', type=str, default='evaluation/config/eval.yaml',
                        help='配置文件路径')
    parser.add_argument('--model', type=str, default=None,
                        help='模型名称，会覆盖配置文件中的设置')
    parser.add_argument('--setting', type=str, default=None,
                        help='评估设定，会覆盖配置文件中的设置')
    parser.add_argument('--data_source', type=str, default=None,
                        help='数据源，会覆盖配置文件中的设置')
    # The default must be None: with a non-None default the flag always looks
    # "given" and the retriever type from the config file is never used.
    parser.add_argument('--retriever_type', type=str, default=None,
                        choices=RETRIEVER_TYPE_OPTIONS,
                        help='检索器类型: bm25(仅使用BM25), vector(仅使用向量检索), hybrid(混合使用)')
    args = parser.parse_args()

    config_path = Path(args.config)
    if not config_path.exists():
        raise FileNotFoundError(f"配置文件 {args.config} 不存在")

    # An empty YAML file parses to None; treat it as an empty mapping.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}

    # Command-line values override the config file.
    if args.model is not None:
        config["model"] = args.model
    if args.setting is not None:
        config["setting"] = args.setting
    if args.data_source is not None:
        config["data_source"] = args.data_source

    # Fail early with a clear message instead of a bare KeyError later on.
    missing_keys = [key for key in ("model", "setting", "data_source") if key not in config]
    if missing_keys:
        raise ValueError(f"配置缺少必需的键: {missing_keys}")

    # Retriever type precedence: command line > config file > module default.
    if args.retriever_type is not None:
        retriever_type = args.retriever_type
    elif config.get("retriever_type") in RETRIEVER_TYPE_OPTIONS:
        retriever_type = config["retriever_type"]
    config["retriever_type"] = retriever_type

    if config["setting"] not in setting_list:
        raise ValueError(f"无效的设定: {config['setting']}，可用选项: {setting_list}")
    if config["data_source"] not in data_source_list:
        raise ValueError(f"无效的数据源: {config['data_source']}，可用选项: {data_source_list}")
    if config["setting"] not in mapping_source_to_setting_list[config["data_source"]]:
        raise ValueError(f"数据源 {config['data_source']} 不支持设定 {config['setting']}")

    setup_logger(config["model"], config["setting"], config["data_source"])

    # Logged only after setup_logger() so the records reach the log file.
    logging.info(f"使用检索器类型: {retriever_type}")
    logging.info(f"Chunks将从以下目录读取: {os.path.join(TCELONGBENCH_CHUNKS_BASE_DIR, retriever_type)}")

    logging.info("评估配置信息:")
    logging.info(f"  - 配置文件: {args.config}")
    logging.info(f"  - 模型: {config['model']}")
    logging.info(f"  - 设定: {config['setting']}")
    logging.info(f"  - 数据源: {config['data_source']}")

    print(f"正在使用以下配置进行评估:")
    print(f"  - 配置文件: {args.config}")
    print(f"  - 模型: {config['model']}")
    print(f"  - 设定: {config['setting']}")
    print(f"  - 数据源: {config['data_source']}")

    # The model is kept as a module-level global, as before.
    model = MODEL(config["model"])
    setting = config["setting"]
    data_source = config["data_source"]

    random.seed(42)
    np.random.seed(42)
    logging.info(f"设置随机种子: 42")

    # RAG results live in an extra sub-directory named after the retriever.
    if setting == "RAG":
        result_dir = Path(f"/home/weishaohang/workspace/Omni-Temp/results_time_lite/{model.model_name}/{setting}/{retriever_type}/{data_source}")
    else:
        result_dir = Path(f"/home/weishaohang/workspace/Omni-Temp/results_time_lite/{model.model_name}/{setting}/{data_source}")

    os.makedirs(result_dir, exist_ok=True)
    logging.info(f"创建结果目录: {result_dir}")

    # Keep a copy of the effective configuration next to the results.
    with open(result_dir / "config.yaml", "w", encoding="utf-8") as f:
        yaml.dump(config, f, default_flow_style=False, allow_unicode=True)
    logging.info(f"配置已保存到: {result_dir / 'config.yaml'}")

    try:
        gen_pred_results_for_all_qa_data(model, setting, data_source)
        logging.info("评估完成")
    except Exception as e:
        # Record the traceback in the log file, then let the error propagate.
        logging.error(f"评估过程中发生错误: {str(e)}", exc_info=True)
        raise

if __name__ == "__main__":
    main()