import json
import os
from openai import OpenAI
import numpy as np

class ModelRouter:
    """Choose a model for a question by asking a routing LLM.

    The question and the candidate list are sent to a chat-completions
    endpoint (model ``deepseek-r1``) and the reply is interpreted as the name
    of one of the candidates.
    """

    def __init__(self, base_url, api_key):
        """Create the OpenAI-compatible client used for routing requests.

        Args:
            base_url: Base URL of the chat-completions endpoint.
            api_key: API key passed to the client.
        """
        self.client = OpenAI(
            base_url=base_url,
            api_key=api_key,
            # 18000 s (5 h) per the OpenAI client's timeout option; presumably
            # sized for slow reasoning models -- confirm before lowering.
            timeout=18000,
        )

    def select_model(self, question, available_models):
        """Ask the routing LLM which of ``available_models`` suits ``question``.

        Args:
            question: The user question to route.
            available_models: Candidate model names the router may pick from.

        Returns:
            The chosen entry of ``available_models``, or ``None`` when the
            question is missing/blank, the reply is empty, or the reply does
            not name one of the available models.
        """
        if not question or not question.strip():
            return None

        # NOTE: step 1 of the routing logic deliberately says "the question"
        # as plain words; the question text itself is already embedded
        # (JSON-escaped) in the first sentence and in the user message.
        prompt = f"""
You are a model router designed to select the most suitable AI model from a given list of models ({json.dumps(available_models)}) based on the input question ({json.dumps(question)}). The selection should be made by evaluating the model's strengths across six dimensions: coding, knowledge, math, reasoning, roleplay, and writing. Use the following performance insights to guide your decision:

- gpt4o: Strong in writing, with moderate general knowledge.
- Qwen2.5-7B-Instruct: Excels in reasoning and roleplay, with solid writing skills.
- gemma-3-4b-it: Good at roleplay and writing, with decent reasoning capabilities.
- Kimi-K2-Instruct: Stands out in coding and knowledge, with strong roleplay performance.
- Llama-3.1-8B-Instruct: Effective in writing, with reasonable knowledge and reasoning.
- GPT5: Highly capable in reasoning, knowledge, and math, with good overall balance.
- gpt-3.5-turbo-1106: Strong in writing and math, with moderate coding skills.
- claude-3.7-sonnet-thinking: Excels in knowledge, reasoning, and coding.
- o3mini: Outstanding in reasoning and coding, with strong writing skills.
- deepseek-r1: Strong in coding and writing, with good reasoning and roleplay.
- qwen2.5-72b-instruct: Balanced performance, particularly strong in math and writing.
- gemini-2.5-pro: Exceptional in math, with solid coding and reasoning.
- gemma-3-27b-it: Excels in coding and reasoning, with good writing skills.
- Qwen3-32B: Strong in math and reasoning, with balanced overall performance.
- claude-3.5-sonnet-20241022: Outstanding in roleplay and knowledge, with strong reasoning.
- QwQ-32B: Good in writing and math, with solid reasoning skills.
- llama3.3-70B-instruct: Excels in writing, with decent roleplay and reasoning.
- doubao-1-5-thinking-pro-250415: Exceptional in writing, with good roleplay skills.
- gemini-2.5-flash: Top performer in math, writing, and reasoning, with strong overall ability.

### Routing Logic:
1. Analyze the question to identify the primary focus (e.g., coding, knowledge, math, reasoning, roleplay, or writing). If the question involves multiple aspects, prioritize based on the most dominant theme.
2. Select the model with the strongest performance in the identified focus area. If multiple models are strong, consider their overall balance across dimensions.
3. If no clear focus is identifiable, choose the model with the broadest and strongest overall capabilities.
4. Return the selected model's name as the output.

### Output Format:
- Return only the model name (e.g., "gemini-2.5-flash") with no additional text or explanation.
"""

        messages = [
            {"role": "system", "content": prompt},
            {
                "role": "user",
                "content": f"Question: {question}, Available models: {json.dumps(available_models)}",
            },
        ]

        response = self.client.chat.completions.create(
            model="deepseek-r1",
            messages=messages,
            temperature=0.7,
            stream=False,
        )

        if not response.choices:
            return None
        return self._match_model(response.choices[0].message.content, available_models)

    @staticmethod
    def _match_model(reply, available_models):
        """Map the raw router reply onto one entry of ``available_models``.

        Returns ``None`` when the reply is empty or names no available model,
        so callers never receive free-form LLM text as a "model name".
        """
        if not reply:
            return None
        # If the reply carries a <think>...</think> block, keep only what
        # follows it; rpartition leaves the text untouched when there is none.
        answer = reply.rpartition("</think>")[2]
        # The prompt's own example is quoted, so the reply may be quoted too.
        answer = answer.strip().strip("\"'`").strip()
        if answer in available_models:
            return answer
        # Fall back to a case-insensitive match, returning the listed spelling.
        lowered = answer.lower()
        for name in available_models:
            if name.lower() == lowered:
                return name
        return None

def find_question_in_directory(question, directory):
    """Search ``directory`` recursively for a JSON record about ``question``.

    Every ``*.json`` file under ``directory`` is parsed.  A file may hold a
    single record (a JSON object) or a list of records; a record matches when
    it is a dict whose ``"question"`` value equals ``question``.

    Files that cannot be opened, decoded or parsed are skipped so that one bad
    file does not abort the whole search.  Directories and files are visited
    in sorted order, so the record returned is reproducible when several
    files contain the same question.

    Args:
        question: Question text to look for (exact equality).
        directory: Root directory of the search.

    Returns:
        The first matching record (a dict), or ``None`` if nothing matches.
    """
    for root, dirs, files in os.walk(directory):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith(".json"):
                continue
            path = os.path.join(root, file_name)
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, json.JSONDecodeError, UnicodeDecodeError):
                # Unreadable (permissions, dangling symlink) or malformed file.
                continue

            # Treat a lone object like a one-element list of records.
            records = data if isinstance(data, list) else [data]
            for record in records:
                if isinstance(record, dict) and record.get("question") == question:
                    return record
    return None

def evaluate_questions(questions_file, router, models, results_dirs=None,
                       human_judge_dir=None, max_questions=16):
    """Route each question to a model and score that model's stored judgement.

    For every question the router picks a model.  The stored result for that
    question is looked up under ``<results_dir>/<model>/`` and its
    ``"judge result"`` is compared with the human ``"label"`` found under
    ``human_judge_dir``.  Accuracy, mean total tokens and mean time are
    printed, computed only over "valid" questions, i.e. those for which a
    model, a stored result and a human label were all found.

    Args:
        questions_file: Path to a JSON list of objects with a ``"question"`` key.
        router: Object exposing ``select_model(question, models)``.
        models: Model names the router may choose from.
        results_dirs: Directories that contain one sub-directory per model.
        human_judge_dir: Directory containing the human-labelled records.
        max_questions: Only the first ``max_questions`` questions are processed.

    Returns:
        None. Everything is reported via ``print``.
    """
    # The directories are deployment-specific: pass them as arguments, or fill
    # in the fallback below.
    if results_dirs is None:
        results_dirs = []
    if not results_dirs or not human_judge_dir:
        # Bail out before any (billable) routing request is made.
        print("错误：未配置结果目录或人工标注目录")
        return

    # Load the questions.
    try:
        with open(questions_file, 'r', encoding='utf-8') as f:
            questions_data = json.load(f)
    except FileNotFoundError:
        print(f"错误：找不到问题文件 {questions_file}")
        return
    except json.JSONDecodeError:
        print(f"错误：无法解析问题文件 {questions_file}")
        return
    if not isinstance(questions_data, list):
        print(f"错误：无法解析问题文件 {questions_file}")
        return

    # Keep only well-formed entries, capped at max_questions.
    questions = [
        item["question"]
        for item in questions_data
        if isinstance(item, dict) and "question" in item
    ][:max_questions]
    total = len(questions)
    if total < max_questions:
        print(f"警告：问题文件只有 {total} 个问题，但需要{max_questions}个")

    valid_count = 0  # questions for which all required data was found
    right_count = 0
    total_tokens_list = []
    time_list = []

    for i, question in enumerate(questions, 1):
        print(f"\n处理问题 {i}/{total}: {question[:50]}...")

        # 1. Pick a model.
        selected_model = router.select_model(question, models)
        if not selected_model:
            print(f"  警告：问题 '{question[:20]}...' 无法选择模型")
            continue

        print(f"  选择的模型: {selected_model}")

        # The name comes from an LLM and becomes a path component below, so
        # anything outside the known list is refused.
        if selected_model not in models:
            print(f"  警告：选择的模型 {selected_model} 不在可用模型列表中")
            continue

        # 2. Find the stored result for this question and model.
        result_item = None
        for results_dir in results_dirs:
            model_dir = os.path.join(results_dir, selected_model)
            if os.path.exists(model_dir):
                result_item = find_question_in_directory(question, model_dir)
                if result_item:
                    break

        if not result_item:
            print(f"  错误：在 {selected_model} 目录中找不到问题")
            continue

        # 3. Find the human label for this question.
        human_item = find_question_in_directory(question, human_judge_dir)
        if not human_item:
            print("  警告：在HumanJudge目录中找不到问题")
            continue

        human_label = human_item.get("label")
        if human_label is None:
            print("  警告：人类标注中没有label字段")
            continue

        # 4. The question is valid: only now does it contribute to any
        #    statistic, so the token/time means match the accuracy population.
        valid_count += 1

        cost_tokens = result_item.get("cost_tokens")
        # "cost_tokens" may be absent or null in a stored record.
        total_tokens = cost_tokens.get("total_tokens") if isinstance(cost_tokens, dict) else None
        if total_tokens is not None:
            total_tokens_list.append(total_tokens)

        time_value = result_item.get("time")
        if time_value is not None:
            time_list.append(time_value)

        # 5. Compare the stored judgement with the human label.
        judge_result = result_item.get("judge result")
        if judge_result == human_label:
            right_count += 1
            print("  结果匹配 ✓")
        else:
            print(f"  结果不匹配: 模型输出={judge_result}, 人工标签={human_label}")

    if valid_count == 0:
        print("\n警告：没有找到任何有效问题，无法计算统计结果")
        return

    acc = (right_count / valid_count) * 100
    avg_tokens = np.mean(total_tokens_list) if total_tokens_list else 0
    avg_time = np.mean(time_list) if time_list else 0

    # Final summary.
    print("\n" + "="*50)
    print("评估结果摘要（基于有效问题）")
    print("="*50)
    print(f"有效问题数量: {valid_count}/{total}")
    print(f"正确数量: {right_count}/{valid_count}")
    print(f"准确率 (ACC): {acc:.2f}%")
    print(f"平均总token数: {avg_tokens:.2f}")
    print(f"平均时间: {avg_time:.2f} 秒")
    print("="*50)

if __name__ == "__main__":
    import argparse

    # Endpoint, credentials and data locations are deployment-specific, so
    # they are taken from the command line / environment rather than being
    # written into the source.
    parser = argparse.ArgumentParser(
        description="Evaluate LLM-based model routing against human labels."
    )
    parser.add_argument(
        "questions_file",
        help="path to the JSON file containing the questions to route",
    )
    parser.add_argument(
        "--base-url",
        default=os.environ.get("OPENAI_BASE_URL"),
        help="chat-completions endpoint (default: $OPENAI_BASE_URL)",
    )
    args = parser.parse_args()

    # The key is read from the environment only, so it never shows up in
    # shell history or process listings.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        parser.error("the OPENAI_API_KEY environment variable is not set")

    # Initialise the model router.
    router = ModelRouter(
        base_url=args.base_url,
        api_key=api_key,
    )

    # Models the router may choose from.
    models = [
        "gpt4o", "Qwen2.5-7B-Instruct", "gemma-3-4b-it", "Kimi-K2-Instruct", 
        "Llama-3.1-8B-Instruct", "GPT5", "gpt-3.5-turbo-1106", 
        "claude-3.7-sonnet-thinking", "o3mini", "deepseek-r1", 
        "qwen2.5-72b-instruct", "gemini-2.5-pro", "gemma-3-27b-it", 
        "Qwen3-32B", "claude-3.5-sonnet-20241022", "QwQ-32B", 
        "llama3.3-70B-instruct", "doubao-1-5-thinking-pro-250415", 
        "gemini-2.5-flash"
    ]

    # Path of the questions file.
    questions_file = args.questions_file

    # Run the evaluation.
    evaluate_questions(questions_file, router, models)