#!/usr/bin/env python3
"""
RP Benchmark Evaluation Script
Evaluates role-playing performance of language models using LLM-as-a-judge approach
"""

import json
import os
import sys
import argparse
import time
import logging
from typing import List, Dict, Any, Optional
from collections import defaultdict
from dataclasses import dataclass
import requests
from tqdm import tqdm
import yaml
from openai import OpenAI
import re
import concurrent.futures
import threading
import math

# Few-shot examples of the expected reply format for Chinese prompts: one
# spoken sentence, optionally accompanied by an action in parentheses, with no
# speaker-name prefix. create_rp_prompt interpolates this text when the data
# point's language is "Chinese".
Chinese_Examples = """
1. 首长，我们所面临的这场战争，敌我力量之悬殊是人类战争史上前所未有的，所以我认为，在相当长的一段时间里，太空军所面临的最大危险是失败主义。 
2. (手敲着桌子) 这就是你们说的大礼？ 
3. (怀疑地）你为什么盯着那片树篱看？
4. 坐下。（指着床）
"""

# English counterpart of the examples above; create_rp_prompt uses it for every
# language value other than "Chinese".
English_Examples = """
1. Commander, the war we are facing now is so imbalanced in terms of power that it's unprecedented in human history. Therefore, I believe that for a long period, the greatest threat to the Space Force will be defeatism.
2. (Bangs hand on the table) This is the grand gift you spoke of? 
3. (Suspiciously) Why are you staring at the hedge?
4. Sit down. (Points at the bed)
"""

# Logging configuration
# Module-level logger; no handlers or levels are configured in this section.
logger = logging.getLogger(__name__)

@dataclass
class ModelConfig:
    """Connection details and sampling defaults for one model endpoint.

    The three required fields identify the endpoint; the remaining fields are
    sampling defaults that apply when a call does not override them.
    """
    # Credential sent with every request to the endpoint.
    api_key: str
    # Root URL of the endpoint; request paths are appended to it.
    base_url: str
    # Model identifier; ModelAPI picks its request format from this string.
    model: str
    # Default sampling temperature.
    temperature: float = 0.6
    # Default cap on generated tokens.
    max_tokens: int = 16384
    # Optional nucleus-sampling value; forwarded as-is, including None.
    top_p: Optional[float] = None
    
def load_model_config(config_path: str, model_name: str) -> ModelConfig:
    """Load the configuration entry for ``model_name`` from a YAML file.

    The file is expected to contain a top-level ``models`` mapping keyed by
    model name. Each entry must provide ``api_key``, ``base_url`` and
    ``model``; ``temperature``, ``max_tokens`` and ``top_p`` are optional.

    Args:
        config_path: Path to the YAML configuration file.
        model_name: Key to look up inside the ``models`` mapping.

    Returns:
        A ``ModelConfig`` built from the selected entry.

    Raises:
        ValueError: If ``model_name`` is not listed. This includes an empty
            file or a file without a ``models`` mapping.
        KeyError: If the entry lacks ``api_key``, ``base_url`` or ``model``.
        OSError: If the file cannot be opened.
    """
    with open(config_path, 'r', encoding='utf-8') as f:
        config_data = yaml.safe_load(f)

    # safe_load yields None for an empty document, and the ``models`` section
    # may be absent; report both through the same ValueError as a missing
    # model instead of leaking a TypeError/KeyError.
    models = config_data.get('models') if isinstance(config_data, dict) else None
    if not isinstance(models, dict) or model_name not in models:
        raise ValueError(f"Model '{model_name}' not found in config file")

    model_config = models[model_name]
    return ModelConfig(
        api_key=model_config['api_key'],
        base_url=model_config['base_url'],
        model=model_config['model'],
        temperature=model_config.get('temperature', 0.6),
        max_tokens=model_config.get('max_tokens', 16384),
        top_p=model_config.get('top_p', None)
    )

class ModelAPI:
    """Generic model API interface supporting multiple model types"""

    def __init__(self, config: ModelConfig, name: str = "Unknown"):
        """Initialize client"""
        self.config = config
        self.name = name

        # For models that require OpenAI client
        if self._needs_openai_client():
            self.client = OpenAI(api_key=config.api_key, base_url=config.base_url)
        else:
            self.client = None

        logger.info(f"Initialize LLM client [{name}]: {config.model}")

    def _needs_openai_client(self) -> bool:
        """Check if OpenAI client is needed"""
        return not any(self.config.model.startswith(prefix) for prefix in ["gemini", "claude"])

    def _extract_final_response_from_r1(self, raw_content: str) -> str:
        """Extract final response from DeepSeek-R1 response, removing think tags content"""
        try:
            # If think tags are present, extract content outside the tags
            if "<think>" in raw_content and "</think>" in raw_content:
                # Find the position of the last </think> tag
                last_think_end = raw_content.rfind("</think>")
                if last_think_end != -1:
                    # Extract content after </think> tag as final response
                    final_response = raw_content[last_think_end + len("</think>"):].strip()
                    if final_response:
                        logger.info(f"Extracted final response: {final_response[:100]}...")
                        return final_response
                    else:
                        logger.info("Final response after think tags is empty, returning original content")

            # If no think tags are detected or extraction fails, return original content
            logger.info("No think tags detected, returning original content")
            return raw_content
            
        except Exception as e:
            logger.error(f"Response processing failed: {e}")
            return raw_content

    def _get_claude_response(self, prompt: str, model: str, max_tokens: int, temperature: float) -> str:
        """Send one non-streaming user message to the Claude messages endpoint.

        The request goes to ``{base_url}/v1/messages`` with the API key as a
        bearer token and ``top_p`` taken from the configuration.

        Args:
            prompt: Text sent as the single user message.
            model: Model identifier placed in the request body.
            max_tokens: Generation cap placed in the request body.
            temperature: Sampling temperature placed in the request body.

        Returns:
            The stripped ``text`` of the first content block of the reply.
        """
        endpoint = f"{self.config.base_url}/v1/messages"
        request_headers = {
            'anthropic-version': '2023-06-01',
            'content-type': 'application/json',
            "Authorization": f"Bearer {self.config.api_key}"
        }
        payload = json.dumps({
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": self.config.top_p,
            "stream": False
        })
        # Long timeout (30 minutes) is kept from the original request settings.
        reply = requests.post(
            url=endpoint,
            headers=request_headers,
            data=payload,
            timeout=1800,
        ).json()
        first_block = reply['content'][0]
        return first_block['text'].strip()

    def _get_claude_thinking_response(self, prompt: str, model: str, max_tokens: int) -> str:
        """Query Claude with thinking enabled and return only the answer text.

        The request goes to ``{base_url}/v1/messages`` with a fixed thinking
        budget of 4096 tokens and no temperature or top_p.

        Args:
            prompt: Text sent as the single user message.
            model: Model identifier placed in the request body.
            max_tokens: Generation cap placed in the request body.

        Returns:
            The stripped text of the first content block whose ``type`` is
            ``"text"``. If no block is typed that way, the block at index 1 is
            used, which is where the answer was previously assumed to be.

        Raises:
            KeyError / IndexError: If the reply has no usable content block.
        """
        headers = {
            'anthropic-version': '2023-06-01',
            'anthropic-beta': 'output-128k-2025-02-19',
            'content-type': 'application/json',
            "Authorization": f"Bearer {self.config.api_key}"
        }
        data = {
            "model": model,
            "max_tokens": max_tokens,
            "thinking": {
                "type": "enabled",
                "budget_tokens": 4096
            },
            "messages": [{"role": "user", "content": prompt}],
            "stream": False
        }
        response = requests.post(
            url=f"{self.config.base_url}/v1/messages",
            headers=headers,
            data=json.dumps(data),
            timeout=1800,
        )
        content_blocks = response.json()['content']

        # The answer is not guaranteed to sit at index 1: the reply may carry
        # several reasoning blocks (or none) before the text, so select the
        # block by its declared type instead of by position.
        for block in content_blocks:
            if isinstance(block, dict) and block.get('type') == 'text':
                return block['text'].strip()

        # No typed text block found (e.g. a proxy that omits "type"): keep the
        # historical positional lookup.
        return content_blocks[1]['text'].strip()

    def _get_gemini_response(self, prompt: str, model: str, max_tokens: int, temperature: float, thinking_budget: int = 0) -> str:
        """Call the Gemini ``generateContent`` endpoint and return the reply text.

        The API key travels as the ``key`` query parameter and ``topP`` comes
        from the configuration.

        Args:
            prompt: Text sent as the only content part.
            model: Model identifier used in the endpoint path.
            max_tokens: Sent as ``maxOutputTokens``.
            temperature: Sent as ``temperature``.
            thinking_budget: Sent as ``thinkingConfig.thinkingBudget``.

        Returns:
            The stripped text of the first part of the first candidate.
        """
        endpoint = f"{self.config.base_url}/v1beta/models/{model}:generateContent?key={self.config.api_key}"
        generation_config = {
            "maxOutputTokens": max_tokens,
            "temperature": temperature,
            "topP": self.config.top_p,
            "thinkingConfig": {"thinkingBudget": thinking_budget}
        }
        payload = json.dumps({
            "contents": [{"parts": [{"text": prompt}]}],
            "generationConfig": generation_config
        })
        reply = requests.post(
            url=endpoint,
            headers={"Content-Type": "application/json"},
            data=payload,
            timeout=180,
        ).json()
        first_part = reply['candidates'][0]['content']['parts'][0]
        return first_part['text'].strip()
    
    def _get_gemini_response_with_token_count(self, prompt: str, model: str, max_tokens: int = 32768, temperature: float = 0.0, thinking_budget: int = -1) -> tuple[str, int]:
        """Call Gemini ``generateContent`` and also report the thinking token count.

        Args:
            prompt: Text sent as the only content part.
            model: Model identifier used in the endpoint path.
            max_tokens: Sent as ``maxOutputTokens``.
            temperature: Sent as ``temperature``.
            thinking_budget: Sent as ``thinkingConfig.thinkingBudget``.

        Returns:
            A ``(text, thinking_token_count)`` pair. ``text`` is the stripped
            text of the first part of the first candidate; the count is read
            from ``usageMetadata.thoughtsTokenCount`` and defaults to 0 when
            the reply does not include it.
        """
        url = f"{self.config.base_url}/v1beta/models/{model}:generateContent?key={self.config.api_key}"
        headers = {"Content-Type": "application/json"}
        data = {
            "contents": [{"parts": [{"text": prompt}]}],
            "generationConfig": {
                "maxOutputTokens": max_tokens,
                "temperature": temperature,
                "topP": self.config.top_p,
                "thinkingConfig": {"thinkingBudget": thinking_budget}
            }
        }
        response = requests.post(url=url, headers=headers, data=json.dumps(data), timeout=180).json()
        thinking_token_count = response.get('usageMetadata', {}).get('thoughtsTokenCount', 0)
        text = response['candidates'][0]['content']['parts'][0]['text'].strip()
        return text, thinking_token_count

    def generate_response(self, prompt: str, temperature: float = None, max_tokens: int = None, retries: int = 3, backoff_factor: float = 2.0) -> str:
        """Generate model response with retry mechanism"""
        # Use passed parameters, or default values from config
        temp = temperature if temperature is not None else self.config.temperature
        tokens = max_tokens if max_tokens is not None else self.config.max_tokens
        top_p = self.config.top_p
        
        for attempt in range(1, retries + 1):
            try:
                logger.info(f"Model [{self.name}] [{self.config.model}] is generating a response...")

                # Qwen3 series
                if self.config.model in ["Qwen3-235B", "Qwen3-8B", "Qwen3-32B"]:
                    logger.info("Disable Qwen3 thinking mode")
                    response = self.client.chat.completions.create(
                        model=self.config.model,
                        messages=[{"role": "user", "content": prompt}],
                        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
                        temperature=temp,
                        max_tokens=tokens,
                        top_p=top_p
                    )
                    return response.choices[0].message.content.strip()

                # Qwen3-Thinking series
                elif self.config.model in ["Qwen3-235B-Thinking", "Qwen3-8B-Thinking", "Qwen3-32B-Thinking"]:
                    logger.info("Enable Qwen3 thinking mode")
                    response = self.client.chat.completions.create(
                        model=self.config.model.replace("-Thinking", ""),
                        messages=[{"role": "user", "content": prompt}],
                        extra_body={"chat_template_kwargs": {"enable_thinking": True}},
                        temperature=temp,
                        max_tokens=tokens,
                        top_p=top_p
                    )
                    raw_content = response.choices[0].message.content.strip()
                    return self._extract_final_response_from_r1(raw_content)
                
                # GPT series
                elif self.config.model in ["gpt-4o", "gpt-4.1-nano", "gpt-4.1-mini", "gpt-4.1", "gpt-4.5-preview"]:
                    response = self.client.chat.completions.create(
                        model=self.config.model,
                        messages=[{"role": "user", "content": prompt}],
                        temperature=temp,
                        max_tokens=tokens,
                        top_p=top_p
                    )
                    return response.choices[0].message.content.strip()

                # O3 series
                elif self.config.model in ["o3", "o3-mini", "o4-mini"]:
                    response = self.client.chat.completions.create(
                        model=self.config.model,
                        messages=[{"role": "user", "content": prompt}],
                        max_completion_tokens=tokens,
                        top_p=top_p
                    )
                    return response.choices[0].message.content.strip()
                
                # DeepSeek-V3
                elif self.config.model == "DeepSeek-V3":
                    response = self.client.chat.completions.create(
                        model="mihoyo/DeepSeek-V3",
                        messages=[{"role": "user", "content": prompt}],
                        temperature=temp,
                        max_tokens=tokens,
                        top_p=top_p
                    )
                    return response.choices[0].message.content.strip()
                
                # DeepSeek-R1
                elif self.config.model == "DeepSeek-R1":
                    response = self.client.chat.completions.create(
                        model="mihoyo/DeepSeek-R1",
                        messages=[{"role": "user", "content": prompt}],
                        temperature=temp,
                        max_tokens=tokens,
                        top_p=top_p
                    )
                    raw_content = response.choices[0].message.content.strip()
                    return self._extract_final_response_from_r1(raw_content)
                
                # MiniMax-M1
                elif self.config.model == "MiniMax-M1":
                    response = self.client.chat.completions.create(
                        model="minimax/MiniMax-M1",
                        messages=[{"role": "user", "content": prompt}],
                        temperature=temp,
                        max_tokens=tokens,
                        top_p=top_p
                    )
                    raw_content = response.choices[0].message.content.strip()
                    return self._extract_final_response_from_r1(raw_content)
                
                # Gemini-2.5-Pro
                elif self.config.model == "gemini-2.5-pro":
                    return self._get_gemini_response(prompt, self.config.model, tokens, temp, thinking_budget=-1)
                
                # Gemini-2.5-Flash
                elif self.config.model == "gemini-2.5-flash":
                    return self._get_gemini_response(prompt, self.config.model, tokens, temp, thinking_budget=0)
                
                # Claude-3.5-Sonnet
                elif self.config.model == "claude-3.5-sonnet":
                    return self._get_claude_response(prompt, "claude-3-5-sonnet-20241022", tokens, temp)
                
                # Claude-3.7-Sonnet
                elif self.config.model == "claude-3.7-sonnet":
                    return self._get_claude_response(prompt, "claude-3-7-sonnet-20250219", tokens, temp)
                
                # Claude-4-Sonnet
                elif self.config.model == "claude-4-sonnet":
                    return self._get_claude_response(prompt, "claude-sonnet-4-20250514", tokens, temp)
                
                # Claude-4-Sonnet-Thinking
                elif self.config.model == "claude-4-sonnet-thinking":
                    return self._get_claude_thinking_response(prompt, "claude-sonnet-4-20250514", tokens)
                
                # Mistral-Small
                elif self.config.model == "mistral-small":
                    response = self.client.chat.completions.create(
                        model="mistral-small",
                        messages=[{"role": "user", "content": prompt}],
                        temperature=temp,
                        max_tokens=tokens,
                        top_p=top_p
                    )
                    return response.choices[0].message.content.strip()
                
                # Magistral-Small
                elif self.config.model == "magistral-small":
                    response = self.client.chat.completions.create(
                        model="magistral-small",
                        messages=[{"role": "user", "content": prompt}],
                        temperature=temp,
                        max_tokens=tokens,
                        top_p=top_p
                    )
                    raw_content = response.choices[0].message.content.strip()
                    return self._extract_final_response_from_r1(raw_content)
                
                # Higgs-70B
                elif self.config.model == "higgs-70b":
                    response = self.client.chat.completions.create(
                        model="higgs-70b",
                        messages=[{"role": "user", "content": prompt}],
                        temperature=temp,
                        max_tokens=tokens,
                        top_p=top_p
                    )
                    return response.choices[0].message.content.strip()
                
                # Other models with OpenAI format
                else:
                    logger.info(f"Other models with OpenAI format API call")
                    response = self.client.chat.completions.create(
                        model=self.config.model,
                        messages=[{"role": "user", "content": prompt}],
                        temperature=temp,
                        max_tokens=tokens,
                        top_p=top_p
                    )
                    return response.choices[0].message.content.strip()

            except Exception as e:
                print(f"[Attempt {attempt}] API request failed: {e}")
                logger.warning(f"Failed to generate (have tried {attempt}/{retries}): {e}")

                if attempt < retries:
                    wait_time = backoff_factor ** attempt  # Exponential backoff
                    print(f"Retrying in {wait_time:.1f} seconds...")
                    time.sleep(wait_time)

        print("Error: Failed after multiple attempts")
        return None
    
    def generate_response_with_thinking_count(self, prompt: str, max_retries: int = 3) -> Optional[tuple[str, int]]:
        """Generate model response with thinking token count (if supported)"""
        for attempt in range(max_retries):
            try:
                logger.info(f"Model [{self.name}] [{self.config.model}] is generating response...")
                # gemini-2.5-pro
                if self.config.model == "gemini-2.5-pro":
                    response, thinking_count = self._get_gemini_response_with_token_count(prompt=prompt, model=self.config.model, max_tokens=self.config.max_tokens, temperature=self.config.temperature)
                    return response, thinking_count
                # Other models do not support thinking count
                else:
                    response = self.generate_response(prompt, max_retries=1)
                    return response, 0 if response else None
            except Exception as e:
                logger.warning(f"Failed to generate (have tried {attempt + 1}/{max_retries}): {e}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)  
                else:
                    logger.error(f"Failed to generate after multiple attempts: {e}")
                    return None

def load_criteria_from_yaml(file_path: str) -> Any:
    """
    Load a YAML criteria file and return its parsed content.

    The whole document is returned; no dimension is selected here. Callers in
    this module treat the result as a mapping from a dimension key (such as
    "CR") to its strategy text.

    Args:
        file_path (str): Path to the YAML file storing strategies.

    Returns:
        The object produced by ``yaml.safe_load`` for the file.

    Raises:
        FileNotFoundError: If the YAML file does not exist.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


# Both criteria files are read once, at import time, from paths relative to the
# current working directory; importing this module fails if either is missing.
# RESPONSE_STRATEGY is looked up by dimension when building role-play prompts,
# EVALUATION_STRATEGY when building judge prompts.
RESPONSE_STRATEGY = load_criteria_from_yaml("./configs/customrpbench_response_criteria.yaml")
EVALUATION_STRATEGY = load_criteria_from_yaml("./configs/customrpbench_evaluation_criteria.yaml")

class RPBenchmarkEvaluator:
    """RP Benchmark Evaluator Class"""
    
    def __init__(self, test_api: ModelAPI, judge_api: ModelAPI, 
                 test_temperature: float = 0.7, test_max_tokens: int = 1000,
                 judge_temperature: float = 0.0, judge_max_tokens: int = 8192,
                ):
        self.test_api = test_api
        self.judge_api = judge_api
        self.test_temperature = test_temperature
        self.test_max_tokens = test_max_tokens
        self.judge_temperature = judge_temperature
        self.judge_max_tokens = judge_max_tokens
        
        self.progress_lock = threading.Lock()
        self.progress_bar = None
        
    def _process_data_chunk(self, data_chunk: List[Dict], chunk_id: int) -> List[Dict]:
        """Process a chunk of data points for concurrent evaluation"""
        chunk_results = []
        
        for data_point in data_chunk:
            try:
                results = self.evaluate_data_point(data_point)
                chunk_results.extend(results)
                
                if self.progress_bar:
                    with self.progress_lock:
                        self.progress_bar.update(1)
                        
            except Exception as e:
                print(f"Chunk {chunk_id} has error when evaluation: {e}")
                continue
        
        return chunk_results
        
    def build_dialogue_history(self, dialogue_list: List[Dict], target_index: int) -> str:
        """Build dialogue history up to the target index"""
        history_parts = []
        for i in range(target_index):
            item = dialogue_list[i]
            character = item.get("character", "Unknown")
            message = item.get("message", "")
            history_parts.append(f"{character}: {message}")
        
        return "\n".join(history_parts)
    
    def create_rp_prompt(self, data_point: Dict, dialogue_item: Dict, history: str) -> str:
        """Build the role-play prompt sent to the model under test.

        Args:
            data_point: Benchmark entry; ``world_view``, ``character_profiles``
                and ``language`` are read from it.
            dialogue_item: The turn being generated; its ``dimension`` selects
                the reply strategy (falling back to "CR").
            history: Rendered dialogue preceding the turn.

        Returns:
            The prompt text with surrounding whitespace removed.
        """
        language = data_point.get("language", "")
        world_view = data_point.get("world_view", "")
        character_profiles = data_point.get("character_profiles", {})

        # The first two profiles (in mapping order) are used: the first as the
        # role-played character, the second as the other character. With fewer
        # than two profiles both stay empty.
        profiles = list(character_profiles.values()) if character_profiles else []
        if len(profiles) >= 2:
            my_profile, character_profile = profiles[0], profiles[1]
        else:
            my_profile = character_profile = ""
        my_profile_json = json.dumps(my_profile, ensure_ascii=False, indent=2)

        # Anything other than "Chinese" gets the English motivation and examples.
        if language == "Chinese":
            motivation, examples = "参与到当前对话场景中，根据自己的性格和背景做出反应。", Chinese_Examples
        else:
            motivation, examples = (
                "Engage in the current dialogue scenario and respond according to your character's personality and background.",
                English_Examples,
            )

        dimension = dialogue_item.get("dimension", "CR")
        dimension_strategy = RESPONSE_STRATEGY.get(dimension, RESPONSE_STRATEGY["CR"])

        return f"""
You are role-playing a character based on the following profile. Use colloquial language to respond.
If My profile is in English, please respond in English.
If My profile is in Chinese, please respond in Chinese.

# World view
{world_view}

# My profile
{my_profile_json}

# Other Character profiles
{character_profile}

# My motivation
{motivation}

# Dialogue History
{history}

# Reply Strategy (You should follow this strategy in your response)
{dimension_strategy}

# Response Format Each response consists of an action (optional) and a sentence without the speaker's name in the beginning like <Name>:. Add () outside the action. 
Here are some examples: 
{examples}

[IMPORTANT!] Please do not use fixed and repeated sentences similar to the ##Dialogue History##

# Response(only one sentence in {language} without any explanation):
""".strip()

    def create_judge_prompt(self, history: str, response1: str, response2: str, dimension_strategy: str, character_profile: List, model1_name: str = "Model 1", model2_name: str = "Model 2") -> str:
        """Create judge prompt for pairwise comparison"""
        prompt = f"""You are a judge for an AI NPC system. You need to compare two responses according to the provided chat criteria using a pairwise comparison approach. Please provide a final score.

    # Provided chat criteria
    {dimension_strategy}

    # Dialogue history
    {history}
    
    # Character profiles
    {json.dumps(character_profile, ensure_ascii=False, indent=2)}

    # {model1_name}
    response: {response1}

    # {model2_name}
    response: {response2}

    # Scoring Guidelines:
    Please evaluate the responses using a 5-point Likert scale:
    - **1**: Strong preference for {model1_name} - {model1_name} is significantly better
    - **2**: Moderate preference for {model1_name} - {model1_name} is somewhat better
    - **3**: Tie - Both responses are roughly equivalent in quality
    - **4**: Moderate preference for {model2_name} - {model2_name} is somewhat better
    - **5**: Strong preference for {model2_name} - {model2_name} is significantly better

    This scoring method penalizes models more heavily for large losses, effectively distinguishing performance across models.

    # Output format:
    Explanation: <detailed explanation of the choice including specific strengths/weaknesses and reasoning for the score>

    Score: <1, 2, 3, 4, or 5>
    """
        return prompt.strip()
    
    def parse_judge_response(self, response: str) -> tuple[str, int]:
        """Parse judge response to extract explanation and score"""
        explanation = ""
        score = 3  # Default score is tie

        # Try multiple ways to extract Explanation
        explanation_match = re.search(r'Explanation:\s*(.*?)(?=\n\s*Score:|$)', response, re.DOTALL | re.IGNORECASE)
        if explanation_match:
            explanation = explanation_match.group(1).strip()

        # Try multiple ways to extract Score
        # Method 1: Directly search for "Score: <number>"
        score_match = re.search(r'Score:\s*(\d+)', response, re.IGNORECASE)
        if score_match:
            try:
                score = int(score_match.group(1))
                if score < 1 or score > 5:
                    score = 3
            except ValueError:
                score = 3
        else:
            # Method 2: Look for the last line starting with "Score:"
            lines = response.strip().split('\n')
            for line in reversed(lines):
                line = line.strip()
                if line.startswith("Score:"):
                    score_text = line.replace("Score:", "").strip()
                    # Extract number, possibly within parentheses
                    number_match = re.search(r'(\d+)', score_text)
                    if number_match:
                        try:
                            score = int(number_match.group(1))
                            if score < 1 or score > 5:
                                score = 3
                        except ValueError:
                            score = 3
                    break
        
        return explanation, score
    
    def score_to_points(self, score: int, test_model_first: bool) -> float:
        """Convert judge score to points based on the position of the test model"""
        if test_model_first:
            # When generated response is first
            score_mapping = {1: 3.0, 2: 1.0, 3: 0.5, 4: 0.0, 5: 0.0}
        else:
            # When generated response is second
            score_mapping = {1: 0.0, 2: 0.0, 3: 0.5, 4: 1.0, 5: 3.0}
        
        return score_mapping.get(score, 0.5)
    
    def evaluate_data_point(self, data_point: Dict) -> List[Dict]:
        """Evaluate a single data point containing multiple dialogues"""
        dialogue = data_point.get("dialogue", [])
        character_profiles = data_point.get("character_profiles", {})
        results = []
        
        for i, item in enumerate(dialogue):
            if "model" in item:
                # Build dialogue history
                history = self.build_dialogue_history(dialogue, i)

                # Create RP prompt
                rp_prompt = self.create_rp_prompt(data_point, item, history)
                
                # Generate model response
                if self.test_api.config.model == "gemini-2.5-pro":
                    test_response_all = self.test_api.generate_response_with_thinking_count(rp_prompt)
                    if test_response_all:
                        test_response, test_model_thinking_count = test_response_all
                    else:
                        test_response = None
                else:
                    test_response = self.test_api.generate_response(
                        rp_prompt, self.test_temperature, self.test_max_tokens
                    )
                
                # Get baseline response (ground truth)
                if "baseline" in item:  
                    baseline_response = item.get("baseline", "")
                else:
                    baseline_response = item.get("message", "")
                
                # Get evaluation dimension strategy
                dimension = item.get("dimension", "CR")
                dimension_strategy = EVALUATION_STRATEGY.get(dimension, EVALUATION_STRATEGY["CR"])
                
                # Get character profile for judge prompt
                character_profiles_list = list(character_profiles.values())
                
                # First comparison: test model first
                judge_prompt_1 = self.create_judge_prompt(
                    history, test_response, baseline_response, dimension_strategy, character_profiles_list,
                    "Test Model", "Baseline"
                )
                
                judge_response_1 = self.judge_api.generate_response(
                    judge_prompt_1, self.judge_temperature, self.judge_max_tokens
                )
                
                explanation_1, score_1 = self.parse_judge_response(judge_response_1)
                points_1 = self.score_to_points(score_1, test_model_first=True)
                
                # Second comparison: baseline first
                judge_prompt_2 = self.create_judge_prompt(
                    history, baseline_response, test_response, dimension_strategy, character_profiles_list,
                    "Baseline", "Test Model"
                )
                
                judge_response_2 = self.judge_api.generate_response(
                    judge_prompt_2, self.judge_temperature, self.judge_max_tokens
                )
                
                explanation_2, score_2 = self.parse_judge_response(judge_response_2)
                points_2 = self.score_to_points(score_2, test_model_first=False)
                
                # Total points
                total_points = points_1 + points_2
                
                result = {
                    "data_point_id": data_point.get("model_name", "unknown"),
                    "dialogue_index": i,
                    "character_name": data_point.get("character_name", ""),
                    "dimension": dimension,
                    "history": history,
                    "raw_prompt": rp_prompt,
                    "test_response": test_response,
                    "baseline_response": baseline_response,
                    "judge_response_1": judge_response_1,
                    "explanation_1": explanation_1,
                    "score_1": score_1,
                    "points_1": points_1,
                    "judge_response_2": judge_response_2,
                    "explanation_2": explanation_2,
                    "score_2": score_2,
                    "points_2": points_2,
                    "total_points": total_points
                }
                
                if self.test_api.config.model == "gemini-2.5-pro":
                    result["thinking_token_count"] = test_model_thinking_count
                
                results.append(result)

                time.sleep(0.1)
        
        return results
    
    def calculate_dimension_statistics(self, all_results: List[Dict]) -> Dict:
        """Aggregate evaluation results into per-dimension and overall statistics.

        Args:
            all_results: Result dicts. Each one is read through its
                "dimension" key (used for grouping) and its "total_points"
                key (summed per dimension).

        Returns:
            A dict with four entries:
              * "dimension_breakdown": for every dimension, its summed points,
                sample count, average, share of all samples, weighted score,
                percentage score and maximum reachable points.
              * "weighted_total_score": sum over dimensions of
                (dimension average) * (samples / number of dimensions).
              * "average_evaluations_per_dimension": samples / number of
                dimensions, or 0 when there are no dimensions.
              * "max_possible_score": samples * per-judgement maximum * 2.
        """
        # Per-judgement maximum as reported by score_to_points for score 1.
        # NOTE(review): the "* 2" below presumably reflects two judge passes
        # per sample (points_1 + points_2) -- confirm against score_to_points.
        best_points = self.score_to_points(1, test_model_first=True)

        # dimension -> [summed points, number of samples]; dict insertion
        # order keeps dimensions in order of first appearance.
        buckets = {}
        for entry in all_results:
            bucket = buckets.setdefault(entry["dimension"], [0.0, 0])
            bucket[0] += entry["total_points"]
            bucket[1] += 1

        sample_count = len(all_results)
        dimension_count = len(buckets)

        if dimension_count > 0:
            samples_per_dimension = sample_count / dimension_count
        else:
            samples_per_dimension = 0

        breakdown = {}
        weighted_sum = 0.0
        for name, (points, count) in buckets.items():
            mean_points = points / count if count > 0 else 0

            # Every dimension contributes as if it had the average sample
            # count, so dimensions with many samples do not dominate.
            scaled_points = mean_points * samples_per_dimension
            weighted_sum += scaled_points

            share_of_samples = count / sample_count if sample_count > 0 else 0

            ceiling = count * best_points * 2
            if ceiling > 0:
                percentage = (points / ceiling) * 100
            else:
                percentage = 0

            breakdown[name] = {
                "total_points": points,
                "count": count,
                "avg_score": mean_points,
                "original_weight": share_of_samples,
                "weighted_score": scaled_points,
                "final_score": percentage,
                "max_possible_score": ceiling
            }

        return {
            "dimension_breakdown": breakdown,
            "weighted_total_score": weighted_sum,
            "average_evaluations_per_dimension": samples_per_dimension,
            "max_possible_score": sample_count * best_points * 2
        }
        
    def evaluate_benchmark_concurrent(self, benchmark_data: List[Dict], concurrency: int = 4) -> Dict:
        """Evaluate the benchmark data using a pool of worker threads.

        The data is split into contiguous chunks; each chunk is submitted to
        ``self._process_data_chunk(chunk, chunk_index)`` on its own thread.
        Per-chunk results are merged in chunk order, so the content of
        "detailed_results" does not depend on which thread finishes first.
        A chunk whose worker raises is reported on stdout and left out.

        Args:
            benchmark_data: Data points to evaluate. An empty list is allowed
                and yields a summary with zero evaluations.
            concurrency: Requested number of worker threads. Values below 1
                are treated as 1.

        Returns:
            The evaluation summary dict: overall totals, the final percentage
            score, the per-dimension breakdown, the test/judge model names,
            the number of threads actually used ("concurrency_used") and the
            list of all individual results ("detailed_results").
        """
        total_data_points = len(benchmark_data)
        max_score = self.score_to_points(1, test_model_first=True)

        # Clamp both values to at least 1: a non-positive concurrency would
        # divide by zero (or give a negative step), and an empty dataset
        # would give a chunk size of 0, which range() rejects as a step.
        requested_workers = max(1, concurrency)
        chunk_size = max(1, math.ceil(total_data_points / requested_workers))

        data_chunks = [
            benchmark_data[start:start + chunk_size]
            for start in range(0, total_data_points, chunk_size)
        ]

        # May be lower than requested when there are fewer data points
        # than threads; 0 when there is nothing to evaluate.
        actual_concurrency = len(data_chunks)

        print(f"Start to evaluate {total_data_points} data points...")
        print(f"Using {actual_concurrency} concurrent threads, each processing about {chunk_size} data points")

        all_results = []

        # ThreadPoolExecutor refuses max_workers=0, so the pool and the
        # progress bar are only set up when there is at least one chunk.
        if data_chunks:
            # Create global progress bar
            from tqdm import tqdm
            self.progress_bar = tqdm(total=total_data_points, desc="Evaluation Progress", unit="data point")

            results_by_chunk = {}
            try:
                # Use ThreadPoolExecutor for I/O-bound tasks
                with concurrent.futures.ThreadPoolExecutor(max_workers=actual_concurrency) as executor:
                    future_to_chunk = {
                        executor.submit(self._process_data_chunk, chunk, i): i
                        for i, chunk in enumerate(data_chunks)
                    }

                    # Collect results as they complete, keyed by chunk index
                    for future in concurrent.futures.as_completed(future_to_chunk):
                        chunk_id = future_to_chunk[future]
                        try:
                            results_by_chunk[chunk_id] = future.result()
                        except Exception as e:
                            # Best effort: a failed chunk is skipped, the rest still count
                            print(f"There is an error when processing chunk {chunk_id}: {e}")
            finally:
                # Ensure the progress bar is properly closed
                self.progress_bar.close()
                self.progress_bar = None

            # Merge in chunk order rather than completion order so repeated
            # runs produce the same result ordering.
            for chunk_id in sorted(results_by_chunk):
                all_results.extend(results_by_chunk[chunk_id])

        # Calculate overall statistics
        total_points = sum(result["total_points"] for result in all_results)
        total_evaluations = len(all_results)

        # Calculate dimension statistics
        dimension_stats = self.calculate_dimension_statistics(all_results)

        # Calculate final score percentage (0 when nothing was evaluated)
        max_possible_score = total_evaluations * max_score * 2
        final_score_percentage = (dimension_stats["weighted_total_score"] / max_possible_score) * 100 if max_possible_score > 0 else 0

        evaluation_summary = {
            "total_evaluations": total_evaluations,
            "total_points": total_points,
            "max_possible_score": max_possible_score,
            "weighted_total_score": dimension_stats["weighted_total_score"],
            "final_score": final_score_percentage,
            "average_evaluations_per_dimension": dimension_stats["average_evaluations_per_dimension"],
            "dimension_breakdown": dimension_stats["dimension_breakdown"],
            "test_model": self.test_api.config.model,
            "judge_model": self.judge_api.config.model,
            "concurrency_used": actual_concurrency,
            "detailed_results": all_results
        }

        return evaluation_summary
    
    def evaluate_benchmark(self, benchmark_data: List[Dict]) -> Dict:
        """Evaluate every data point one after another and summarise the run.

        Each data point is passed to ``self.evaluate_data_point``, which is
        expected to yield result dicts carrying a "total_points" entry. If a
        data point raises, the error is printed and evaluation moves on to
        the next one.

        Args:
            benchmark_data: Data points to evaluate.

        Returns:
            The evaluation summary dict: overall totals, the final percentage
            score, the per-dimension breakdown, the test/judge model names
            and the list of all individual results ("detailed_results").
        """
        collected = []
        points_sum = 0.0
        evaluation_count = 0
        best_points = self.score_to_points(1, test_model_first=True)

        print(f"Total {len(benchmark_data)} data points...")

        for sample in tqdm(benchmark_data, desc="Evaluation Progress"):
            try:
                sample_results = self.evaluate_data_point(sample)
                collected.extend(sample_results)

                # Keep the running totals in step with the collected results
                for entry in sample_results:
                    points_sum += entry["total_points"]
                    evaluation_count += 1
            except Exception as e:
                # Best effort: report the failure and carry on with the next sample
                print(f"There is an error when evaluating data point: {e}")

        # Per-dimension aggregation over everything that was collected
        stats = self.calculate_dimension_statistics(collected)

        # Overall percentage; stays 0 when no evaluation succeeded
        ceiling = evaluation_count * best_points * 2
        if ceiling > 0:
            percentage = (stats["weighted_total_score"] / ceiling) * 100
        else:
            percentage = 0

        return {
            "total_evaluations": evaluation_count,
            "total_points": points_sum,
            "max_possible_score": ceiling,
            "weighted_total_score": stats["weighted_total_score"],
            "final_score": percentage,
            "average_evaluations_per_dimension": stats["average_evaluations_per_dimension"],
            "dimension_breakdown": stats["dimension_breakdown"],
            "test_model": self.test_api.config.model,
            "judge_model": self.judge_api.config.model,
            "detailed_results": collected
        }

def load_benchmark_data(file_path: str) -> List[Dict]:
    """Read the benchmark JSON file and return its data points.

    Two layouts are accepted: a top-level JSON array, which is returned
    as-is, or a JSON object with a "data" key, whose value is returned.

    Args:
        file_path: Path to the UTF-8 encoded JSON file.

    Raises:
        ValueError: If the parsed JSON matches neither layout.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        payload = json.loads(handle.read())

    # Layout 1: the file is already the list of data points
    if isinstance(payload, list):
        return payload

    # Layout 2: the data points are wrapped in {"data": ...}
    if isinstance(payload, dict) and "data" in payload:
        return payload["data"]

    raise ValueError("Unsupported benchmark data format")

def save_results(results: Dict, output_path: str):
    """Write the full evaluation results to ``output_path`` as indented JSON.

    Missing parent directories are created first. Results are only saved
    once the whole evaluation has finished, so failing on a non-existent
    output directory would throw away the entire run.

    Args:
        results: JSON-serialisable evaluation results.
        output_path: Destination file; overwritten if it already exists.
    """
    parent_dir = os.path.dirname(output_path)
    # A bare file name has no directory part; os.makedirs("") would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese dialogue) readable
        json.dump(results, f, ensure_ascii=False, indent=2)

def save_summary(results: Dict, summary_path: str):
    """Append a compact summary of one run to the JSON list at ``summary_path``.

    The summary holds the model names, overall totals and the per-dimension
    breakdown; the individual results are not copied into it.

    The file is treated as a JSON list of summaries:
      * a missing file starts a new list (parent directories are created);
      * a file holding a single non-list JSON value is wrapped into a list;
      * an unreadable or invalid file is replaced by a new list, and a
        warning is printed so the loss of earlier entries is not silent.

    Args:
        results: Evaluation summary dict as returned by the evaluator.
        summary_path: Path of the cumulative summary file.

    Raises:
        KeyError: If ``results`` lacks one of the summarised fields.
    """
    summary_keys = (
        "test_model",
        "judge_model",
        "total_evaluations",
        "total_points",
        "max_possible_score",
        "weighted_total_score",
        "final_score",
        "average_evaluations_per_dimension",
        "dimension_breakdown",
    )
    summary = {key: results[key] for key in summary_keys}

    # If summary file already exists, append to existing data
    existing_data = []
    try:
        with open(summary_path, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
    except FileNotFoundError:
        # First run for this summary file: start with an empty list
        pass
    except (OSError, ValueError) as e:
        # ValueError covers invalid JSON and undecodable bytes. Keep the
        # original best-effort behaviour (start over) but say so.
        print(f"Warning: could not read existing summary file '{summary_path}' ({e}); starting a new one")
        existing_data = []

    if not isinstance(existing_data, list):
        existing_data = [existing_data]

    existing_data.append(summary)

    parent_dir = os.path.dirname(summary_path)
    # A bare file name has no directory part; os.makedirs("") would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, ensure_ascii=False, indent=2)

def main():
    """Command-line entry point for the RP benchmark evaluation.

    Parses the arguments, loads the test and judge model configurations,
    evaluates the dataset (sequentially when --concurrency is 1, with worker
    threads otherwise), writes the detailed results and the cumulative
    summary, and prints the scores.

    Exits with status 2 on invalid arguments and with status 1 when the
    model configuration cannot be loaded.
    """
    parser = argparse.ArgumentParser(description="RP Benchmark Evaluation")
    parser.add_argument("--config", default="./configs/models.yaml", help="Path to model configuration YAML file")
    parser.add_argument("--test_model", required=True, help="Test model name from config")
    parser.add_argument("--judge_model", required=True, help="Judge model name from config")
    parser.add_argument("--dataset", required=True, help="Path to CustomRPBench dataset JSON file")
    parser.add_argument("--output", required=True, help="Output file path for detailed results")
    parser.add_argument("--summary", required=True, help="Summary file path for final scores")
    parser.add_argument("--concurrency", type=int, default=1, help="Number of concurrent threads for evaluation")

    args = parser.parse_args()

    # 0 or a negative number is not a meaningful thread count, so fail fast
    # with a usage error before any model configuration is loaded.
    if args.concurrency < 1:
        parser.error("--concurrency must be a positive integer")

    # Load model configurations
    try:
        test_config = load_model_config(args.config, args.test_model)
        judge_config = load_model_config(args.config, args.judge_model)
        print(f"Successfully load the test model config: {args.test_model}")
        print(f"Successfully load the judge model config: {args.judge_model}")
    except Exception as e:
        print(f"There is an error when loading the model config: {e}")
        sys.exit(1)

    # Create ModelAPI
    test_api = ModelAPI(test_config, "Test Model")
    judge_api = ModelAPI(judge_config, "Judge Model")

    # Create RPBenchmarkEvaluator
    evaluator = RPBenchmarkEvaluator(
        test_api, judge_api,
        test_config.temperature, test_config.max_tokens,
        judge_config.temperature, judge_config.max_tokens
    )

    # Load benchmark data
    print("Load benchmark data...")
    benchmark_data = load_benchmark_data(args.dataset)
    print(f"Successfully load {len(benchmark_data)} data points")

    # Evaluate benchmark: a single thread uses the sequential evaluator
    print("Start evaluation...")
    if args.concurrency == 1:
        results = evaluator.evaluate_benchmark(benchmark_data)
    else:
        results = evaluator.evaluate_benchmark_concurrent(benchmark_data, args.concurrency)

    # Save results
    print("Save results...")
    save_results(results, args.output)
    save_summary(results, args.summary)

    print("Evaluation completed!")
    print(f"Test Model: {args.test_model}")
    print(f"Judge Model: {args.judge_model}")
    print(f"Total Evaluations: {results['total_evaluations']}")
    print(f"Total Points: {results['total_points']:.2f}")
    print(f"Weighted Total Score: {results['weighted_total_score']:.2f}")
    print(f"Max Possible Score: {results['max_possible_score']:.2f}")
    print(f"Final Score: {results['final_score']:.2f}")
    print(f"Average Evaluations per Dimension: {results['average_evaluations_per_dimension']:.2f}")

    # Print dimension breakdown
    print("\nDimension Breakdown:")
    for dimension, stats in results['dimension_breakdown'].items():
        print(f"  {dimension}: Total Points={stats['total_points']:.2f}, Count={stats['count']}, Average Score={stats['avg_score']:.3f}")
        print(f"    Original Weight={stats['original_weight']:.3f}, Weighted Score={stats['weighted_score']:.3f}")
        print(f"    Max Possible Score={stats['max_possible_score']:.2f}")
        print(f"    Final Score={stats['final_score']:.3f}")

    print(f"\nDetailed results saved to: {args.output}")
    print(f"Summary saved to: {args.summary}")

# Run the evaluation only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()