#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import json
import os
import random
import logging
from typing import Dict, List, Any, Tuple, Optional
from dataclasses import dataclass
from openai import OpenAI
import time
import re
import yaml
import copy
import requests
from dotenv import load_dotenv

# Load variables from a local .env file (if any) before the os.getenv calls below.
load_dotenv()

# Endpoint and credential used by the Claude request helpers in this file.
# NOTE(review): the Anthropic, Gemini and OpenAI keys below are all read from the
# same "API_KEY" variable — presumably a shared gateway key; confirm this is intended.
ANTHROPIC_BASE_URL = os.getenv("ANTHROPIC_BASE_URL")
ANTHROPIC_API_KEY = os.getenv("API_KEY")

# Endpoint and credential used by the Gemini request helpers in this file.
GEMINI_BASE_URL = os.getenv("GEMINI_BASE_URL")
GEMINI_API_KEY = os.getenv("API_KEY")

# OpenAI endpoint and credential read from the environment.
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
OPENAI_API_KEY = os.getenv("API_KEY")

# Relative paths to the English and Chinese script pools
# (presumably the directories scanned for scenario JSON files — verify against callers).
SCRIPT_DIR_EN = "./character_scene_pool/en"
SCRIPT_DIR_ZH = "./character_scene_pool/zh"

# Relative path to the YAML file of public character attributes.
PUBLIC_ATTRIBUTE_PATH = "./configs/character_public_attributes.yaml"

# Prompt fragment describing the five evaluation criteria (CR, FR, RR, CA, PA), each
# with a definition and one worked test case, and ending with the instruction to pick
# exactly one criterion. This is runtime prompt text: edit the wording deliberately,
# because any change alters what the model is asked.
CRITIQUE = """
There are 5 evaluation criteria you can choose for judgement:
1. Context Reliance (CR)
  Definition:
  Measures the agent's ability to accurately use and respond to contextually available information, and to avoid generating information that contradicts to the provided context. 
  This includes:
  • facts explicitly or implicitly stated in the prompt (e.g., persona, scenario, dialogue instructions, reply strategy)
  • ongoing dialogue history
  • memory elements

  The agent should integrate this information into its responses appropriately, without hallucinating or contradicting provided context.
  
  Test Case 1
  - Persona: A seasoned knight in a medieval fantasy world, tasked with protecting a young prince.
  - Context:
    - User: So you've been guarding the prince since he was a child?
    - (Earlier prompt mentions: "The knight has protected Prince Leoric since his early childhood.")
  - Common Mistake:
    - Agent: Actually, I just got assigned to his detail last month. (Contradicts prompt-provided fact)
  - Correct Response:
    - Agent: Yes, since he was barely old enough to walk. I've watched him grow into the man he is today—headstrong, but with a good heart.

2. Factual Recall (FR)
  Definition:
  Assesses the agent's ability to recall and apply accurate world knowledge not stated in the prompt but assumed to be part of general pretraining or public domain knowledge, while avoiding hallucinations.
  This includes:
  • facts about public IPs (e.g., Hogwarts houses, lightsaber mechanics),
  • implicit setting details known to fans or readers.
  • basic common sense under the world view (e.g., what people in the modern world look like, people in the fantasy world can use magic)
  
  Test Case 1
  - Persona: Harry Potter
  - Example:
    - User: Harry, I still can't believe you were in Hufflepuff. You always seemed so brave!
  - Common Mistake:
    - Agent: Thanks! I tried to live up to Hufflepuff's values. (Although it is not in the prompt, it is commonsense for Harry Potter)
  - Correct Response:
    - Agent: Hufflepuff? Nah—I was in Gryffindor. Brave, maybe. But definitely not known for patience.

3. Reflective Reasoning (RR)
  Definition:
  Measures the agent's ability to reason and self‑reflect in a plausibly human way. The agent can:
  • offer concise, coherent explanations for its opinions or actions
  • acknowledge uncertainty or error
  • update its stance when presented with new evidence
  • articulate short "thought processes" or rationales that feel natural and believable to humans (without requiring full chain‑of‑thought disclosure)
  
  Test Case 1
  - Persona: AI brainstorming partner
  - Example:
    - User: You suggested targeting Arctic explorers. Isn't that a bit niche?
  - Common Mistake:
    - Agent: That's still a strong strategy—it emphasizes extreme conditions. (Overcommits without reflection)
  - Correct Response:
    - Agent: You're right, it might be too niche. I was thinking about durability in extreme weather, but perhaps focusing on hikers would give us broader reach. Thanks for pointing that out.

4. Conversational Ability (CA)
  Definition:
  Evaluates the agent's overall ability to engage in the whole dynamic and natural dialogue. This includes:
  • maintaining coherent persona behavior and emotional consistency
  • tracking who is speaking to whom in multi-party conversations
  • recognizing when to respond or remain silent
  • advancing stalled dialogue naturally through topic shifts, questions, or prompts
  Test Case 1
  - Context: Group chat with User A (emotional), User B (casual), and Agent (Bot).
    - User A: (crying)
    - User B: Hey, Bot, gimme a beer!
    - User A: (crying more)
  - Common Mistake:
    - Agent: Here's your beer, B! (Fails to prioritize emotional cue from A)
  - Correct Response:
    - Agent: Hey B, one sec—A, are you okay? Want to talk about what's going on? (Emotion-aware and participant-sensitive)

5. Preference Alignment (PA)
  Definition:
  Assesses how well the agent aligns with human conversational preferences and expectations. This includes:
  • avoiding repetition, generic or robotic phrasing(obvious templating), awkward logic
  • producing emotionally resonant, empathetic, or humorous replies when appropriate
  • sound more human-like in tone and word order, making them less AI feeling
  Test Case 1
  - Persona: Supportive friend
  - Example:
    - User: I finally got that promotion I worked so hard for!
  - Common Mistake:
    - Agent: Congratulations. That is good. (Flat tone, robotic phrasing)
  - Correct Response:
    - Agent: That's amazing! You totally deserve it—I'm so happy for you! All that effort paid off big time!(Empathetic, natural, socially intuitive)

Please select the most suitable one as the evaluation criterion according to the provided dialogue scenario. Please note that you must select one and only one.
"""

# Sample Chinese replies illustrating the "(action) sentence" response format:
# a plain sentence, an action before the sentence, and an action after it.
# Presumably assigned to the simulator's few-shot examples for Chinese scenarios — verify against callers.
Chinese_Examples = """
1. 首长，我们所面临的这场战争，敌我力量之悬殊是人类战争史上前所未有的，所以我认为，在相当长的一段时间里，太空军所面临的最大危险是失败主义。 
2. (手敲着桌子) 这就是你们说的大礼？ 
3. (怀疑地）你为什么盯着那片树篱看？
4. 坐下。（指着床）
"""

# English counterparts of the four sample replies above, in the same order and format.
English_Examples = """
1. Commander, the war we are facing now is so imbalanced in terms of power that it's unprecedented in human history. Therefore, I believe that for a long period, the greatest threat to the Space Force will be defeatism.
2. (Bangs hand on the table) This is the grand gift you spoke of? 
3. (Suspiciously) Why are you staring at the hedge?
4. Sit down. (Points at the bed)
"""

# Logging configuration: root logger at INFO with "timestamp - level - message" lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger shared by every function and class in this file.
logger = logging.getLogger(__name__)

def get_claude_4_sonnet_response(prompt: str, model: str = "claude-sonnet-4-20250514", max_tokens: int = 32768, temperature: float = 0.0) -> str:
    """Send a single-turn prompt to a Claude model and return its text reply.

    The request is POSTed to ``{ANTHROPIC_BASE_URL}/v1/messages`` with the key
    from ``ANTHROPIC_API_KEY`` sent as a Bearer token.

    Args:
        prompt (str): Input prompt, sent as the only user message.
        model (str, optional): Claude model name. Defaults to "claude-sonnet-4-20250514".
        max_tokens (int, optional): Maximum tokens in response. Defaults to 32768.
        temperature (float, optional): Sampling temperature. Defaults to 0.0.

    Returns:
        str: Text of the first content block, stripped of surrounding whitespace.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        KeyError, IndexError: If a successful response lacks the expected
            ``content[0].text`` structure.
    """
    headers = {
        'anthropic-version': '2023-06-01',
        'content-type': 'application/json',
        "Authorization": f"Bearer {ANTHROPIC_API_KEY}",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt},
        ],
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stream": False,
    }
    response = requests.post(
        url=f"{ANTHROPIC_BASE_URL}/v1/messages",
        headers=headers,
        data=json.dumps(payload),
        timeout=1800,
    )
    # Surface HTTP failures as such; otherwise an error body would only show up
    # as a confusing KeyError on 'content' below.
    response.raise_for_status()

    return response.json()['content'][0]['text'].strip()

def get_claude_4_sonnet_thinking_response(prompt: str, model: str = "claude-sonnet-4-20250514", max_tokens: int = 32768) -> str:
    """Send a single-turn prompt to a Claude model with thinking enabled.

    The request is POSTed to ``{ANTHROPIC_BASE_URL}/v1/messages`` with a fixed
    thinking budget of 4096 tokens. Only the answer text is returned; thinking
    blocks in the response are discarded.

    Args:
        prompt (str): Input prompt, sent as the only user message.
        model (str, optional): Claude model name. Defaults to "claude-sonnet-4-20250514".
        max_tokens (int, optional): Maximum tokens in response. Defaults to 32768.

    Returns:
        str: Concatenated text of all ``type == "text"`` content blocks, stripped.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        ValueError: If the response contains no text content block.
    """
    headers = {
        'anthropic-version': '2023-06-01',
        'anthropic-beta': 'output-128k-2025-02-19',
        'content-type': 'application/json',
        "Authorization": f"Bearer {ANTHROPIC_API_KEY}",
    }
    payload = {
        "model": model,
        "max_tokens": max_tokens,
        # NOTE(review): the thinking budget is fixed; presumably it must stay below
        # max_tokens — confirm for callers that pass a small max_tokens.
        "thinking": {
            "type": "enabled",
            "budget_tokens": 4096,
        },
        "messages": [
            {"role": "user", "content": prompt},
        ],
        "stream": False,
    }
    response = requests.post(
        url=f"{ANTHROPIC_BASE_URL}/v1/messages",
        headers=headers,
        data=json.dumps(payload),
        timeout=1800,
    )
    # Surface HTTP failures as such instead of a KeyError on 'content' below.
    response.raise_for_status()

    # Select the answer by block type rather than assuming it sits at index 1:
    # the number and order of thinking blocks before the text is not guaranteed.
    content_blocks = response.json()['content']
    text_parts = [block['text'] for block in content_blocks if block.get('type') == 'text']
    if not text_parts:
        raise ValueError("Claude response contained no text content block")

    return "".join(text_parts).strip()

def get_gemini_2_5_pro_response(prompt: str, model: str = "gemini-2.5-pro", max_tokens: int = 32768, temperature: float = 0.0) -> str:
    """Send a single-turn prompt to a Gemini model with dynamic thinking.

    The request is POSTed to the ``generateContent`` endpoint under
    ``GEMINI_BASE_URL``; the key from ``GEMINI_API_KEY`` travels in the query string.

    Args:
        prompt (str): Input prompt for the model
        model (str, optional): Gemini model name. Defaults to "gemini-2.5-pro".
        max_tokens (int, optional): Maximum tokens in response. Defaults to 32768.
        temperature (float, optional): Sampling temperature. Defaults to 0.0.

    Returns:
        str: Text of the first part of the first candidate, stripped.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        KeyError, IndexError: If a successful response has no candidate text.
    """
    url = f"{GEMINI_BASE_URL}/v1beta/models/{model}:generateContent?key={GEMINI_API_KEY}"
    headers = {
        "Content-Type": "application/json",
    }
    payload = {
        "contents": [
            {"parts": [{"text": prompt}]},
        ],
        "generationConfig": {
            "maxOutputTokens": max_tokens,
            "temperature": temperature,
            "thinkingConfig": {
                # -1 turns on dynamic thinking; 0 would switch thinking off and a
                # positive value (e.g. 1024) would set a fixed budget.
                "thinkingBudget": -1,
            },
        },
    }
    response = requests.post(url=url, headers=headers, data=json.dumps(payload), timeout=180)
    # Surface HTTP failures as such instead of a KeyError on 'candidates' below.
    response.raise_for_status()
    body = response.json()
    return body['candidates'][0]['content']['parts'][0]['text'].strip()

def get_gemini_2_5_pro_response_with_thinking_count(prompt: str, model: str = "gemini-2.5-pro", max_tokens: int = 32768, temperature: float = 0.0) -> tuple[str, int]:
    """Send a prompt to a Gemini model and also report its thinking-token usage.

    Uses the same ``generateContent`` request as the plain Gemini helper, with
    ``thinkingBudget`` set to -1.

    Args:
        prompt (str): Input prompt for the model
        model (str, optional): Gemini model name. Defaults to "gemini-2.5-pro".
        max_tokens (int, optional): Maximum tokens in response. Defaults to 32768.
        temperature (float, optional): Sampling temperature. Defaults to 0.0.

    Returns:
        tuple[str, int]: The stripped reply text and the value of
        ``usageMetadata.thoughtsTokenCount`` (0 when the field is absent).

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        KeyError, IndexError: If a successful response has no candidate text.
    """
    url = f"{GEMINI_BASE_URL}/v1beta/models/{model}:generateContent?key={GEMINI_API_KEY}"
    headers = {
        "Content-Type": "application/json",
    }
    payload = {
        "contents": [
            {"parts": [{"text": prompt}]},
        ],
        "generationConfig": {
            "maxOutputTokens": max_tokens,
            "temperature": temperature,
            "thinkingConfig": {
                "thinkingBudget": -1,
            },
        },
    }
    response = requests.post(url=url, headers=headers, data=json.dumps(payload), timeout=180)
    # Surface HTTP failures as such instead of a KeyError on 'candidates' below.
    response.raise_for_status()
    body = response.json()
    thinking_token_count = body.get('usageMetadata', {}).get('thoughtsTokenCount', 0)
    reply_text = body['candidates'][0]['content']['parts'][0]['text'].strip()
    return reply_text, thinking_token_count

def get_gemini_2_5_flash_response(prompt: str, model: str = "gemini-2.5-flash", max_tokens: int = 32768, temperature: float = 0.0) -> str:
    """Send a single-turn prompt to a Gemini model with a zero thinking budget.

    The request is POSTed to the ``generateContent`` endpoint under
    ``GEMINI_BASE_URL``; the key from ``GEMINI_API_KEY`` travels in the query string.

    Args:
        prompt (str): Input prompt for the model
        model (str, optional): Gemini model name. Defaults to "gemini-2.5-flash".
        max_tokens (int, optional): Maximum tokens in response. Defaults to 32768.
        temperature (float, optional): Sampling temperature. Defaults to 0.0.

    Returns:
        str: Text of the first part of the first candidate, stripped.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        KeyError, IndexError: If a successful response has no candidate text.
    """
    url = f"{GEMINI_BASE_URL}/v1beta/models/{model}:generateContent?key={GEMINI_API_KEY}"
    headers = {
        "Content-Type": "application/json",
    }
    payload = {
        "contents": [
            {"parts": [{"text": prompt}]},
        ],
        "generationConfig": {
            "maxOutputTokens": max_tokens,
            "temperature": temperature,
            "thinkingConfig": {
                # A budget of 0 requests a reply without thinking.
                "thinkingBudget": 0,
            },
        },
    }
    response = requests.post(url=url, headers=headers, data=json.dumps(payload), timeout=180)
    # Surface HTTP failures as such instead of a KeyError on 'candidates' below.
    response.raise_for_status()
    body = response.json()
    return body['candidates'][0]['content']['parts'][0]['text'].strip()


@dataclass
class ModelConfig:
    """Connection and sampling settings for one model endpoint.

    Instances are created from configuration sections with
    ``ModelConfig(**section)``, so the keys of each section must match these
    field names.
    """
    # API key passed to the OpenAI-compatible client.
    api_key: str
    # Base URL passed to the OpenAI-compatible client.
    base_url: str
    # Model name; LLMClient compares it against known names to pick the request style.
    model: str
    # Sampling temperature forwarded with requests that accept one.
    temperature: float = 0.6
    # Upper bound on generated tokens forwarded with requests.
    max_tokens: int = 16384

@dataclass
class EvaluationResult:
    """Outcome of evaluating one scenario.

    NOTE(review): the field descriptions below are inferred from the field
    names; the code that fills this class is outside this section — confirm.
    """
    # Identifier of the evaluated scenario.
    scenario_id: str
    # Number of rounds played (presumably dialogue/judging rounds).
    total_rounds: int
    # Rounds won by the first and second compared model, and rounds tied.
    model1_wins: int
    model2_wins: int
    ties: int
    # Dialogue turns recorded during the run, one dict per entry.
    dialogue_history: List[Dict]
    # Per-round win/lose records, one dict per entry.
    win_lose_details: List[Dict]

class LLMClient:
    """Route single-turn prompts to the backend that serves ``config.model``.

    Most models are reached through an OpenAI-compatible chat-completions
    client; Gemini and Claude models go through the module-level HTTP helpers.
    The model name in the configuration selects the request style.
    """

    # Qwen3 models requested with the chat template's thinking switch turned off.
    _QWEN3_MODELS = frozenset({"Qwen3-235B", "Qwen3-8B", "Qwen3-32B"})
    # Aliases that turn thinking on; the "-Thinking" suffix is removed before the request.
    _QWEN3_THINKING_MODELS = frozenset({"Qwen3-235B-Thinking", "Qwen3-8B-Thinking", "Qwen3-32B-Thinking"})
    # Models requested with the default parameters and no extra logging.
    _PLAIN_CHAT_MODELS = frozenset({
        "gpt-4o", "gpt-4.1-nano", "gpt-4.1-mini", "gpt-4.1", "gpt-4.5-preview",
        "DeepSeek-V3",
    })
    # Models requested with max_completion_tokens and without a temperature.
    _OPENAI_REASONING_MODELS = frozenset({"o3", "o3-mini", "o4-mini"})
    # Configured name -> served name for models whose reply may embed a think block.
    _THINK_TAG_MODELS = {
        "DeepSeek-R1": "DeepSeek-R1",
        "MiniMax-M1": "minimax/MiniMax-M1",
        "magistral-small": "magistral-small",
    }
    # Configured name -> model id sent to the Claude HTTP helper.
    _CLAUDE_MODEL_IDS = {
        "claude-3.5-sonnet": "claude-3-5-sonnet-20241022",
        "claude-3.7-sonnet": "claude-3-7-sonnet-20250219",
        "claude-4-sonnet": "claude-sonnet-4-20250514",
    }

    def __init__(self, config: "ModelConfig", name: str = "Unknown"):
        """Store the configuration and create the OpenAI-compatible client.

        Args:
            config: Endpoint and sampling settings for this client.
            name: Label used in log messages to tell clients apart.
        """
        self.config = config
        self.name = name
        self.client = OpenAI(api_key=config.api_key, base_url=config.base_url)
        logger.info(f"Initialized LLM client [{name}]: {config.model}")

    def _extract_final_response_from_r1(self, raw_content: str) -> str:
        """Return the text that follows the last ``</think>`` tag.

        Only the closing tag is required, so replies that contain the end of a
        reasoning block without a literal ``<think>`` opener are handled too.
        The original content is returned unchanged when there is no closing
        tag, when nothing follows it, or when processing fails.
        """
        try:
            _, closing_tag, answer = raw_content.rpartition("</think>")
            if closing_tag:
                final_response = answer.strip()
                if final_response:
                    logger.info(f"DeepSeek-R1 extracted final response: {final_response[:100]}...")
                    return final_response
                logger.info("DeepSeek-R1 extracted no valid response after </think> tag")
            else:
                logger.info("DeepSeek-R1 extracted no think tags, returning original content")

            logger.info(f"raw response: {raw_content}")
            return raw_content

        except Exception as e:
            logger.error(f"DeepSeek-R1 response processing failed: {e}")
            return raw_content

    def _chat_completion(self, prompt: str, model: str, **overrides: Any) -> str:
        """Run one chat-completions request and return the stripped reply text.

        The configured temperature and max_tokens are sent unless ``overrides``
        replaces them; any other override is passed through as an extra
        keyword argument.
        """
        request = {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": self.config.temperature,
            "max_tokens": self.config.max_tokens,
        }
        request.update(overrides)
        response = self.client.chat.completions.create(**request)
        return response.choices[0].message.content.strip()

    def _generate_once(self, prompt: str) -> str:
        """Perform a single, un-retried generation for the configured model."""
        model = self.config.model
        max_tokens = self.config.max_tokens
        temperature = self.config.temperature

        if model in self._QWEN3_MODELS:
            logger.info("disable Qwen3 thinking mode")
            return self._chat_completion(
                prompt, model,
                extra_body={"chat_template_kwargs": {"enable_thinking": False}},
            )
        if model in self._QWEN3_THINKING_MODELS:
            logger.info("enable Qwen3 thinking mode")
            raw_content = self._chat_completion(
                prompt, model.replace("-Thinking", ""),
                extra_body={"chat_template_kwargs": {"enable_thinking": True}},
            )
            return self._extract_final_response_from_r1(raw_content)
        if model in self._PLAIN_CHAT_MODELS:
            return self._chat_completion(prompt, model)
        if model in self._OPENAI_REASONING_MODELS:
            # These models are called without temperature and with
            # max_completion_tokens, so the shared helper is bypassed.
            response = self.client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                max_completion_tokens=max_tokens,
            )
            return response.choices[0].message.content.strip()
        if model in self._THINK_TAG_MODELS:
            raw_content = self._chat_completion(prompt, self._THINK_TAG_MODELS[model])
            return self._extract_final_response_from_r1(raw_content)
        if model == "gemini-2.5-pro":
            return get_gemini_2_5_pro_response(prompt=prompt, model=model, max_tokens=max_tokens, temperature=temperature)
        if model == "gemini-2.5-flash":
            return get_gemini_2_5_flash_response(prompt=prompt, model=model, max_tokens=max_tokens, temperature=temperature)
        if model in self._CLAUDE_MODEL_IDS:
            return get_claude_4_sonnet_response(prompt=prompt, model=self._CLAUDE_MODEL_IDS[model], max_tokens=max_tokens, temperature=temperature)
        if model == "claude-4-sonnet-thinking":
            return get_claude_4_sonnet_thinking_response(prompt=prompt, model="claude-sonnet-4-20250514", max_tokens=max_tokens)
        if model == "mistral-small":
            reply = self._chat_completion(prompt, model)
            logger.info(f"Mistral-small response: {reply[:100]}...")
            return reply
        if model == "higgs-70b":
            # Fixed sampling settings for this model; the configured values are ignored.
            return self._chat_completion(prompt, model, temperature=1.0, max_tokens=256, top_p=0.95)

        logger.info("Other models compatible with OpenAI format API calls")
        return self._chat_completion(prompt, model)

    def generate_response(self, prompt: str, max_retries: int = 3) -> Optional[str]:
        """Generate a reply, retrying with exponential backoff on any error.

        Args:
            prompt: Prompt sent as a single user message.
            max_retries: Maximum number of attempts.

        Returns:
            The reply text, or None when every attempt failed (or when
            ``max_retries`` is not positive).
        """
        for attempt in range(max_retries):
            try:
                logger.info(f"Model [{self.name}] [{self.config.model}] is generating a response...")
                return self._generate_once(prompt)
            except Exception as e:
                logger.warning(f"Generation fails (have tried {attempt + 1}/{max_retries}): {e}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)  # wait 1s, 2s, 4s, ... between attempts
                else:
                    logger.error(f"Final failure: {e}")
                    return None
        return None

    def generate_response_with_thinking_count(self, prompt: str, max_retries: int = 3) -> Optional[tuple[str, int]]:
        """Generate a reply and report the number of thinking tokens used.

        Only "gemini-2.5-pro" reports a real count. Every other model is
        delegated to :meth:`generate_response` with the caller's
        ``max_retries`` and yields ``(text, 0)`` on success or
        ``(response, None)`` when the response is empty or None.

        Returns:
            ``(text, thinking_token_count)``, or None when the Gemini request
            failed on every attempt or ``max_retries`` is not positive.
        """
        if max_retries <= 0:
            return None

        if self.config.model != "gemini-2.5-pro":
            # generate_response already retries and never raises, so it must be
            # given the full retry budget; wrapping it in another loop would not retry.
            response = self.generate_response(prompt, max_retries=max_retries)
            return response, (0 if response else None)

        for attempt in range(max_retries):
            try:
                logger.info(f"Model [{self.name}] [{self.config.model}] is generating a response...")
                response, thinking_count = get_gemini_2_5_pro_response_with_thinking_count(prompt=prompt, model=self.config.model, max_tokens=self.config.max_tokens, temperature=self.config.temperature)
                return response, thinking_count
            except Exception as e:
                logger.warning(f"Generation fails (have tried {attempt + 1}/{max_retries}): {e}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
                else:
                    logger.error(f"Final failure: {e}")
                    return None
        return None

class DialogueSimulator:
    """Dialogue Simulator Class"""
    
    def __init__(self, config_path: str):
        """Load the JSON configuration and create one LLM client per role.

        The configuration must contain the sections "model1", "model2",
        "characters", "director" and "LLMjudge"; each section is expanded into
        a ModelConfig and the resulting client is named after its section.
        """
        settings = self._load_config(config_path)
        self.config = settings

        def build_client(section: str):
            # Each client's log label is the name of its configuration section.
            return LLMClient(ModelConfig(**settings[section]), name=section)

        self.model1_client = build_client("model1")
        self.model2_client = build_client("model2")
        self.characters_client = build_client("characters")
        self.director_client = build_client("director")
        self.judge_client = build_client("LLMjudge")
        # Inserted into the character prompts; starts out empty.
        self.few_shot_examples = ""
        
    def _load_config(self, config_path: str) -> Dict:
        """Load configuration file"""
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            logger.error(f"Failed to load configuration file: {e}")
            raise
    
    def load_character_profile(self, profile_path: str) -> List[Dict]:
        """Load character profile file"""
        try:
            with open(profile_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            logger.error(f"Failed to load character profile file: {e}")
            raise
        
    def load_evaluation_dimension(self, dimension_path: str) -> Dict:
        """Parse the UTF-8 YAML file at ``dimension_path`` with ``yaml.safe_load``.

        Any failure (missing file, invalid YAML, ...) is logged and re-raised.
        """
        try:
            with open(dimension_path, "r", encoding="utf-8") as dimension_file:
                return yaml.safe_load(dimension_file)
        except Exception as e:
            logger.error(f"Failed to load evaluation dimension file: {e}")
            raise
        
    def random_key_value_pair_with_weight(self, d: dict, dimension_counts: Dict[str, int]) -> tuple:
        if not d:
            raise ValueError("Input dictionary is empty, unable to select.")

        if not dimension_counts:
            key = random.choice(list(d.keys()))
            return key, d[key]

        # Calculate the weight for each dimension (less frequent dimensions have higher weight)
        dimension_keys = list(d.keys())
        counts = [dimension_counts.get(key, 0) for key in dimension_keys]

        # Weight formula: wᵢ = max_count - cᵢ + 1
        max_count = max(counts) if counts else 0
        weights = [max_count - count + 1 for count in counts]
        
        # Calculate probabilities
        total_weight = sum(weights)
        probabilities = [w/total_weight for w in weights]
        
        logger.info("=" * 60)
        logger.info("🎯 Algorithm Execution")
        logger.info(f"Current count: {dict(zip(dimension_keys, counts))}")
        logger.info(f"Computing weights: {dict(zip(dimension_keys, weights))}")
        logger.info(f"Selection probabilities: {dict(zip(dimension_keys, [f'{p:.1%}' for p in probabilities]))}")

        # Weighted random selection algorithm
        random_value = random.uniform(0, total_weight)  # r ∈ [0, total_weight)

        # Cumulative distribution selection
        cumulative_weight = 0
        for i, weight in enumerate(weights):
            cumulative_weight += weight  # F(dᵢ)
            if random_value <= cumulative_weight:
                key = dimension_keys[i]
                logger.info(f"✅ Selected Dimension: {key} (Prob: {probabilities[i]:.1%})")
                logger.info("=" * 60)
                return key, d[key]

        # Fallback
        key = dimension_keys[-1]
        return key, d[key]
    
    def get_scenarios_from_script_dir(self, script_dir: str) -> List[Dict]:
        """Obtain all scenarios from the script directory"""
        scenarios = []

        # Get all json files
        json_files = [f for f in os.listdir(script_dir) if f.endswith('.json')]

        # Obtain all scenarios
        for json_file in json_files:
            try:
                file_path = os.path.join(script_dir, json_file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    plots = data["Plots"]
                    world_view = data["World_view"]
                    for plot in plots:
                        plot["scenario"]["world_view"] = world_view
                    if isinstance(plots, list):
                        scenarios.extend(plots)
            except Exception as e:
                logger.warning(f"Processing {json_file} failed: {e}")
                continue

        logger.info(f"Total scenarios obtained: {len(scenarios)}")
        return scenarios
    
    def check_dimension_requirements(self, dimension_counts: Dict[str, int], min_n: int) -> bool:
        """Check if all dimensions meet the minimum requirement"""
        for dimension in self.dimension_answer.keys():
            if dimension_counts.get(dimension, 0) < min_n:
                return False
        return True
    
    def build_character_prompt(self, world_view: str, current_situation: str, 
                             my_profile: Dict, character_profile: List, motivation: str, 
                             history: str, script: str = "None", strategy: str = "Free chat") -> str:
        """Build the role-play prompt for a scene character's next line.

        Args:
            world_view: Text placed under the "World view" heading.
            current_situation: Text placed under the "Current Situation" heading.
            my_profile: Profile of the speaking character; rendered as
                indented JSON with non-ASCII characters preserved.
            character_profile: Profiles of the other characters. Interpolated
                with ``str()``, so it appears as a Python literal rather than
                JSON (unlike ``my_profile``).
            motivation: The speaking character's motivation.
            history: Already-formatted dialogue history text.
            script: Original script text offered as optional reference
                material; defaults to the literal string "None".
            strategy: Chatting strategy / special request for this turn;
                defaults to "Free chat".

        Returns:
            The complete prompt string. It also embeds
            ``self.few_shot_examples``, which must be set before calling.
        """
        
        return f"""
You are role-playing a character based on the following profile. Use colloquial language to respond.
If my profile is in English, please respond in English.
If my profile is in Chinese, please respond in Chinese.

# World view
{world_view}

# My profile
{json.dumps(my_profile, ensure_ascii=False, indent=2)}

# Character profiles
{character_profile}

# Current Situation
{current_situation}

# Original Script(This is the original script. You can refer to the plot and classic sentences in the dialogue. However, there are more characters in the dialogue now, so you don't need to follow the script)
{script}

# My motivation
{motivation}

# Current Real Dialogue History
{history}

# Chatting Strategy
{strategy}

# Chatting Goal
Prioritize smooth conversations. If ##Chatting Strategy## has specific requests, consider following them in your response.

# Response Format
Each response consists of an action (optional) and a sentence without the speaker's name in the beginning like <Name>:. Add () outside the action. Here are some examples:
{self.few_shot_examples}

[IMPORTANT!] Please do not use fixed and repeated sentences similar to the ##Current Real Dialogue History##
# Response(only one sentence without any explanation):
"""
    
    def build_evaluation_character_prompt(self, world_view: str, my_profile: Dict, character_profile: List, 
                                        motivation: str, history: str) -> str:
        """Build the role-play prompt used for the character under evaluation.

        Compared with ``build_character_prompt`` this prompt has no
        "Current Situation", "Original Script", "Chatting Strategy" or
        "Chatting Goal" sections. The motivation section IS still included.

        Args:
            world_view: Text placed under the "World view" heading.
            my_profile: Profile of the evaluated character; rendered as
                indented JSON with non-ASCII characters preserved.
            character_profile: Profiles of the other characters. Interpolated
                with ``str()``, so it appears as a Python literal.
            motivation: The evaluated character's motivation.
            history: Already-formatted dialogue history text.

        Returns:
            The complete prompt string. It also embeds
            ``self.few_shot_examples``, which must be set before calling.
        """

        return f"""
You are role-playing a character based on the following profile. Use colloquial language to respond.
If my profile is in English, please respond in English.
If my profile is in Chinese, please respond in Chinese.

# World view
{world_view}

# My profile
{json.dumps(my_profile, ensure_ascii=False, indent=2)}

# Character profile
{character_profile}

# My motivation
{motivation}

# Dialogue History
{history}

# Response Format
Each response consists of an action (optional) and a sentence without the speaker's name in the beginning like <Name>:. Add () outside the action. Here are some examples:
{self.few_shot_examples}

[IMPORTANT!] Please do not use fixed and repeated sentences similar to the ##Dialogue History##
# Response(only one sentence without any explanation):
"""
    
    def build_nsp_prompt(self, target_characters: str, all_characters: List[str], 
                        dialogue_history: List[Dict]) -> str:
        """构建Next Speaker Prediction prompt"""
        history_text = "\n".join([
            f"{item['character']}: {item['message']}" 
            for item in dialogue_history
        ])

        return f"""Your task is to predict the next speaker for a role-playing game that is designed to test the performance of one specific character ##The Tested Character## powered by LLM in a general groupchat. That is, you need to determine which character might act next based on their previous interactions. Choose a name from this list: {all_characters}. If it's unclear who should act next, output "random". If you believe the scene or conversation should finish, output "<END CHAT>".

===The Tested Character===
{target_characters}

===Dialogue History===
{history_text}

===Requirements===
We are testing the {target_characters}, so please let {target_characters} speak appropriately.
[IMPORTANT!] Please do not let any character including {target_characters} speak more than 3 rounds continuously.
[IMPORTANT!] Try to balance the responses of all characters in {all_characters} as much as possible (for example, if there are 10 rounds of dialogue between three characters, then each character speaks an average of three sentences)
The conversation must last at least 10 rounds before it ends.

Next speaker:"""
    
    def build_judge_prompt_general(self, history: str, critique: str, model1: str, model2: str, 
                          response1: str, response2: str, character_profiles: List) -> str:
        """Build the pairwise judge prompt in which the judge also picks the criterion.

        The judge is asked to output an explanation, one criterion
        abbreviation (CR, FR, RR, CA or PA), a 1-5 score and a choice.

        Args:
            history: Already-formatted dialogue history text.
            critique: Criteria description inserted verbatim (no heading is
                added in this variant).
            model1: Label used for the first response; score 1-2 favours it.
            model2: Label used for the second response; score 4-5 favours it.
            response1: Response text shown under the ``model1`` label.
            response2: Response text shown under the ``model2`` label.
            character_profiles: Profiles rendered as indented JSON with
                non-ASCII characters preserved.

        Returns:
            The complete judge prompt string.
        """
        return f"""You are a judge for an AI NPC system. You need to compare two responses from two different models according to the provided chat criteria using a pairwise comparison approach. Please provide both a choice and a detailed score.

{critique}

# Dialogue history
{history}

# {model1}
response: {response1}

# {model2}
response: {response2}

# Character Profiles
{json.dumps(character_profiles, ensure_ascii=False, indent=2)}

# Scoring Guidelines:
Please evaluate the responses using a 5-point Likert scale:
- **1**: Strong preference for {model1} - {model1} is significantly better
- **2**: Moderate preference for {model1} - {model1} is somewhat better
- **3**: Tie - Both responses are roughly equivalent in quality
- **4**: Moderate preference for {model2} - {model2} is somewhat better
- **5**: Strong preference for {model2} - {model2} is significantly better

This scoring method penalizes models more heavily for large losses, effectively distinguishing performance across models.

# Output format (For the criteria selection, please only select one of the following criteria: CR, FR, RR, CA, PA and do not use full names):
explanation: <detailed explanation of the choice including the selected criteria, specific strengths/weaknesses, and reasoning for the score>

criteria: <CR or FR or RR or CA or PA>

score: <1, 2, 3, 4, or 5>

choice: <{model1} or {model2} or tie>
"""

    def build_judge_prompt(self, history: str, critique: str, model1: str, model2: str, 
                          response1: str, response2: str, character_profiles: List) -> str:
        """Build the pairwise judge prompt for an already-known criterion.

        Unlike ``build_judge_prompt_general``, the output format requested
        here has no ``criteria:`` field: the judge only returns an
        explanation, a 1-5 score and a choice.

        Args:
            history: Already-formatted dialogue history text.
            critique: Criteria description placed under the
                "Provided chat criteria" heading.
            model1: Label used for the first response; score 1-2 favours it.
            model2: Label used for the second response; score 4-5 favours it.
            response1: Response text shown under the ``model1`` label.
            response2: Response text shown under the ``model2`` label.
            character_profiles: Profiles rendered as indented JSON with
                non-ASCII characters preserved.

        Returns:
            The complete judge prompt string.
        """
        return f"""You are a judge for an AI NPC system. You need to compare two responses from two different models according to the provided chat criteria using a pairwise comparison approach. Please provide both a choice and a detailed score.

# Provided chat criteria
{critique}

# Dialogue history
{history}

# {model1}
response: {response1}

# {model2}
response: {response2}

# Character Profiles
{json.dumps(character_profiles, ensure_ascii=False, indent=2)}

# Scoring Guidelines:
Please evaluate the responses using a 5-point Likert scale:
- **1**: Strong preference for {model1} - {model1} is significantly better
- **2**: Moderate preference for {model1} - {model1} is somewhat better
- **3**: Tie - Both responses are roughly equivalent in quality
- **4**: Moderate preference for {model2} - {model2} is somewhat better
- **5**: Strong preference for {model2} - {model2} is significantly better

This scoring method penalizes models more heavily for large losses, effectively distinguishing performance across models.

# Output format:
explanation: <detailed explanation of the choice including specific strengths/weaknesses and reasoning for the score>

score: <1, 2, 3, 4, or 5>

choice: <{model1} or {model2} or tie>
"""
    
    def format_dialogue_history(self, dialogue_history: List[Dict]) -> str:
        """Render dialogue turns as newline-separated ``speaker: message`` lines.

        Args:
            dialogue_history: Turn dicts. A missing ``'character'`` falls back
                to ``'Unknown'`` and a missing ``'message'`` to an empty string.

        Returns:
            The formatted transcript, or the placeholder text
            ``"No dialogue history"`` when there are no turns.
        """
        if not dialogue_history:
            return "No dialogue history"

        return "\n".join(
            f"{turn.get('character', 'Unknown')}: {turn.get('message', '')}"
            for turn in dialogue_history
        )
    
    def extract_character_response(self, full_response: str) -> str:
        """Clean a raw model reply down to the spoken line.

        Square-bracketed spans that sit on a single line are dropped, then
        surrounding whitespace and wrapping double/single quotes are trimmed.

        Args:
            full_response: Raw text returned by the model (may be empty/None).

        Returns:
            The cleaned text; the stripped original if cleaning leaves
            nothing; ``""`` for an empty or missing response.
        """
        if not full_response:
            return ""

        # '.' does not cross newlines here, so only same-line [..] spans go.
        without_brackets = re.sub(r'\[.*?\]', '', full_response).strip()
        unquoted = without_brackets.strip('"').strip("'").strip()

        # Never hand back an empty line when the model did say something.
        return unquoted or full_response.strip()
    
    def predict_next_speaker(self, target_character: str, all_characters: List[str], 
                           dialogue_history: List[Dict], has_target: bool = False) -> Tuple[str, bool]:
        """Decide who speaks next.

        Args:
            target_character: Name of the character under evaluation.
            all_characters: Names of every character in the scene.
            dialogue_history: Turns so far (forwarded to the NSP prompt).
            has_target: True when the previous turn was a question aimed at
                the target, in which case the target answers next.

        Returns:
            ``(speaker, asks_target)``. ``speaker`` is a character name,
            ``"random"`` (caller picks) or ``"<END CHAT>"``. ``asks_target``
            is True only when a non-target character was drawn at random
            (probability ``self.test_ratio``) to question the target.
        """
        if has_target:
            return target_character, False

        if random.random() < self.test_ratio:
            questioners = [name for name in all_characters if name != target_character]
            # With nobody but the target in the scene there is no one to ask
            # the question (random.choice([]) would raise IndexError), so
            # fall through to the director model instead.
            if questioners:
                return random.choice(questioners), True

        prompt = self.build_nsp_prompt(target_character, all_characters, dialogue_history)
        response = self.director_client.generate_response(prompt)

        if not response:
            return "random", False

        response = response.strip()

        # "<END CHAT>" contains "END CHAT", so a single check covers both.
        if "END CHAT" in response.upper():
            return "<END CHAT>", False

        # Keep list order, but skip a name that is merely a fragment of a
        # longer name that was also mentioned ("Ann" inside "Anna").
        mentioned = [name for name in all_characters if name in response]
        for name in mentioned:
            if not any(name != other and name in other for other in mentioned):
                return name, False

        return "random", False
    
    def construct_other_characters(self, character_profiles: Dict, current_speaker: str) -> List:
        """Build the publicly visible profiles of everyone except the speaker.

        The list of visible keys is read from the ``Public_attributes`` entry
        of the YAML file at ``PUBLIC_ATTRIBUTE_PATH`` on every call.

        Args:
            character_profiles: Mapping of character name to profile dict.
            current_speaker: Name of the character whose own profile is
                left out of the result.

        Returns:
            One dict per other character containing only the public keys;
            a key missing from a profile maps to ``""``.
        """
        with open(PUBLIC_ATTRIBUTE_PATH, "r", encoding="utf-8") as f:
            # safe_load returns None for an empty document; treat it as {}
            # instead of failing with AttributeError on .get below.
            configs = yaml.safe_load(f) or {}

        # A present-but-null "Public_attributes:" entry also becomes [].
        public_attributes = configs.get("Public_attributes") or []
        if not public_attributes:
            logger.warning(f"No Public_attributes configured in {PUBLIC_ATTRIBUTE_PATH}; other character profiles will be empty")

        return [
            {key: profile.get(key, "") for key in public_attributes}
            for name, profile in character_profiles.items()
            if name != current_speaker
        ]
    
    def judge_responses(self, history: str, critique: str, response1: str, response2: str, character_profiles: List, dimension: str = "None") -> Tuple[str, str, str, str, int]:
        """Ask the judge model to compare two responses and parse its verdict.

        Args:
            history: Already-formatted dialogue history text.
            critique: Criteria description forwarded to the judge prompt.
            response1: Response labelled ``model1`` in the prompt.
            response2: Response labelled ``model2`` in the prompt.
            character_profiles: Profiles forwarded to the judge prompt.
            dimension: Criterion being tested. The literal string ``"None"``
                means "let the judge pick one" and selects the general
                prompt; any other value selects the fixed-criterion prompt
                and (unless empty) overrides whatever the judge reports.

        Returns:
            ``(choice, explanation, dimension, judge_response, score)``.
            ``score`` is 1-5 (3 when none could be parsed). ``choice`` is
            always derived from ``score`` (1-2 -> ``"model1"``, 3 ->
            ``"tie"``, 4-5 -> ``"model2"``); a ``choice:`` line in the reply
            never overrides the score. When the judge returns nothing the
            result is ``("tie", "no judge_response received", dimension, "", 3)``.
        """
        provided_dimension = ""

        if dimension == "None":
            judge_prompt = self.build_judge_prompt_general(
                history, critique, "model1", "model2", response1, response2, character_profiles
            )
        else:
            provided_dimension = dimension
            judge_prompt = self.build_judge_prompt(
                history, critique, "model1", "model2", response1, response2, character_profiles
            )

        judge_response = self.judge_client.generate_response(judge_prompt)

        if not judge_response:
            return "tie", "no judge_response received", dimension, "", 3

        lowered = judge_response.lower()
        # Markers are located in the lowercased text but sliced out of the
        # original so the explanation keeps its casing. str.lower() can
        # change the length of a few Unicode characters; if that happened
        # the offsets no longer line up, so slice the lowercased text.
        sliceable = judge_response if len(lowered) == len(judge_response) else lowered

        def section_after(markers, end_markers):
            """Return the text after the first marker present, cut at the earliest end marker."""
            for marker in markers:
                start = lowered.find(marker)
                if start == -1:
                    continue
                start += len(marker)
                end = len(sliceable)
                # A repeat of the marker itself also closes the section.
                for stop in (marker, *end_markers):
                    position = lowered.find(stop, start)
                    if position != -1:
                        end = min(end, position)
                return sliceable[start:end].strip()
            return ""

        # Abbreviations the general judge prompt offers, plus whatever
        # dimensions this evaluator instance was configured with.
        known_codes = {"CR", "FR", "RR", "CA", "PA"}
        known_codes.update(
            str(key).upper() for key in (getattr(self, "dimension_answer", None) or ())
        )

        def normalize_dimension(text):
            """Reduce the judge's free-form criteria text to one upper-case tag."""
            cleaned = text.replace('*', '').replace('#', '').strip()
            # Prefer a recognised abbreviation wherever it appears, so both
            # "Context Reliance (CR)" and "CR (Context Reliance)" give "CR".
            for token in re.findall(r'\w+', cleaned.upper()):
                if token in known_codes:
                    return token
            # Unrecognised text: first word, looking inside parentheses when
            # a pair survives stripping the outer ones.
            fallback = cleaned.strip('()').strip()
            if "(" in fallback and ")" in fallback:
                fallback = fallback[fallback.find("(") + 1:fallback.find(")")]
            words = fallback.split()
            return words[0].upper() if words else ""

        # The more specific markers come first so that "**explanation:**"
        # is consumed whole instead of leaving stray asterisks behind.
        explanation = section_after(
            ["### explanation:", "**explanation:**", "explanation:"],
            ["### criteria:", "criteria:", "### choice:", "choice:",
             "selected criterion:", "final choice:", "**criteria:**", "**choice:**"],
        )

        dimension_text = section_after(
            ["### criteria:", "**criteria:**", "criteria:", "selected criterion:"],
            ["### choice:", "choice:", "final choice:", "evaluation of responses:",
             "\n\n", "**choice:**"],
        )
        parsed_dimension = normalize_dimension(dimension_text) if dimension_text else ""

        # A stand-alone digit 1-5: not glued to a preceding letter/digit
        # (so the "1" of "model1" is ignored) and not followed by a digit.
        score_pattern = re.compile(r'(?<![0-9A-Za-z])[1-5](?![0-9])')
        score = 3
        score_found = False
        for marker in ("### score:", "**score:**", "score:"):
            start_idx = lowered.find(marker)
            if start_idx == -1:
                continue
            # Look at the marker line and, failing that, the line after it.
            for line in sliceable[start_idx:].split('\n')[:2]:
                line = line.lower()
                score_text = line.split(':')[1] if ':' in line else line
                score_text = score_text.replace('*', '').strip()
                score_match = score_pattern.search(score_text)
                if score_match:
                    score = int(score_match.group())
                    score_found = True
                    break
            if score_found:
                break

        # The score is authoritative: the choice is derived from it so the
        # two can never disagree.
        if score < 3:
            choice = "model1"
        elif score > 3:
            choice = "model2"
        else:
            choice = "tie"

        if provided_dimension != "":
            parsed_dimension = provided_dimension

        return choice, explanation, parsed_dimension, judge_response, score
    
    def simulate_scenario_dialogue(self, idx: int, dimension: Dict, target_character: Dict, plot: Dict, dimension_counts: Dict[str, int] = None) -> Optional[EvaluationResult]:
        """Insert the target character into a scene, simulate the chat and judge its turns.

        Every time the target character speaks, both candidate models answer
        the same prompt, the judge compares them, and the preferred answer
        (model1's on a tie) becomes the target's line in the dialogue.

        Args:
            idx: Scenario index; used to build the ``scenario_<idx>`` id.
            dimension: Not used by this method; kept so existing callers
                keep working.
            target_character: Dict with the ``"name"`` and ``"profile"`` of
                the character under evaluation.
            plot: Scenario data. It is deep-copied, so the caller's object
                is never modified.
            dimension_counts: Running per-dimension evaluation tally,
                updated in place. When omitted, ``self.dimension_counts`` is
                reused if present, otherwise a zeroed tally is created from
                ``self.dimension_answer``.

        Returns:
            An ``EvaluationResult`` for the scenario, or ``None`` when a
            scene character already carries the target character's name.
        """
        scenario_id = f"scenario_{idx}"
        logger.info(f"Start simulation: {scenario_id}")

        # The tally is consulted further down, so it must never stay None
        # (the default used to crash at `dimension in dimension_counts`).
        if dimension_counts is None:
            dimension_counts = getattr(self, "dimension_counts", None)
            if dimension_counts is None:
                dimension_counts = {name: 0 for name in self.dimension_answer}
        self.dimension_counts = dimension_counts

        # Work on a private copy so the shared scenario pool stays pristine.
        plot_copy = copy.deepcopy(plot)
        scenario = plot_copy.setdefault("scenario", {})
        present_characters = scenario.setdefault("Present_Characters", [])
        key_characters = plot_copy.setdefault("key_characters", [])
        target_char_name = target_character["name"]

        # A scene that already has a character with this name cannot host
        # the target as an extra participant; skip it.
        if any(char["name"] == target_char_name for char in key_characters):
            logger.info(f"the same name issue in the {scenario_id}, skip it")
            return None

        target_motivation = f"{target_char_name} engages in the current situation and react according to your own personality and background."
        present_characters.append({
            "Name": target_char_name,
            "Motivation": target_motivation
        })
        key_characters.append({
            "name": target_char_name,
            "profile": target_character["profile"]
        })

        character_profiles = {char["name"]: char.get("profile", "") for char in key_characters}
        character_motivations = {char["Name"]: char.get("Motivation", "") for char in present_characters}

        all_characters = list(character_profiles)
        dialogue_history = []
        original_dialogues = plot_copy.get("dialogues", [])
        world_view = scenario.get("world_view", "")
        current_situation = scenario.get("Current_Situation", "")
        original_script = self.format_dialogue_history(original_dialogues)

        def speak_as_scene_character(speaker, *prompt_extras):
            """Generate one cleaned line for a scene character; None if the model returned nothing."""
            prompt = self.build_character_prompt(
                world_view,
                current_situation,
                character_profiles[speaker],
                self.construct_other_characters(character_profiles, speaker),
                character_motivations.get(speaker, ""),
                self.format_dialogue_history(dialogue_history),
                *prompt_extras,
            )
            response = self.characters_client.generate_response(prompt)
            if not response:
                return None
            return self.extract_character_response(response)

        # Opening turn: the script's first speaker when a script exists
        # (and that speaker has a profile), otherwise a random non-target
        # character prompted without the script.
        if original_dialogues:
            first_speaker = original_dialogues[0]["character"]
            if first_speaker not in character_profiles:
                first_speaker = None
            opener_extras = (original_script,)
        else:
            available_chars = [char for char in all_characters if char != target_char_name]
            first_speaker = random.choice(available_chars) if available_chars else None
            opener_extras = ()

        if first_speaker is not None:
            opening_line = speak_as_scene_character(first_speaker, *opener_extras)
            if opening_line is not None:
                dialogue_history.append({
                    "character": first_speaker,
                    "message": opening_line,
                    "round": 1
                })

        # Initialize counters
        model1_wins = 0
        model2_wins = 0
        ties = 0
        win_lose_details = []

        max_rounds = self.max_turns
        current_round = len(dialogue_history) + 1
        has_target = False       # previous turn was a question aimed at the target
        has_dimension = "None"   # dimension that question was probing, if any

        while current_round <= max_rounds:
            next_speaker, has_target = self.predict_next_speaker(target_char_name, all_characters, dialogue_history, has_target)

            if next_speaker == "<END CHAT>":
                logger.info(f"Dialogue ended at round {current_round}")
                break

            if next_speaker == "random":
                if not all_characters:
                    break
                next_speaker = random.choice(all_characters)

            if next_speaker == target_char_name:
                # The evaluated character speaks: both models answer, the judge decides.
                logger.info(f"Round {current_round}: Evaluation character {target_char_name} speaks")

                history = self.format_dialogue_history(dialogue_history)
                prompt = self.build_evaluation_character_prompt(
                    world_view,
                    target_character["profile"],
                    self.construct_other_characters(character_profiles, next_speaker),
                    character_motivations.get(next_speaker, ""),
                    history,
                )

                # This one model also reports how many thinking tokens it used.
                tracks_thinking = self.model1_client.config.model == "gemini-2.5-pro"
                model1_thinking_count = 0
                if tracks_thinking:
                    model1_result = self.model1_client.generate_response_with_thinking_count(prompt)
                    if model1_result:
                        response1, model1_thinking_count = model1_result
                    else:
                        response1 = None
                else:
                    response1 = self.model1_client.generate_response(prompt)

                response2 = self.model2_client.generate_response(prompt)

                if not (response1 and response2):
                    logger.warning(f"round {current_round}: model response generation failed")
                    break

                clean_response1 = self.extract_character_response(response1)
                clean_response2 = self.extract_character_response(response2)

                # "answer": replying to a question planted for a known
                # dimension; "detect": the judge picks the dimension itself.
                # Passing "None" through is the same as omitting the argument.
                tag_method = "answer" if has_dimension != "None" else "detect"
                choice, explanation, judged_dimension, judge_response, score = self.judge_responses(
                    history, CRITIQUE, clean_response1, clean_response2, key_characters, has_dimension
                )
                # The planted question has now been answered.
                has_dimension = "None"

                if choice == "model1":
                    model1_wins += 1
                    chosen_response = clean_response1
                elif choice == "model2":
                    model2_wins += 1
                    chosen_response = clean_response2
                else:
                    ties += 1
                    chosen_response = clean_response1

                detail_record = {
                    "round": current_round,
                    "prompt": prompt,
                    "model1_original_response": response1,
                    "model2_original_response": response2,
                    "model1_response": clean_response1,
                    "model2_response": clean_response2,
                    "judge_response": judge_response,
                    "choice": choice,
                    "dimension": judged_dimension,
                    "method": tag_method,
                    "explanation": explanation,
                    "score": score
                }
                if tracks_thinking:
                    detail_record["model1_thinking_token_count"] = model1_thinking_count

                win_lose_details.append(detail_record)

                # Only dimensions being tracked are counted.
                if judged_dimension in dimension_counts:
                    dimension_counts[judged_dimension] += 1
                    logger.info(f"📊: {judged_dimension} -> {dimension_counts[judged_dimension]}")
                    logger.info(f"📈: {dimension_counts}")

                dialogue_history.append({
                    "character": target_char_name,
                    "message": chosen_response,
                    "round": current_round,
                    "method": tag_method,
                    "dimension": judged_dimension
                })

            elif next_speaker in character_profiles:
                # Another scene character speaks.
                if has_target:
                    logger.info(f"round {current_round}: {next_speaker} speaks to {target_char_name}")
                    # Plant a question for one dimension and remember which
                    # one, so the target's reply is judged against it.
                    has_dimension, dimension_question_criteria = self.random_key_value_pair_with_weight(self.dimension_question, dimension_counts)
                    spoken_line = speak_as_scene_character(next_speaker, original_script, dimension_question_criteria)
                else:
                    logger.info(f"round {current_round}: {next_speaker} speaks")
                    spoken_line = speak_as_scene_character(next_speaker, original_script)

                if spoken_line is None:
                    logger.warning(f"round {current_round}: {next_speaker} response generation failed")
                    break

                turn_record = {
                    "character": next_speaker,
                    "message": spoken_line,
                    "round": current_round
                }
                if has_target:
                    turn_record["test"] = "question"
                    turn_record["dimension"] = has_dimension
                dialogue_history.append(turn_record)

            current_round += 1

        result = EvaluationResult(
            scenario_id=scenario_id,
            total_rounds=len(dialogue_history),
            model1_wins=model1_wins,
            model2_wins=model2_wins,
            ties=ties,
            dialogue_history=dialogue_history,
            win_lose_details=win_lose_details
        )

        logger.info(f"round {scenario_id} completed: Model1 wins {model1_wins}, Model2 wins {model2_wins}, ties {ties}")
        return result
    
    def run_evaluation(self, character_profile_path: str, script_dir_selection: str, 
                  min_n: int, test_ratio: float, output_file: str, dimension: str, max_turns: int):
        """Run the full evaluation process"""
        logger.info("Starting evaluation process")
        self.test_ratio = test_ratio
        self.max_turns = max_turns

        # Load all target characters
        target_characters = self.load_character_profile(character_profile_path)
        logger.info(f"Loaded {len(target_characters)} target characters")

        # Get all scenario data
        if script_dir_selection == "zh":
            script_dir = SCRIPT_DIR_ZH
            self.few_shot_examples = Chinese_Examples
        else:
            script_dir = SCRIPT_DIR_EN
            self.few_shot_examples = English_Examples
        all_scenarios = self.get_scenarios_from_script_dir(script_dir)
        logger.info(f"Obtain {len(all_scenarios)} scenarios in total")

        # Get evaluation dimensions
        dimension_map = self.load_evaluation_dimension(dimension)
        logger.info(f"Obtain {int(len(dimension_map)/2)} evaluation dimensions in total")

        dimension_question = {}
        dimension_answer = {}
        
        for key, value in dimension_map.items():
            if key.endswith("_Question"):
                tag = key.replace("_Question", "")
                dimension_question[tag] = value
            elif key.endswith("_Answer"):
                tag = key.replace("_Answer", "")
                dimension_answer[tag] = value
                
        self.dimension_question = dimension_question
        self.dimension_answer = dimension_answer

        logger.info(f"Evaluation dimensions to be assessed: {list(self.dimension_answer.keys())}")
        logger.info(f"Each dimension requires at least {min_n} assessments")

        # Execute evaluation for each character
        all_character_results = []
        
        for target_character in target_characters:
            logger.info(f"Starting evaluation for character: {target_character['name']}")

            # Randomly shuffle the order of scenarios
            scenarios = all_scenarios.copy()
            random.shuffle(scenarios)

            # Initialize dimension counters
            dimension_counts = {dimension: 0 for dimension in self.dimension_answer.keys()}

            # Execute evaluation
            all_results = []
            total_model1_wins = 0
            total_model2_wins = 0
            total_ties = 0
            
            scenario_index = 0
            while not self.check_dimension_requirements(dimension_counts, min_n) and scenario_index < len(scenarios):
                scenario = scenarios[scenario_index]

                # Display detailed scenario and dimension status
                min_count = min(dimension_counts.values())
                max_count = max(dimension_counts.values())
                gap = max_count - min_count
                completed_dims = sum(1 for count in dimension_counts.values() if count >= min_n)
                total_dims = len(dimension_counts)
                
                logger.info("🎬 " + "=" * 60)
                logger.info(f"Scenario {scenario_index+1}/{len(scenarios)} starting evaluation")
                logger.info(f"Dimension completion progress: {completed_dims}/{total_dims} dimensions met requirements")
                logger.info(f"Current dimension counts: {dimension_counts}")
                logger.info(f"Balance status: Min={min_count}, Max={max_count}, Gap={gap}")
                logger.info("=" * 60)
                
                try:
                    result = self.simulate_scenario_dialogue(scenario_index, dimension, target_character, scenario, dimension_counts)
                    if result is None:
                        scenario_index += 1
                        continue
                        
                    all_results.append(result)
                    
                    total_model1_wins += result.model1_wins
                    total_model2_wins += result.model2_wins
                    total_ties += result.ties
                    
                    min_count = min(dimension_counts.values())
                    max_count = max(dimension_counts.values())
                    gap = max_count - min_count
                    completed_dims = sum(1 for count in dimension_counts.values() if count >= min_n)
                    total_dims = len(dimension_counts)

                    logger.info("🎬 Scene completion status:")
                    logger.info(f"   Current counts: {dimension_counts}")
                    logger.info(f"   Balance status: Min={min_count}, Max={max_count}, Gap={gap}")
                    logger.info(f"   Completion progress: {completed_dims}/{total_dims} dimensions met minimum requirements ({min_n} times)")
                    logger.info("-" * 50)
                    
                except Exception as e:
                    logger.error(f"Scenario {scenario_index+1} evaluation failed: {e}")

                scenario_index += 1

            # Check if all dimensions have met the minimum evaluation requirements
            if self.check_dimension_requirements(dimension_counts, min_n):
                logger.info(f"Character {target_character['name']} has met all minimum evaluation requirements for dimensions")
            else:
                logger.warning(f"Character {target_character['name']} has not met all minimum evaluation requirements for dimensions, scenario exhausted")

            # Statistics by dimension (including Arena Hard style scoring)
            dimension_results = {key: {
                "model1_wins": 0, "model2_wins": 0, "ties": 0,
                "model1_score_sum": 0, "total_evaluations": 0,
                "score_distribution": {"1": 0, "2": 0, "3": 0, "4": 0, "5": 0}
            } for key in self.dimension_answer}
            
            for result in all_results:
                for detail in result.win_lose_details:
                    if detail['dimension'] in dimension_results:
                        # Statistics for wins, losses, and ties
                        if detail['choice'] == 'model1':
                            dimension_results[detail['dimension']]['model1_wins'] += 1
                        elif detail['choice'] == 'model2':
                            dimension_results[detail['dimension']]['model2_wins'] += 1
                        else:  # tie
                            dimension_results[detail['dimension']]['ties'] += 1
                        
                        score = detail.get('score', 3)
                        
                        if score == 1:
                            model1_score_rate = 3.0
                        elif score == 2:
                            model1_score_rate = 1.0
                        elif score == 3:
                            model1_score_rate = 0.5
                        elif score == 4:
                            model1_score_rate = 0.0
                        elif score == 5:
                            model1_score_rate = 0.0
                        else:
                            model1_score_rate = 0.5
                        
                        dimension_results[detail['dimension']]['model1_score_sum'] += model1_score_rate
                        dimension_results[detail['dimension']]['total_evaluations'] += 1
                        dimension_results[detail['dimension']]['score_distribution'][str(score)] += 1
            
            valid_dimension_win_ratios = []
            valid_dimension_tie_ratios = []
            valid_dimension_lose_ratios = []
            valid_dimension_scores = []
            
            for key, value in dimension_results.items():
                total_evaluations = value["model1_wins"] + value["model2_wins"] + value["ties"]
                
                if total_evaluations > 0:
                    win_ratio = value["model1_wins"] / total_evaluations
                    tie_ratio = value["ties"] / total_evaluations
                    lose_ratio = value["model2_wins"] / total_evaluations
                    
                    value["model1_win_rate"] = win_ratio
                    value["model1_tie_rate"] = tie_ratio
                    value["model1_lose_rate"] = lose_ratio
                    
                    valid_dimension_win_ratios.append(win_ratio)
                    valid_dimension_tie_ratios.append(tie_ratio)
                    valid_dimension_lose_ratios.append(lose_ratio)
                    
                    if value["total_evaluations"] > 0:
                        avg_score_rate = value["model1_score_sum"] / value["total_evaluations"]
                        value["arena_hard_score"] = avg_score_rate
                        valid_dimension_scores.append(avg_score_rate)
                    else:
                        value["arena_hard_score"] = 0.5  
                        valid_dimension_scores.append(0.5)
                else:
                    value["model1_win_rate"] = 0.0
                    value["model1_tie_rate"] = 0.0
                    value["model1_lose_rate"] = 0.0
                    value["arena_hard_score"] = 0.5 
            
            macro_win_rate = sum(valid_dimension_win_ratios) / len(valid_dimension_win_ratios) if valid_dimension_win_ratios else 0.0
            macro_tie_rate = sum(valid_dimension_tie_ratios) / len(valid_dimension_tie_ratios) if valid_dimension_tie_ratios else 0.0
            macro_lose_rate = sum(valid_dimension_lose_ratios) / len(valid_dimension_lose_ratios) if valid_dimension_lose_ratios else 0.0
            macro_arena_hard_score = sum(valid_dimension_scores) / len(valid_dimension_scores) if valid_dimension_scores else 0.5
            
            # Calculate overall statistics (cumulative across all dimensions)
            actual_total_model1_wins = sum(value["model1_wins"] for value in dimension_results.values())
            actual_total_model2_wins = sum(value["model2_wins"] for value in dimension_results.values())
            actual_total_ties = sum(value["ties"] for value in dimension_results.values())
            actual_total_evaluations = actual_total_model1_wins + actual_total_model2_wins + actual_total_ties
            
            raw_win_rate = actual_total_model1_wins / actual_total_evaluations if actual_total_evaluations > 0 else 0.0
            raw_tie_rate = actual_total_ties / actual_total_evaluations if actual_total_evaluations > 0 else 0.0
            raw_lose_rate = actual_total_model2_wins / actual_total_evaluations if actual_total_evaluations > 0 else 0.0
            
            character_result = {
                "character_name": target_character["name"],
                "total_scenarios_available": len(all_scenarios),
                "scenarios_used": len(all_results),
                "total_evaluations": actual_total_evaluations,
                "model1_wins": actual_total_model1_wins,
                "model2_wins": actual_total_model2_wins,
                "ties": actual_total_ties,
                "model1_raw_win_rate": raw_win_rate,
                "model1_raw_tie_rate": raw_tie_rate,
                "model1_raw_lose_rate": raw_lose_rate,
                "model1_macro_win_rate": macro_win_rate,
                "model1_macro_tie_rate": macro_tie_rate,
                "model1_macro_lose_rate": macro_lose_rate,
                "model1_macro_arena_hard_score": macro_arena_hard_score,
                "dimension_counts": dimension_counts,
                "min_n_requirement": min_n,
                "all_dimensions_satisfied": self.check_dimension_requirements(dimension_counts, min_n),
                "detailed_results": []
            }
            
            for result in all_results:
                character_result["detailed_results"].append({
                    "scenario_id": result.scenario_id,
                    "total_rounds": result.total_rounds,
                    "model1_wins": result.model1_wins,
                    "model2_wins": result.model2_wins,
                    "ties": result.ties,
                    "dialogue_history": result.dialogue_history,
                    "win_lose_details": result.win_lose_details
                })
                    
            character_result["dimension_results"] = dimension_results
            all_character_results.append(character_result)
            
            final_min = min(dimension_counts.values()) if dimension_counts else 0
            final_max = max(dimension_counts.values()) if dimension_counts else 0
            final_gap = final_max - final_min
            all_satisfied = self.check_dimension_requirements(dimension_counts, min_n)
            
            logger.info("🎉 " + "=" * 60)
            logger.info(f"Character {target_character['name']} evaluation completed")
            logger.info("📈 Evaluation Statistics:")
            logger.info(f"   Scenarios Used: {len(all_results)}/{len(all_scenarios)}")
            logger.info(f"   Total Evaluations: {actual_total_evaluations}")
            logger.info("")
            logger.info("📊 Raw Statistics (Cumulative Across All Dimensions):")
            logger.info(f"   Model1 win rate: {raw_win_rate:.4f} ({actual_total_model1_wins})")
            logger.info(f"   Model1 tie rate: {raw_tie_rate:.4f} ({actual_total_ties})")
            logger.info(f"   Model1 lose rate: {raw_lose_rate:.4f} ({actual_total_model2_wins})")
            logger.info("")
            logger.info(f"   Model1 macro win rate: {macro_win_rate:.4f}")
            logger.info(f"   Model1 macro tie rate: {macro_tie_rate:.4f}")
            logger.info(f"   Model1 macro lose rate: {macro_lose_rate:.4f}")
            logger.info(f"   Model1 macro Arena Hard score: {macro_arena_hard_score:.4f}")
            
            logger.info("")
            logger.info("📋 Detailed Statistics:")
            for dim_name, dim_stats in dimension_results.items():
                dim_count = dimension_counts.get(dim_name, 0)
                win_rate = dim_stats.get("model1_win_rate", 0.0)
                tie_rate = dim_stats.get("model1_tie_rate", 0.0)
                lose_rate = dim_stats.get("model1_lose_rate", 0.0)
                arena_score = dim_stats.get("arena_hard_score", 0.5)
                
                logger.info(f"   {dim_name} ({dim_count}): win rate: {win_rate:.4f} |  tie rate: {tie_rate:.4f} |  lose rate: {lose_rate:.4f} | Arena Hard score: {arena_score:.4f}")
            logger.info("🎯 Dimension Balance Results:")
            logger.info(f"   Dimension Counts: {dimension_counts}")
            logger.info(f"   Balance: Min={final_min}, Max={final_max}, Gap={final_gap}")
            logger.info(f"   Min Requirement ({min_n}): {'✅ All Met' if all_satisfied else '❌ Not Met'}")

            if final_gap <= 2:
                balance_status = "🌟 Good"
            elif final_gap <= 5:
                balance_status = "✅ Moderate"
            else:
                balance_status = "⚠️ Poor"
            logger.info(f"   Balanced performance: {balance_status}")
            logger.info("=" * 60)
            
        # Overall statistics across all characters
        total_all_evaluations = sum(char["total_evaluations"] for char in all_character_results)
        total_all_model1_wins = sum(char["model1_wins"] for char in all_character_results)
        total_all_model2_wins = sum(char["model2_wins"] for char in all_character_results)
        total_all_ties = sum(char["ties"] for char in all_character_results)
        
        overall_win_rate = total_all_model1_wins / total_all_evaluations if total_all_evaluations > 0 else 0.0
        overall_tie_rate = total_all_ties / total_all_evaluations if total_all_evaluations > 0 else 0.0  
        overall_lose_rate = total_all_model2_wins / total_all_evaluations if total_all_evaluations > 0 else 0.0
        
        macro_overall_win_rate = sum(char["model1_macro_win_rate"] for char in all_character_results) / len(all_character_results) if all_character_results else 0.0
        macro_overall_tie_rate = sum(char["model1_macro_tie_rate"] for char in all_character_results) / len(all_character_results) if all_character_results else 0.0
        macro_overall_lose_rate = sum(char["model1_macro_lose_rate"] for char in all_character_results) / len(all_character_results) if all_character_results else 0.0
        macro_overall_arena_hard_score = sum(char["model1_macro_arena_hard_score"] for char in all_character_results) / len(all_character_results) if all_character_results else 0.5
        
        output_data = {
            "total_characters": len(target_characters),
            "min_n_requirement": min_n,
            "overall_statistics": {
                "total_evaluations": total_all_evaluations,
                "total_model1_wins": total_all_model1_wins,
                "total_model2_wins": total_all_model2_wins,
                "total_ties": total_all_ties,
                "overall_win_rate": overall_win_rate,
                "overall_tie_rate": overall_tie_rate,
                "overall_lose_rate": overall_lose_rate,
                "macro_overall_win_rate": macro_overall_win_rate,
                "macro_overall_tie_rate": macro_overall_tie_rate,
                "macro_overall_lose_rate": macro_overall_lose_rate,
                "macro_overall_arena_hard_score": macro_overall_arena_hard_score
            },
            "character_results": all_character_results
        }
        
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(output_data, f, ensure_ascii=False, indent=2)
            logger.info(f"The simulation result is saved in: {output_file}")
        except Exception as e:
            logger.error(f"Saving failed: {e}")
            raise
        
        # Final overall summary
        logger.info("🏆 " + "=" * 60)
        logger.info("All character evaluation completed")
        logger.info(f"📊 Overall statistics:")
        logger.info(f"   Total characters evaluated: {len(target_characters)}")
        logger.info(f"   Total evaluations conducted: {total_all_evaluations}")
        logger.info("")
        logger.info("📈 Cumulative statistics across all characters:")
        logger.info(f"   Model1 win rate: {overall_win_rate:.4f} ({total_all_model1_wins})")
        logger.info(f"   Model1 tie rate: {overall_tie_rate:.4f} ({total_all_ties})")
        logger.info(f"   Model1 lose rate: {overall_lose_rate:.4f} ({total_all_model2_wins})")
        logger.info("")
        logger.info("🎯 Macro average statistics across all characters:")
        logger.info(f"   Model1 macro win rate: {macro_overall_win_rate:.4f}")
        logger.info(f"   Model1 macro tie rate: {macro_overall_tie_rate:.4f}")
        logger.info(f"   Model1 macro lose rate: {macro_overall_lose_rate:.4f}")
        logger.info(f"   Model1 macro Arena Hard score: {macro_overall_arena_hard_score:.4f}")
        logger.info("")
        logger.info(f"  Final result file: {output_file}")
        logger.info("=" * 60)

def main():
    """Parse command-line arguments, validate them, and run the RP benchmark.

    Command-line options:
        --input_character_profile  Test character profile path (required).
        --output_file              File path for the simulation result (required).
        --script_dir_selection     Script directory language, 'zh' or 'en' (required).
        --min_n                    Minimum evaluations per dimension, > 0 (required).
        --config                   Model configuration file path (required).
        --dimension                Evaluation dimension file
                                   (default: ./criteria_config.yaml).
        --test_ratio               Portion of test questions, 0 < ratio <= 1
                                   (default: 0.3).
        --max_turns                Maximum dialogue turns, > 0 (default: 20).

    When a validation check fails, the problem is logged and the function
    returns without running the evaluation.

    Raises:
        Exception: any error raised while constructing the simulator or
            running the evaluation is logged and then re-raised unchanged.
    """
    logger.info("RP Benchmark Simulation Started")
    parser = argparse.ArgumentParser(description="Run RP benchmark Simulation.")

    parser.add_argument(
        '--input_character_profile',
        type=str,
        required=True,
        help='Test character profile (JSON format)'
    )

    parser.add_argument(
        '--output_file',
        type=str,
        required=True,
        help='The file path to the simulation result (JSON format)'
    )

    parser.add_argument(
        '--script_dir_selection',
        type=str,
        required=True,
        choices=['zh', 'en'],
        help='Language selection for the script directory'
    )

    parser.add_argument(
        '--min_n',
        type=int,
        required=True,
        help='Minimum number of evaluations per dimension (must be > 0)'
    )

    parser.add_argument(
        '--config',
        type=str,
        required=True,
        help='Model configuration file (JSON format)'
    )

    # The three options below carry defaults, so they must not be marked
    # required: argparse rejects a missing required option before a default
    # can ever be applied.
    parser.add_argument(
        '--dimension',
        type=str,
        default='./criteria_config.yaml',
        help='Evaluation dimension file (YAML format)'
    )

    parser.add_argument(
        '--test_ratio',
        type=float,
        default=0.3,
        help='The portion of test questions in the dialogue (0 < test_ratio <= 1)'
    )

    parser.add_argument(
        '--max_turns',
        type=int,
        default=20,
        help='The maximum number of turns in the dialogue (must be > 0)'
    )

    args = parser.parse_args()

    # Parameter validation: log the first problem found and stop.
    if not os.path.exists(args.input_character_profile):
        logger.error(f"Test character profile file does not exist: {args.input_character_profile}")
        return

    if not os.path.exists(args.config):
        logger.error(f"Model configuration file does not exist: {args.config}")
        return

    if args.min_n <= 0:
        logger.error(f"Minimum number of evaluations per dimension must be greater than 0: {args.min_n}")
        return

    if not 0 < args.test_ratio <= 1:
        logger.error(f"Test ratio must be between 0 and 1: {args.test_ratio}")
        return

    # The help text promises max_turns > 0; enforce it like the other bounds.
    if args.max_turns <= 0:
        logger.error(f"Maximum number of turns must be greater than 0: {args.max_turns}")
        return

    if not os.path.exists(args.dimension):
        logger.error(f"Evaluation dimension file does not exist: {args.dimension}")
        return

    try:
        # Initialize and run the dialogue simulator
        logger.info("Initializing dialogue simulator...")
        simulator = DialogueSimulator(args.config)
        logger.info("Dialogue simulator initialized successfully")
        simulator.run_evaluation(
            args.input_character_profile,
            args.script_dir_selection,
            args.min_n,
            args.test_ratio,
            args.output_file,
            args.dimension,
            args.max_turns
        )

    except Exception as e:
        # Log for the run record, then propagate so the caller sees the failure.
        logger.error(f"An error occurred during evaluation: {e}")
        raise
    
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
    # Logged whenever main() returns, including its early returns on a failed
    # argument check; skipped when main() re-raises an evaluation error.
    logger.info("RP Benchmark Simulation ended")