import asyncio
import json
import re
import os
import pandas as pd
from dataclasses import dataclass
from typing import Dict, List, Any, Union, Optional, AsyncGenerator
from datetime import datetime
import wandb
import glob
from tqdm.asyncio import tqdm_asyncio
import openai
import shutil

from autogen_core import (
    DefaultTopicId,
    MessageContext,
    RoutedAgent,
    SingleThreadedAgentRuntime,
    TypeSubscription,
    default_subscription,
    message_handler,
)
from autogen_core.models import (
    AssistantMessage,
    ChatCompletionClient,
    LLMMessage,
    SystemMessage,
    UserMessage,
)
from autogen_ext.models.openai import OpenAIChatCompletionClient
from claude_api import call_bedrock
from enum import Enum

# Model path interpolated into the sglang server launch command below.
MODEL_NAME = "Qwen/Qwen3-235B-A22B"

from sglang.utils import launch_server_cmd
from sglang.utils import wait_for_server, print_highlight, terminate_process

# NOTE: everything below executes at import time, so merely importing this
# module starts an sglang server subprocess. `launch_server_cmd` returns the
# process handle and the port it selected.
# NOTE(review): `--tp 8` presumably requests 8-way tensor parallelism and
# therefore 8 GPUs — confirm against the deployment host.
server_process, port = launch_server_cmd(
    f"uv run -m sglang.launch_server --model-path {MODEL_NAME} --tp 8 --attention-backend fa3"
)


# Wait on the server URL before building a client against it
# (presumably blocks until the server answers — see sglang.utils).
wait_for_server(f"http://localhost:{port}")
print(f"Server started on http://localhost:{port}")

# Async OpenAI-style client pointed at the local server's /v1 endpoint.
# The api_key is the literal string "None", not the None object.
client = openai.AsyncClient(base_url=f"http://localhost:{port}/v1", api_key="None")

class ModelProvider(Enum):
    """Model backends a ``ModelConfig`` can select."""
    OPENAI = "openai"
    CLAUDE = "claude"

@dataclass
class ModelConfig:
    """Configuration for model initialization.

    Attributes:
        provider: Backend to use. A plain string (any letter case) is accepted
            and converted to ``ModelProvider`` in ``__post_init__``.
        model_name: Model identifier handed to the backend client.
        api_key: Optional API key; ``None`` means "not supplied here".
        model_kwargs: Extra keyword arguments for the client constructor.
            ``None`` is replaced by a fresh empty dict per instance.
        qwen_client: Optional pre-built client object; ``None`` when unused.

    Raises:
        ValueError: If ``provider`` is a string that names no ``ModelProvider``.
    """
    provider: ModelProvider
    model_name: str
    api_key: Optional[str] = None
    model_kwargs: Optional[Dict[str, Any]] = None
    # Was annotated with the builtin function ``any``; ``typing.Any`` is the type.
    qwen_client: Any = None

    def __post_init__(self):
        # Allow callers to pass "openai" / "Claude" instead of the enum member.
        if not isinstance(self.provider, ModelProvider):
            self.provider = ModelProvider(self.provider.lower())
        # Normalise here rather than via a mutable default so instances never
        # share one kwargs dict.
        if self.model_kwargs is None:
            self.model_kwargs = {}

class BedrockChatCompletionClient(ChatCompletionClient):
    """Adapter that exposes AWS Bedrock Claude through the chat-completion client interface.

    Only ``create`` does real work. Streaming raises, token counting returns 0
    and the usage accessors return ``None``.
    """

    def __init__(self, model_id: str) -> None:
        self.model_id = model_id

    async def create(self, messages: List[LLMMessage]) -> AssistantMessage:
        """Send ``messages`` to Bedrock and wrap the reply in an ``AssistantMessage``.

        Args:
            messages: Conversation so far. User and assistant messages keep
                their role; every other message type is sent with role "system".

        Returns:
            An ``AssistantMessage`` with ``source="claude"``. When the response
            does not have the expected ``output.message.content[0].text``
            shape, the content is a ``str()`` of the message or of the whole
            response instead.
        """
        # Convert each message into the role/content structure passed to call_bedrock.
        bedrock_messages = []
        for msg in messages:
            if isinstance(msg, UserMessage):
                role = "user"
            elif isinstance(msg, AssistantMessage):
                role = "assistant"
            else:
                role = "system"
            bedrock_messages.append({"role": role, "content": [{"text": msg.content}]})

        # call_bedrock is invoked directly (not awaited).
        response = call_bedrock(messages=bedrock_messages, model_id=self.model_id)

        # Pull the text out of the response, degrading to a string dump on any surprise.
        try:
            if not isinstance(response, dict) or 'output' not in response:
                content = str(response)
            else:
                message = response['output']['message']
                has_content = isinstance(message, dict) and 'content' in message
                content = message['content'][0]['text'] if has_content else str(message)
        except Exception as e:
            print(f"Error extracting content from Bedrock response: {e}")
            content = str(response)

        return AssistantMessage(content=content, source="claude")

    async def create_stream(self, messages: List[LLMMessage]) -> AsyncGenerator[AssistantMessage, None]:
        """Always raises: this client has no streaming implementation."""
        raise NotImplementedError("Streaming is not supported for Bedrock Claude")

    async def count_tokens(self, messages: List[LLMMessage]) -> int:
        """Return 0; tokens are not counted by this client."""
        return 0

    def capabilities(self) -> Dict[str, Any]:
        """Describe which optional features this client provides."""
        features: Dict[str, Any] = {}
        features["streaming"] = False
        features["token_counting"] = False
        features["token_management"] = False
        features["model_info"] = True
        return features

    def model_info(self) -> Dict[str, Any]:
        """Return the model id and provider; token limits are reported as ``None``."""
        info: Dict[str, Any] = {"name": self.model_id, "provider": "claude"}
        info["max_tokens"] = None
        info["token_limit"] = None
        return info

    def remaining_tokens(self) -> Optional[int]:
        """Return ``None``; no token budget is tracked."""
        return None

    def total_usage(self) -> Optional[int]:
        """Return ``None``; total usage is not tracked."""
        return None

    def actual_usage(self) -> Optional[int]:
        """Return ``None``; actual usage is not tracked."""
        return None

    async def close(self):
        """No-op: the client holds no resources of its own."""
        pass

class QwenChatCompletionClient(ChatCompletionClient):
    """Chat completion client for Qwen served behind an sglang router.

    Requests go through ``client.chat.completions.create`` and are throttled by
    a semaphore sized ``max_parallel_requests``.
    """

    def __init__(self, client, model_name: str, max_parallel_requests: int = 5):
        self.client = client
        self.model_name = model_name
        self.max_parallel_requests = max_parallel_requests
        self._semaphore = asyncio.Semaphore(max_parallel_requests)

    async def _create_single(self, messages: List[LLMMessage]) -> AssistantMessage:
        """Run one non-streaming completion and return it as an ``AssistantMessage``."""
        # Checked in this order; anything unrecognised is sent as a user message.
        role_table = (
            (UserMessage, "user"),
            (AssistantMessage, "assistant"),
            (SystemMessage, "system"),
        )
        openai_messages = [
            {
                "role": next((name for kind, name in role_table if isinstance(msg, kind)), "user"),
                "content": msg.content,
            }
            for msg in messages
        ]

        async with self._semaphore:
            # Call the sglang router with thinking mode enabled in the chat template.
            response = await self.client.chat.completions.create(
                model=self.model_name,
                messages=openai_messages,
                stream=False,
                extra_body={"chat_template_kwargs": {"enable_thinking": True}},
            )
            content = response.choices[0].message.content
            print('Qwen response out')
        return AssistantMessage(content=content, source="qwen")

    async def create(self, messages: List[LLMMessage]) -> AssistantMessage:
        """Create a single chat completion for ``messages``."""
        return await self._create_single(messages)

    async def create_batch(self, messages_list: List[List[LLMMessage]]) -> List[AssistantMessage]:
        """Run one completion per conversation concurrently; results keep input order."""
        return await asyncio.gather(*(self._create_single(conversation) for conversation in messages_list))

    async def create_stream(self, messages: List[LLMMessage]):
        """Always raises: this client has no streaming implementation."""
        raise NotImplementedError("Streaming is not supported for Qwen sglang router")

    async def count_tokens(self, messages: List[LLMMessage]) -> int:
        """Return 0; tokens are not counted by this client."""
        return 0

    def capabilities(self) -> Dict[str, Any]:
        """Describe which optional features this client provides."""
        features: Dict[str, Any] = {}
        features["streaming"] = False
        features["token_counting"] = False
        features["token_management"] = False
        features["model_info"] = True
        return features

    def model_info(self) -> Dict[str, Any]:
        """Return the model name and provider; token limits are reported as ``None``."""
        info: Dict[str, Any] = {"name": self.model_name, "provider": "qwen"}
        info["max_tokens"] = None
        info["token_limit"] = None
        return info

    def remaining_tokens(self) -> Optional[int]:
        """Return ``None``; no token budget is tracked."""
        return None

    def total_usage(self) -> Optional[int]:
        """Return ``None``; total usage is not tracked."""
        return None

    def actual_usage(self) -> Optional[int]:
        """Return ``None``; actual usage is not tracked."""
        return None

    async def close(self):
        """No-op: the wrapped client is not closed here."""
        pass

def create_model_client(config: ModelConfig) -> ChatCompletionClient:
    """Build the chat-completion client selected by ``config``.

    Args:
        config: Provider, model name and optional key/kwargs/pre-built client.

    Returns:
        A Bedrock client for the CLAUDE provider. For the OPENAI provider, a
        Qwen client when ``config.qwen_client`` is set, otherwise an OpenAI
        client (API key from ``config`` or the ``OPENAI_API_KEY`` variable).

    Raises:
        ValueError: If the provider is neither OPENAI nor CLAUDE.
    """
    provider = config.provider

    if provider == ModelProvider.CLAUDE:
        return BedrockChatCompletionClient(model_id=config.model_name)

    if provider != ModelProvider.OPENAI:
        raise ValueError(f"Unsupported model provider: {config.provider}")

    # A supplied Qwen sglang router client takes precedence over the stock OpenAI client.
    qwen_client = getattr(config, "qwen_client", None)
    if qwen_client is not None:
        return QwenChatCompletionClient(client=qwen_client, model_name=config.model_name)

    resolved_key = config.api_key or os.getenv("OPENAI_API_KEY")
    return OpenAIChatCompletionClient(
        model=config.model_name,
        api_key=resolved_key,
        **config.model_kwargs
    )

@dataclass
class InitialEvaluation:
    """A stakeholder's first-pass evaluation, produced before any debate."""
    stakeholder_name: str  # Name of the stakeholder who wrote the evaluation
    content: str  # Evaluation text as returned by the model

@dataclass
class DebateMessage:
    """One utterance exchanged during the debate phase."""
    stakeholder_name: str  # Who sent the message
    content: str  # Message text
    round: int  # Debate round the message belongs to
    next_speaker: str = ""  # Stakeholder expected to speak next; empty when unset
    is_new_round: bool = False  # True when this message opens a new round

@dataclass
class ScoreEvaluation:
    """A stakeholder's final evaluation together with its numeric score."""
    stakeholder_name: str  # Who produced the evaluation
    content: str  # Full evaluation text
    score: float = 0.0  # Numeric score; stays 0.0 when none was supplied

@dataclass
class RankingEvaluation:
    """A stakeholder's final evaluation expressed as a ranking."""
    stakeholder_name: str  # Who produced the evaluation
    content: str  # Full evaluation text
    # Defaults to None despite the List annotation.
    # NOTE(review): kept non-Optional on purpose — autogen_core's dataclass
    # message serializer appears to reject union-typed fields; confirm before changing.
    rankings: List[Dict[str, Any]] = None

@dataclass
class AggregatedScoreEvaluation:
    """Score-mode evaluation aggregated across stakeholders."""
    content: str  # Aggregated evaluation text
    average_score: float = 0.0  # Average score; 0.0 when not supplied

@dataclass
class AggregatedRankingEvaluation:
    """Ranking-mode evaluation aggregated across stakeholders."""
    content: str  # Aggregated evaluation text
    # Defaults to None despite the List annotation.
    # NOTE(review): kept non-Optional on purpose — autogen_core's dataclass
    # message serializer appears to reject union-typed fields; confirm before changing.
    rankings: List[Dict[str, Any]] = None

@dataclass
class DebateCompleteScore:
    """Completion signal for a score-based debate."""
    content: str  # Final text accompanying the signal
    average_score: float = 0.0  # Average score; 0.0 when not supplied

@dataclass
class DebateCompleteRanking:
    """Completion signal for a ranking-based debate."""
    content: str  # Final text accompanying the signal
    # Defaults to None despite the List annotation.
    # NOTE(review): kept non-Optional on purpose — autogen_core's dataclass
    # message serializer appears to reject union-typed fields; confirm before changing.
    rankings: List[Dict[str, Any]] = None

@dataclass
class EvaluationRequest:
    """Message asking for an evaluation to be produced."""
    content: str  # The content that should be evaluated

@dataclass
class DebateStart:
    """Message that opens the debate phase."""
    # One dict per stakeholder, carrying the stakeholder's name and initial evaluation
    initial_evaluations: List[Dict[str, str]]

@dataclass
class FinalEvaluationRequest:
    """Message asking a stakeholder for its final evaluation."""
    stakeholder_name: str  # Stakeholder the request is addressed to
    debate_history: List[Dict[str, str]]  # Simplified record of the debate so far

@dataclass
class AggregatedEvaluation:
    """Aggregated final evaluation covering both scoring and ranking modes."""
    content: str  # Aggregated evaluation text
    average_score: float = 0.0  # Used in scoring mode only
    # Final rankings with explanations; defaults to None despite the List annotation.
    # NOTE(review): kept non-Optional on purpose — autogen_core's dataclass
    # message serializer appears to reject union-typed fields; confirm before changing.
    rankings: List[Dict[str, Any]] = None

@dataclass
class DebateComplete:
    """Signal that the debate ended and the final evaluations were aggregated."""
    content: str  # Final text accompanying the signal
    average_score: float = 0.0  # Average score; 0.0 when not supplied
    # Final rankings with explanations; defaults to None despite the List annotation.
    # NOTE(review): kept non-Optional on purpose — autogen_core's dataclass
    # message serializer appears to reject union-typed fields; confirm before changing.
    rankings: List[Dict[str, Any]] = None

def log_debate(message: str, agent_name: str = "System", log_file: str = "debate_log.txt", message_type: str = "INFO"):
    """Append one timestamped, tagged line to the debate log.

    The line has the form ``[YYYY-MM-DD HH:MM:SS] [TYPE] agent: message``.

    Args:
        message: Text to record.
        agent_name: Name of the agent the message is attributed to.
        log_file: Path of the log file; opened in append mode as UTF-8.
        message_type: Tag for the entry (INFO, EVAL, DEBATE, SYSTEM).
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{stamp}] [{message_type}] {agent_name}: {message}\n"
    with open(log_file, "a", encoding="utf-8") as handle:
        handle.write(entry)

class DebateLogger:
    """Write one debate session's log and results files with consistent formatting.

    Each instance owns a text log (created, with a header, on construction) and
    a JSON results path, both placed in ``output_dir`` and named from the
    session identifiers plus a creation timestamp.
    """

    def __init__(self, id: str, story_title: str, group_name: str, annotator: str, output_dir: str = "qwen_qag_debate_logs", dimension: Optional[str] = None):
        """Create the output directory, derive the file names and start the log.

        Args:
            id: Session identifier; inserted into the file names as-is
                (it is not passed through the filename sanitiser).
            story_title: Story title; sanitised for use in the file names.
            group_name: Group name; used in the file names only when
                ``dimension`` is None.
            annotator: Annotator name; sanitised for use in the file names.
            output_dir: Directory for both files; created if missing.
            dimension: Optional dimension name. When given, it appears in the
                file names and ``group_name`` is left out.
        """
        # Create output directory if it doesn't exist
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

        # The timestamp keeps file names from different sessions distinct
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        def safe_filename(s):
            # Anything outside letters, digits, "_", "-" and "." becomes "_"
            return re.sub(r'[^a-zA-Z0-9_\-\.]', '_', str(s))

        # Both files share one suffix; only the middle part depends on `dimension`.
        if dimension is None:
            middle = f"{safe_filename(group_name)}_{safe_filename(annotator)}"
        else:
            middle = f"{safe_filename(annotator)}_{safe_filename(dimension)}"
        suffix = f"{id}_{safe_filename(story_title)}_{middle}_{timestamp}"

        self.log_file = os.path.join(self.output_dir, f"debate_log_{suffix}.txt")
        self.results_file = os.path.join(self.output_dir, f"evaluation_results_{suffix}.json")

        # Initialize log file with header (truncates any existing file of that name)
        with open(self.log_file, "w", encoding="utf-8") as f:
            f.write(f"=== Debate Session Started at {timestamp} ===\n\n")

    def log_message(self, message: str, agent_name: str, message_type: str = "INFO"):
        """Append a message to this session's log via ``log_debate``."""
        log_debate(message, agent_name, self.log_file, message_type)

    def log_evaluation(self, content: str, agent_name: str, eval_type: str = "INITIAL"):
        """Log an evaluation under the EVAL tag, prefixed with its ``eval_type``."""
        self.log_message(f"{eval_type} Evaluation:\n{content}\n", agent_name, "EVAL")

    def log_debate_message(self, content: str, agent_name: str, round_num: int):
        """Log a debate utterance under the DEBATE tag, prefixed with its round number."""
        self.log_message(f"Round {round_num}:\n{content}\n", agent_name, "DEBATE")

    def log_system_event(self, event: str):
        """Log an event attributed to "System" under the SYSTEM tag."""
        self.log_message(event, "System", "SYSTEM")

    def save_results(self, results: Dict):
        """Write ``results`` to the results file as indented JSON, overwriting it."""
        with open(self.results_file, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)

@default_subscription
class StakeholderAgent(RoutedAgent):
    """Persona-driven agent that evaluates content and then debates its evaluation.

    Phase 1: on ``EvaluationRequest`` the agent produces an initial evaluation
    and publishes it as ``InitialEvaluation``.
    Phase 2: on ``DebateStart`` it records everyone's initial evaluations (the
    stakeholder listed first also opens the debate); on each ``DebateMessage``
    naming it as ``next_speaker`` it replies, and once its reply contains
    'NO MORE COMMENTS' it also publishes a ``ScoreEvaluation``.
    """

    def __init__(self, model_client: ChatCompletionClient, name: str, persona: Dict[str, Any], task_description: str,
                evaluation_content: Any, context: str, response_format: str, max_debate_rounds: int = 3,
                logger: DebateLogger = None, mode: str = "score", dimension: Optional[str] = None) -> None:
        """Store the configuration and build the persona system prompt.

        Args:
            model_client: Client whose ``create`` is awaited for every model call.
            name: Stakeholder name; used in prompts, logs and published messages.
            persona: Must contain the keys 'Demographic Information',
                'Perspective', 'Specialty', 'Psychological Traits' and
                'Social Relationships' (a missing key raises ``KeyError``).
            task_description: Description of the evaluation task.
            evaluation_content: The content being evaluated.
            context: Context related to the evaluation content.
            response_format: Format the model is told to answer in.
            max_debate_rounds: Once the round counter exceeds this, the agent
                stops replying.
            logger: Optional ``DebateLogger``; nothing is logged when falsy.
            mode: Stored as ``_mode``; not read elsewhere in this class.
            dimension: Stored as ``_dimension``; not read elsewhere in this class.
        """
        super().__init__(f"Stakeholder agent {name}")
        self._name = name
        self._persona = persona
        self._model_client = model_client
        self._task_description = task_description
        self._evaluation_content = evaluation_content
        self._context = context
        self._response_format = response_format
        self._max_debate_rounds = max_debate_rounds
        self._history: List[LLMMessage] = []
        self._debate_round = 0
        self._received_initial_evaluations = False
        self._is_my_turn = False
        self._logger = logger
        self._debate_complete = False
        self._mode = mode
        self._dimension = dimension
        self._initial_eval_sent = False  # Track if initial evaluation has been sent
        
        self._system_message = SystemMessage(
            content=(
                f"YOU ARE {self._name}. Your demographic information is: {self._persona['Demographic Information']}.\n"
                f"Your perspective is: {self._persona['Perspective']}.\n"
                f"Your specialty is: {self._persona['Specialty']}.\n"
                f"Your psychological traits include {self._persona['Psychological Traits']}.\n"
                f"Socially, these are your relationships: {self._persona['Social Relationships']}.\n\n"
                f"Using your perspective or specialty, now you are participating in the following evaluation task: {self._task_description}\n\n"
                f"The content to be evaluated is: {self._evaluation_content}\n"
                f"The related context for the evaluation content is: {self._context}\n"
                f"You should use this format for your evaluation: {self._response_format}\n\n"
                "Follow the following steps:\n"
                "1. In phase 1 of the evaluation, you need to generate your initial evaluation result.\n"
                f"2. In phase 2 of the evaluation, there are other stakeholders with different specialties who are also doing the same evaluation task, and you will participate in a debate. During debate, you will express your opinions and listen to others' perspectives to decide whether you should change your evaluation decision.\n"
                "When others express their feedback, reflect on their input from your own perspective. Consider whether their viewpoints reveal aspects you may have overlooked. If others comment on your evaluation, you should reflect on your evaluation and decide whether to accept others' comments. However, you do not need to agree with others. You must base your evaluation on your own perspective and/or specialty.\n"
                "When it's your turn to speak, you MUST follow these rules:\n"
                "- If you have meaningful comments or critiques on previous feedback, express them clearly.\n"
                "- After expressing your comments, you MUST decide whether to continue the debate:\n"
                "  * If you believe further discussion would be valuable, end your message with a clear point for discussion.\n"
                "  * If you have no more points to discuss or believe the debate has reached a conclusion, you MUST explicitly state 'NO MORE COMMENTS' followed by your final evaluation in the required format.\n"
                "**Critical Instructions**:\n"
                "1. Your feedback and score must remain grounded in your own perspective and/or area of expertise, and follow the scoring standards. You are expected to make full use of the scale to reflect the quality of the evaluation content. Avoid always giving scores around the middle (2–3). Be bold and discerning in your judgments—do not hesitate to give high scores for excellent items and low scores for poor ones.\n"
                "2. Do not generate evaluations that duplicate or closely mirror those of other agents.\n"
                "3. You must use the provided response format.\n"
                "4. IMPORTANT: The debate MUST eventually end with you saying 'NO MORE COMMENTS' and providing your final evaluation."
            )
        )

    def _extract_score(self, content: str) -> float:
        """Best-effort extraction of the numeric score from a final evaluation.

        A fenced ```json block is tried first. If it is absent, cannot be
        parsed, or holds no usable "score", the first ``"score": <number>``
        pattern in the text is used instead.

        Returns:
            The score as a float, or 0.0 when none can be found.
        """
        try:
            if "```json" in content:
                json_str = content.split("```json")[1].split("```")[0].strip()
                data = json.loads(json_str)
                if "score" in data:
                    return float(data["score"])
        except Exception as e:
            # Malformed JSON should not silently turn into a 0.0 score:
            # record the problem and fall through to the pattern search.
            print(f"Error extracting score: {e}")
            if self._logger:
                self._logger.log_system_event(f"Error extracting score: {e}")

        score_match = re.search(r'"score"\s*:\s*(\d+(\.\d+)?)', content)
        if score_match:
            return float(score_match.group(1))
        return 0.0

    @message_handler
    async def handle_initial_request(self, message: EvaluationRequest, ctx: MessageContext) -> None:
        """Produce and publish the Phase 1 evaluation (at most once per agent)."""
        if self._initial_eval_sent:  # Skip if already sent initial evaluation
            return
            
        # Create phase 1 prompt
        prompt = (
            "You are now in Phase 1 of the evaluation process. You need to provide your initial feedback and score of the content based on your perspective and/or specialty.\n\n"
            f"Evaluation task description: {self._task_description}\n\n"
            f"Content to be evaluated: {self._evaluation_content}\n\n"
            f"The related context for the evaluation content:\n{self._context}\n\n"
            f"Response format: {self._response_format}\n"
            f"**Important Reminder**:\n"
            f"- You must use the provided response format. Do not include additional text beyond the json object."
        )
        
        # Add the prompt to history
        self._history.append(UserMessage(content=prompt, source="user"))
        
        # Make an inference using the model
        model_result = await self._model_client.create([self._system_message] + self._history)
        
        # Add the response to history
        self._history.append(AssistantMessage(content=model_result.content, source=self._name))
        
        print(f"{'='*20} {self._name} Initial Evaluation {'='*20}\n{model_result.content}\n{'='*60}")
        if self._logger:
            self._logger.log_evaluation(model_result.content, self._name, "INITIAL")
        
        # Mark initial evaluation as sent
        self._initial_eval_sent = True
        
        # Publish initial evaluation
        await self.publish_message(
            InitialEvaluation(
                stakeholder_name=self._name,
                content=model_result.content
            ),
            topic_id=DefaultTopicId()
        )

    @message_handler
    async def handle_debate_start(self, message: DebateStart, ctx: MessageContext) -> None:
        """Enter Phase 2: record all initial evaluations; the first-listed agent opens the debate."""
        if not self._initial_eval_sent:  # Skip if haven't sent initial evaluation
            return
            
        self._debate_round = 1
        self._received_initial_evaluations = True
        
        # Create phase 2 prompt
        prompt = (
            "You are now entering Phase 2 of the evaluation process, where you need to participate in a debate process with other stakeholders like you.\n\n"
            f"Evaluation task description: {self._task_description}\n\n"
            "Here are the initial evaluations from all stakeholders:\n\n"
        )
        
        # This agent opens the debate when its evaluation is first in the list.
        # An empty list means nobody is first (indexing it would raise IndexError).
        is_first_agent = bool(message.initial_evaluations) and (
            message.initial_evaluations[0]["stakeholder_name"] == self._name
        )
        
        for eval_data in message.initial_evaluations:
            stakeholder_name = eval_data["stakeholder_name"]
            content = eval_data["content"]
            prompt += f"Stakeholder {stakeholder_name}'s initial evaluation:\n{content}\n\n"
        
        prompt += (
            "Your task is to evaluate these initial assessments based on your perspective and/or specialty.\n"
            "You should also reflect on the feedback from other stakeholders and decide whether to agree, disagree, or add nuances to the discussion based on your perspective and/or specialty.\n"
            f"If you have no more points to discuss, respond with 'NO MORE COMMENTS' followed by your final evaluation in this format: \n"
            f"{self._response_format}\n\n"
        )
        
        # Every agent keeps the prompt in its history, whether or not it speaks now
        self._history.append(UserMessage(content=prompt, source="user"))
        
        # If this is the first agent, automatically respond
        if is_first_agent:
            print(f"First agent {self._name} starting the debate")
            # Make an inference using the model
            model_result = await self._model_client.create([self._system_message] + self._history)
            
            # Add the response to history
            self._history.append(AssistantMessage(content=model_result.content, source=self._name))
            
            print(f"{'='*20} {self._name} Debate Round {self._debate_round} {'='*20}\n{model_result.content}\n{'='*60}")
            if self._logger:
                self._logger.log_debate_message(model_result.content, self._name, self._debate_round)
            
            # Publish debate message
            await self.publish_message(
                DebateMessage(
                    stakeholder_name=self._name,
                    content=model_result.content,
                    round=self._debate_round,
                    next_speaker=""  # Let coordinator decide next speaker
                ),
                topic_id=DefaultTopicId()
            )

    @message_handler
    async def handle_debate_message(self, message: DebateMessage, ctx: MessageContext) -> None:
        """React to a debate message and reply when this agent is the named next speaker.

        A reply containing 'NO MORE COMMENTS' (case-insensitive) marks this
        agent's debate as complete and publishes a ``ScoreEvaluation`` before
        the regular ``DebateMessage``. Timeouts and other errors during the
        model call are logged and the handler returns without publishing.
        """
        # Skip if this is our own message
        if message.stakeholder_name == self._name:
            return
            
        # Update round if coordinator signals new round
        if message.stakeholder_name == "coordinator" and message.is_new_round:
            self._debate_round = message.round
            print(f"Agent {self._name} entering round {self._debate_round}")
            
        # Update turn status based on next_speaker
        self._is_my_turn = message.next_speaker == self._name
        print(f"Agent {self._name} received message from {message.stakeholder_name}. The next speaker is {message.next_speaker}. Is this their turn? {self._is_my_turn}")
        print(f"Agent {self._name} had initial evaluations: {self._received_initial_evaluations}. Is the debate complete? {self._debate_complete}")
        
        # Only respond if it's our turn and we've received initial evaluations
        if self._is_my_turn and self._received_initial_evaluations and not self._debate_complete:
            if self._debate_round > self._max_debate_rounds:
                self._debate_complete = True
                return
            
            print(f"Agent {self._name} starting to speak in round {self._debate_round}")
            if self._logger:
                self._logger.log_system_event("="*20 + f"\nAgent {self._name} starting to speak in round {self._debate_round}\n" + "="*20)
            
            # Create reply prompt: a message mentioning "timed out" forces an
            # immediate final evaluation, anything else continues the debate.
            prompt = (
                f"Continuing the debate (Round {self._debate_round}):\n\n"
                "You need to provide your final evaluation now. Please respond with 'NO MORE COMMENTS' followed by your final evaluation in this format:\n"
                f"{self._response_format}\n\n"
                "**Important Reminder**: Your feedback and score should be based on your perspective and/or specialty. "
                "You MUST provide your final evaluation now."
            ) if "timed out" in message.content else (
                f"Continuing the debate (Round {self._debate_round}):\n\n"
                f"Now, it's your turn to speak. Based on all previous feedback from the debates and your reflection, you can decide whether to agree, disagree, or add nuances to the discussion based on your perspective and/or specialty.\n"
                f"If you have no more points to discuss, you must respond with 'NO MORE COMMENTS' followed by your final evaluation in this format: \n"
                f"{self._response_format}\n\n"
                f"**Important Reminder**: Your feedback and score should be based on your perspective and/or specialty. Avoid generating evaluations that duplicate or closely mirror those of other agents."
                f"If you have no more points to discuss, you must respond with 'NO MORE COMMENTS' and provide your final evaluation in the aforementioned format."
                f"You are expected to provide your final evaluation within {self._max_debate_rounds} rounds."
            )
            
            # Add the message to history
            self._history.append(UserMessage(content=prompt, source="user"))
            
            try:
                # Make an inference using the model with timeout
                model_result = await asyncio.wait_for(
                    self._model_client.create([self._system_message] + self._history),
                    timeout=25  # Set a timeout slightly shorter than coordinator's timeout
                )
                
                # Add the response to history
                self._history.append(AssistantMessage(content=model_result.content, source=self._name))
                
                print(f"{'='*20} {self._name} Debate Round {self._debate_round} {'='*20}\n{model_result.content}\n{'='*60}")
                if self._logger:
                    self._logger.log_debate_message(model_result.content, self._name, self._debate_round)
                
                # Reset turn status after speaking
                self._is_my_turn = False
                
                # Check if this agent is saying NO MORE COMMENTS and extract score
                content = model_result.content.strip()
                if 'NO MORE COMMENTS' in content.upper():
                    self._debate_complete = True
                    
                    # Publish final evaluation with score
                    await self.publish_message(
                        ScoreEvaluation(
                            stakeholder_name=self._name,
                            content=content,
                            score=self._extract_score(content)
                        ),
                        topic_id=DefaultTopicId()
                    )
                
                # Publish debate message with current round
                await self.publish_message(
                    DebateMessage(
                        stakeholder_name=self._name,
                        content=content,
                        round=self._debate_round,
                        next_speaker=""  # Let coordinator decide next speaker
                    ),
                    topic_id=DefaultTopicId()
                )
            except asyncio.TimeoutError:
                print(f"Model response timeout for {self._name}")
                if self._logger:
                    self._logger.log_system_event(f"Model response timeout for {self._name}")
                # Don't update turn status on timeout
                return
            except Exception as e:
                print(f"Error getting model response for {self._name}: {e}")
                if self._logger:
                    self._logger.log_system_event(f"Error getting model response for {self._name}: {e}")
                # Don't update turn status on error
                return

@default_subscription
class Coordinator(RoutedAgent):
    def __init__(
        self, 
        model_client: ChatCompletionClient,
        num_stakeholders: int,
        logger: DebateLogger,
        debate_rounds: int = 3,
        mode: str = "score",
        speaker_timeout: int = 30  # Timeout in seconds for waiting speaker response
    ) -> None:
        """Set up the coordinator's configuration and empty debate state.

        Args:
            model_client: Chat client used for speaker selection and for the
                final aggregated summary.
            num_stakeholders: Number of stakeholders expected to take part.
            logger: Debate logger that receives system events.
            debate_rounds: Maximum number of debate rounds.
            mode: Evaluation mode; "score" triggers score aggregation.
            speaker_timeout: Seconds the timeout task sleeps before treating
                the awaited speaker as timed out.
        """
        super().__init__("Coordinator")

        # Collaborators and configuration supplied by the caller.
        self._model_client = model_client
        self._logger = logger
        self._num_stakeholders = num_stakeholders
        self._debate_rounds = debate_rounds
        self._mode = mode
        self._speaker_timeout = speaker_timeout

        # Everything received so far.
        self._initial_evaluations: List[InitialEvaluation] = []
        self._stakeholder_names: List[str] = []
        # Debate messages grouped by the round number carried on each message.
        self._debate_messages: Dict[int, List[DebateMessage]] = {}
        self._debate_history: List[Dict[str, str]] = []
        self._stakeholder_message_counts: Dict[str, int] = {}

        # Progress tracking.
        self._current_round = 1
        self._current_speaker_index: int = -1
        self._debate_complete = False
        self._evaluation_complete = False
        # Stakeholders whose latest message contained "NO MORE COMMENTS".
        self._no_issue_stakeholders: set = set()
        # Stakeholders that already spoke in the current round.
        self._spoke_stakeholders: set = set()

        # Turn-taking: who we are waiting on and the task that times them out.
        self._waiting_for_speaker: Optional[str] = None
        self._speaker_timer: Optional[asyncio.Task] = None

    async def _handle_speaker_timeout(self) -> None:
        """Handle timeout for current speaker"""
        try:
            await asyncio.sleep(self._speaker_timeout)
            if self._waiting_for_speaker:
                print(f"Speaker {self._waiting_for_speaker} timed out")
                if self._logger:
                    self._logger.log_system_event(f"Speaker {self._waiting_for_speaker} timed out")
                
                # Check if the timed out speaker has provided final evaluation
                timed_out_speaker = self._waiting_for_speaker
                has_final_eval = timed_out_speaker in self._no_issue_stakeholders
                
                # Remove the timed out speaker from spoke_stakeholders to allow them to speak again if needed
                self._spoke_stakeholders.discard(timed_out_speaker)
                
                # Get available speakers, including the timed out speaker if they haven't provided final evaluation
                available_speakers = [
                    s for s in self._stakeholder_names 
                    if s not in self._no_issue_stakeholders 
                    and (s not in self._spoke_stakeholders or (s == timed_out_speaker and not has_final_eval))
                ]
                
                print(f"Available speakers after timeout: {available_speakers}")
                
                if available_speakers:
                    # If the timed out speaker hasn't provided final evaluation, give them another chance
                    if not has_final_eval and timed_out_speaker in available_speakers:
                        next_speaker = timed_out_speaker
                        print(f"Giving another chance to {next_speaker} for final evaluation")
                    else:
                        # Otherwise, select a new speaker
                        next_speaker = await self._select_next_speaker_with_llm(
                            DebateMessage(
                                stakeholder_name="coordinator",
                                content=f"Previous speaker {timed_out_speaker} timed out. Selecting new speaker.",
                                round=self._current_round
                            ),
                            available_speakers
                        )
                    
                    if next_speaker:
                        print(f"Selected speaker after timeout: {next_speaker}")
                        await self.publish_message(
                            DebateMessage(
                                stakeholder_name="coordinator",
                                content=f"Previous speaker timed out. {next_speaker}, please continue the discussion.",
                                round=self._current_round,
                                next_speaker=next_speaker
                            ),
                            topic_id=DefaultTopicId()
                        )
                        # Start new timer for the new speaker
                        self._waiting_for_speaker = next_speaker
                        if self._speaker_timer:
                            self._speaker_timer.cancel()
                        self._speaker_timer = asyncio.create_task(self._handle_speaker_timeout())
                else:
                    # If no available speakers (including those who need final evaluation)
                    if self._current_round < self._debate_rounds:
                        self._current_round += 1
                        await self._start_new_round(self._current_round)
                    else:
                        await self._aggregate_final_evaluations()
        except asyncio.CancelledError:
            pass

    @message_handler
    async def handle_debate_message(self, message: DebateMessage, ctx: MessageContext) -> None:
        """Record one debate message and decide how the debate proceeds.

        The message is stored under its round, the sender's
        "NO MORE COMMENTS" status is updated, and then one of the following
        happens: the debate ends (everyone is done, or no speaker is left),
        a new round starts (everyone spoke this round), or the message is
        forwarded with a ``next_speaker`` chosen by the model.

        Args:
            message: The debate message that was published.
            ctx: Message context supplied by the runtime (unused here).
        """
        # Aggregation flips `_evaluation_complete`; once that has happened a
        # late message must not start new rounds or re-trigger speakers.
        if self._debate_complete or self._evaluation_complete:
            return

        sender = message.stakeholder_name

        # Cancel the timeout timer if this is the message we're waiting for
        if sender == self._waiting_for_speaker:
            self._waiting_for_speaker = None
            if self._speaker_timer:
                self._speaker_timer.cancel()
                self._speaker_timer = None

        # Store the message under its own round and count it for the sender.
        self._debate_messages.setdefault(message.round, []).append(message)
        self._stakeholder_message_counts[sender] = (
            self._stakeholder_message_counts.get(sender, 0) + 1
        )

        # Check if stakeholder has no more comments and update the set
        if 'NO MORE COMMENTS' in message.content.upper():
            self._no_issue_stakeholders.add(sender)
            print(f"Stakeholder {sender} has no more comments to discuss")
        else:
            # If stakeholder raises a new point, remove them from no_issue set
            self._no_issue_stakeholders.discard(sender)

        # Add to debate history
        self._debate_history.append({
            "phase": "debate",
            "round": message.round,
            "stakeholder_name": sender,
            "content": message.content
        })

        print(f"Coordinator received debate message from {sender} (Round {message.round})")
        print(f"Current no_issue_stakeholders: {self._no_issue_stakeholders}")

        # Check if all stakeholders have no more comments
        if len(self._no_issue_stakeholders) == self._num_stakeholders:
            print("All stakeholders have no more comments to discuss. Ending debate.")
            # Schedule final aggregation with delay
            await asyncio.sleep(1)
            await self._aggregate_final_evaluations()
            return

        # Check if current round is complete
        current_round_speakers = {
            msg.stakeholder_name for msg in self._debate_messages.get(self._current_round, [])
        }
        self._spoke_stakeholders.update(current_round_speakers)
        all_spoken_current_round = len(current_round_speakers) == self._num_stakeholders

        print(f"Current round: {self._current_round}, Speakers this round: {current_round_speakers}")
        print(f"All spoken this round: {all_spoken_current_round}")

        if all_spoken_current_round:
            print(f"Round {self._current_round} complete. All stakeholders have spoken.")
            # Move to next round if not at max rounds
            if self._current_round < self._debate_rounds:
                self._current_round += 1
                await self._start_new_round(self._current_round)
            else:
                print(f"Reached maximum rounds ({self._debate_rounds}). Ending debate.")
                await self._aggregate_final_evaluations()
            return

        # Stakeholders that are neither finished nor already heard this round.
        available_speakers = [
            s.stakeholder_name for s in self._initial_evaluations
            if s.stakeholder_name not in self._no_issue_stakeholders
            and s.stakeholder_name not in self._spoke_stakeholders
        ]

        if not available_speakers:
            print("No available speakers. Ending debate.")
            await self._aggregate_final_evaluations()
            return

        # Use LLM to select next speaker based on message content
        next_speaker = await self._select_next_speaker_with_llm(message, available_speakers)

        if not next_speaker or next_speaker in self._no_issue_stakeholders:
            print("Selected speaker already said NO MORE COMMENTS. Ending debate.")
            await self._aggregate_final_evaluations()
            return

        print(f"Round {self._current_round} continuing. Selected next speaker: {next_speaker}")

        # Start timer for the new speaker
        self._waiting_for_speaker = next_speaker
        if self._speaker_timer:
            self._speaker_timer.cancel()
        self._speaker_timer = asyncio.create_task(self._handle_speaker_timeout())

        # Forward the message to the next speaker
        await self.publish_message(
            DebateMessage(
                stakeholder_name=sender,
                content=message.content,
                round=self._current_round,
                next_speaker=next_speaker
            ),
            topic_id=DefaultTopicId()
        )

    async def _aggregate_final_evaluations(self) -> None:
        """Helper method to aggregate final evaluations and end debate"""
        if self._evaluation_complete:
            return
            
        self._evaluation_complete = True
        print("Generating aggregated evaluation.")
        
        # Create system prompt for aggregation
        system_prompt = (
            "You are an impartial evaluation aggregator. Your task is to review the evaluations from "
            "multiple stakeholders and provide a comprehensive summary that fairly represents all perspectives."
        )
        
        # Prepare aggregated evaluation content
        aggregated_content = self._format_aggregation_content()
        
        if self._mode == "score":
            # Calculate average score from the final evaluations received during debate
            scores = []
            for stakeholder_name in self._no_issue_stakeholders:
                # Find the last message from this stakeholder that contains NO MORE COMMENTS
                for round_messages in reversed(self._debate_messages.values()):
                    for msg in round_messages:
                        if msg.stakeholder_name == stakeholder_name and 'NO MORE COMMENTS' in msg.content.upper():
                            try:
                                if "```json" in msg.content:
                                    json_str = msg.content.split("```json")[1].split("```")[0].strip()
                                    data = json.loads(json_str)
                                    if "score" in data:
                                        scores.append(float(data["score"]))
                                else:
                                    score_match = re.search(r'"score"\s*:\s*(\d+(\.\d+)?)', msg.content)
                                    if score_match:
                                        scores.append(float(score_match.group(1)))
                            except Exception as e:
                                print(f"Error extracting score for {stakeholder_name}: {e}")
                            break
                    if scores:  # If we found a score, stop looking in earlier rounds
                        break
            
            average_score = sum(scores) / len(scores) if scores else 0
            print(f"Calculated average score: {average_score}")  # Debug print
            
            # Create user prompt for score mode
            user_prompt = (
                "Your summary should include key areas of agreement and disagreement, and an overall assessment that reflects the range of perspectives.\n"
                f"You are given all final evaluations in {aggregated_content} and their average score in {average_score}.\n"
                f"Return A clear, concise synthesis of stakeholder feedback, highlighting consensus, divergence, and an overall interpretation."
            )
            
            # Make an inference using the model
            model_result = await self._model_client.create([
                SystemMessage(content=system_prompt),
                UserMessage(content=user_prompt, source="coordinator")
            ])
            
            summary = model_result.content
            print(f"{'='*20} Final Aggregated Evaluation {'='*20}\n{summary}\n{'='*60}")
            if self._logger:
                self._logger.log_system_event(f"Final Aggregated Evaluation: {summary}")
            
            # Publish DebateComplete
            await self.publish_message(
                DebateCompleteScore(
                    content=summary,
                    average_score=average_score
                ),
                topic_id=DefaultTopicId()
            )
            print(f"Published DebateCompleteScore with average_score: {average_score}")  # Debug print
        else:
            # Handle ranking mode if needed
            pass

    def _format_aggregation_content(self) -> str:
        """Format the aggregation content"""
        content = "=== Final Evaluations from All Stakeholders ===\n\n"
        
        # Show each stakeholder's final evaluation (NO MORE COMMENTS message)
        for stakeholder_name in self._no_issue_stakeholders:
            final_eval = None
            # Find the last NO MORE COMMENTS message from this stakeholder
            for round_messages in reversed(self._debate_messages.values()):
                for msg in round_messages:
                    if msg.stakeholder_name == stakeholder_name and 'NO MORE COMMENTS' in msg.content.upper():
                        final_eval = msg.content
                        break
                if final_eval:
                    break
            
            if final_eval:
                content += f"Stakeholder {stakeholder_name}:\n{final_eval}\n\n"
        
        return content

    async def _select_next_speaker_with_llm(self, current_message: DebateMessage, available_speakers: List[str]) -> str:
        """Use LLM to select the next speaker based on message content and available speakers"""
        if not available_speakers:
            return ""
        
        if not self._stakeholder_names:
            self._stakeholder_names = [eval_msg.stakeholder_name for eval_msg in self._initial_evaluations]
            
        # Create system prompt for speaker selection
        system_prompt = (
            "You are a coordinator of a debate. Your task is to select the next speaker in a debate "
            "based on the current speaker's message content and the list of available speakers. "
            "Choose the speaker who would be most appropriate to respond given their expertise and perspective."
        )
        
        # Create user prompt with current message and available speakers
        user_prompt = (
            f"Current speaker {current_message.stakeholder_name} said:\n{current_message.content}\n\n"
            f"Available speakers and their roles:\n"
        )
        
        # Add available speakers info from initial evaluations
        for eval_msg in self._initial_evaluations:
            if eval_msg.stakeholder_name in available_speakers:
                user_prompt += f"- {eval_msg.stakeholder_name}\n"
        
        user_prompt += (
            "\nBased on the current message content and available speakers, who would be the most appropriate "
            "next speaker? Consider:\n"
            "1. Who might have relevant expertise to address points raised\n"
            "2. Who might have a contrasting perspective that could enrich the discussion\n"
            "3. Who hasn't spoken recently in this round\n\n"
            "Return ONLY the name of the next speaker, exactly as shown in the available speakers list."
        )
        
        # Make an inference using the model
        model_result = await self._model_client.create([
            SystemMessage(content=system_prompt),
            UserMessage(content=user_prompt, source="coordinator")
        ])
        
        # Extract the selected speaker name
        selected_speaker = model_result.content.strip()
        
        # Validate that selected speaker is in available speakers
        if selected_speaker in available_speakers:
            print(f"LLM selected next speaker: {selected_speaker}")
            # Start timer for the new speaker
            self._waiting_for_speaker = selected_speaker
            if self._speaker_timer:
                self._speaker_timer.cancel()
            self._speaker_timer = asyncio.create_task(self._handle_speaker_timeout())
            return selected_speaker
        else:
            print(f"LLM selection invalid, falling back to first available speaker")
            selected_speaker = available_speakers[0]
            # Start timer for the fallback speaker
            self._waiting_for_speaker = selected_speaker
            if self._speaker_timer:
                self._speaker_timer.cancel()
            self._speaker_timer = asyncio.create_task(self._handle_speaker_timeout())
            return selected_speaker

    async def _start_new_round(self, round_num: int) -> None:
        """Helper method to start a new debate round"""
        print(f"\n{'='*20} Starting Round {round_num} {'='*20}")
        available_speakers = [s for s in self._stakeholder_names if s not in self._no_issue_stakeholders]
        self._spoke_stakeholders = set()
        print(f"Available speakers for round {round_num}: {available_speakers}")
        
        if available_speakers:
            next_speaker = available_speakers[0]
            print(f"Selected first speaker for round {round_num}: {next_speaker}")
            
            # Start timer for the speaker
            self._waiting_for_speaker = next_speaker
            if self._speaker_timer:
                self._speaker_timer.cancel()
            self._speaker_timer = asyncio.create_task(self._handle_speaker_timeout())
            
            # Create a new message to start the round
            await self.publish_message(
                DebateMessage(
                    stakeholder_name="coordinator",
                    content=f"Starting round {round_num}. Please continue the discussion.",
                    round=round_num,
                    next_speaker=next_speaker,
                    is_new_round=True
                ),
                topic_id=DefaultTopicId()
            )
        else:
            print("No available speakers for new round. Ending debate.")
            await self._aggregate_final_evaluations()

    @message_handler
    async def handle_initial_evaluation(self, message: InitialEvaluation, ctx: MessageContext) -> None:
        """Collect an initial evaluation; start the debate once all have arrived.

        When the number of collected evaluations equals ``_num_stakeholders``,
        every evaluation is recorded in the debate history under the
        "initial" phase and a ``DebateStart`` carrying all of them is published.

        Args:
            message: The stakeholder's initial evaluation.
            ctx: Message context supplied by the runtime (unused here).
        """
        self._initial_evaluations.append(message)
        print(f"Coordinator received initial evaluation from {message.stakeholder_name}")

        # Keep waiting until every stakeholder has reported.
        if len(self._initial_evaluations) != self._num_stakeholders:
            return

        print(f"All {self._num_stakeholders} initial evaluations received. Starting debate phase.")

        # Build the payload for the debate start message and the history
        # entries in a single pass over the collected evaluations.
        evaluations_data = []
        for received in self._initial_evaluations:
            evaluations_data.append(
                {"stakeholder_name": received.stakeholder_name, "content": received.content}
            )
            self._debate_history.append({
                "phase": "initial",
                "stakeholder_name": received.stakeholder_name,
                "content": received.content
            })

        # Publish message to start debate
        start_signal = DebateStart(initial_evaluations=evaluations_data)
        await self.publish_message(start_signal, topic_id=DefaultTopicId())

@default_subscription
class CompletionHandler(RoutedAgent):
    def __init__(self, completion_event: asyncio.Event, debate_result: Dict) -> None:
        """Keep references to the shared result dict and the completion event.

        Args:
            completion_event: Event that is set once the debate-complete
                message has been handled.
            debate_result: Dict that the handlers of this agent fill in.
        """
        super().__init__("CompletionHandler")
        self._completion_event = completion_event
        self._debate_result = debate_result

    def _extract_json_from_content(self, content: str) -> Dict:
        """Extract and parse JSON data from content string"""
        try:
            # Try to extract JSON from markdown code block
            if "```json" in content:
                json_str = content.split("```json")[1].split("```")[0].strip()
                return json.loads(json_str)
            # Try to parse the entire content as JSON
            return json.loads(content)
        except Exception as e:
            # If parsing fails, try to extract score using regex
            score_match = re.search(r'"score"\s*:\s*(\d+(\.\d+)?)', content)
            feedback_match = re.search(r'"feedback"\s*:\s*"([^"]+)"', content)
            
            result = {}
            if feedback_match:
                result["feedback"] = feedback_match.group(1)
            if score_match:
                result["score"] = float(score_match.group(1))
            else:
                print(f"Error parsing JSON content: {e}")
            return result

    @message_handler
    async def handle_initial_evaluation(self, message: InitialEvaluation, ctx: MessageContext) -> None:
        """Store the parsed initial evaluation under the stakeholder's name.

        Args:
            message: The stakeholder's initial evaluation.
            ctx: Message context supplied by the runtime (unused here).
        """
        self._debate_result["initial_evaluations"][message.stakeholder_name] = (
            self._extract_json_from_content(message.content)
        )

    @message_handler
    async def handle_score_evaluation(self, message: ScoreEvaluation, ctx: MessageContext) -> None:
        """Store a stakeholder's final score evaluation in the debate result.

        The "NO MORE COMMENTS" marker is stripped from the content, the
        remaining text is parsed, and the score carried on the message
        overrides whatever score was parsed from the text.

        Args:
            message: Final evaluation with the already-extracted score.
            ctx: Message context supplied by the runtime (unused here).
        """
        # The coordinator detects the marker case-insensitively, so remove it
        # the same way before parsing.
        content = re.sub(r"NO MORE COMMENTS", "", message.content, flags=re.IGNORECASE).strip()
        evaluation_data = self._extract_json_from_content(content)
        if not isinstance(evaluation_data, dict):
            # Guard against a non-object parse result so item assignment works.
            evaluation_data = {}
        evaluation_data["score"] = message.score  # Use the extracted score from ScoreEvaluation
        self._debate_result["final_evaluations"][message.stakeholder_name] = evaluation_data

    @message_handler
    async def handle_ranking_evaluation(self, message: RankingEvaluation, ctx: MessageContext) -> None:
        """Store a stakeholder's final ranking evaluation in the debate result.

        The "NO MORE COMMENTS" marker is stripped from the content, the
        remaining text is parsed, and non-empty rankings carried on the
        message override whatever was parsed from the text.

        Args:
            message: Final evaluation with the already-extracted rankings.
            ctx: Message context supplied by the runtime (unused here).
        """
        # The coordinator detects the marker case-insensitively, so remove it
        # the same way before parsing.
        content = re.sub(r"NO MORE COMMENTS", "", message.content, flags=re.IGNORECASE).strip()
        evaluation_data = self._extract_json_from_content(content)
        if not isinstance(evaluation_data, dict):
            # Guard against a non-object parse result so item assignment works.
            evaluation_data = {}
        if message.rankings:
            evaluation_data["rankings"] = message.rankings
        self._debate_result["final_evaluations"][message.stakeholder_name] = evaluation_data

    @message_handler
    async def handle_debate_complete(self, message: Union[DebateCompleteScore, DebateCompleteRanking], ctx: MessageContext) -> None:
        """Record the aggregated outcome and signal that the debate is finished.

        Handles both score and ranking completion messages: the feedback is
        taken from JSON in the content when available (otherwise the raw
        content is used), the average score or rankings are stored, and the
        completion event is set.

        Args:
            message: The debate-complete message from the coordinator.
            ctx: Message context supplied by the runtime (unused here).
        """
        print(f"Received debate complete message: {message.content}")  # Debug print

        # Extract feedback from the content
        aggregated_data = self._extract_json_from_content(message.content)
        # Also treat a non-object result (e.g. a summary that parses as a bare
        # JSON list or string) as "nothing extracted", so `.get` below is
        # always valid and the completion event is reached.
        if not isinstance(aggregated_data, dict) or not aggregated_data:
            print("Warning: Could not extract JSON data from aggregated content")
            aggregated_data = {"feedback": message.content}

        # Update the debate result with aggregated data
        aggregate = self._debate_result["aggregate"]
        aggregate["feedback"] = aggregated_data.get("feedback", message.content)

        if isinstance(message, DebateCompleteScore):
            self._debate_result["type"] = "score"
            aggregate["average_score"] = message.average_score
            print(f"Added average score to aggregate: {message.average_score}")  # Debug print
        else:  # DebateCompleteRanking
            self._debate_result["type"] = "ranking"
            if message.rankings:
                aggregate["rankings"] = message.rankings
                print(f"Added rankings to aggregate: {message.rankings}")  # Debug print

        print(f"Final aggregate: {self._debate_result['aggregate']}")  # Debug print
        self._completion_event.set()

def load_json(file_path):
    """Read and parse a JSON file.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON value.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

def save_json(data, file_path):
    """Serialize ``data`` as JSON with 4-space indentation into ``file_path``.

    Args:
        data: JSON-serializable value to write.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, "w") as handle:
        json.dump(data, handle, indent=4)

def process_evaluation_content(evaluation_content_df):
    """Convert the evaluation DataFrame into a list of per-row dictionaries.

    Each row contributes one dict whose keys are "title", "story", "id",
    "evaluation" and "annotator", read from the columns "story_title",
    "story_content", "section_id", "QA-pair" and "Generator (Model/Human)".

    Args:
        evaluation_content_df: DataFrame containing the columns listed above.

    Returns:
        A list with one dict per row, in row order.
    """
    # (output key, source column) pairs, in the order the keys are emitted.
    column_for_field = (
        ("title", "story_title"),
        ("story", "story_content"),
        ("id", "section_id"),
        ("evaluation", "QA-pair"),
        ("annotator", "Generator (Model/Human)"),
    )
    return [
        {field: record[column] for field, column in column_for_field}
        for _, record in evaluation_content_df.iterrows()
    ]

async def run_debate(
    id: str,
    story_title: str,
    annotator: str,
    stakeholders: List[Dict],
    task_description: str,
    evaluation_content: Any,
    context: str,
    response_format: str,
    group_name: str,
    model_config: ModelConfig,
    mode: str = "score",
    dimension: str = None,
    debate_rounds: int = 3,
    timeout: int = 600
):
    """Run one stakeholder debate over a single evaluation item.

    Registers a completion handler, one agent per stakeholder and a
    coordinator on a fresh runtime, publishes the evaluation request and
    waits until the completion event is set or ``timeout`` seconds pass.

    Args:
        id: Identifier of the evaluation item (stored in the metadata).
        story_title: Title stored in the metadata and passed to the logger.
        annotator: Annotator label stored in the metadata.
        stakeholders: Persona dicts; each must contain a "Name" key.
        task_description: Task description handed to every stakeholder agent.
        evaluation_content: Content to evaluate; published as the request.
        context: Context handed to every stakeholder agent.
        response_format: Response format handed to every stakeholder agent.
        group_name: Stakeholder group name stored in the metadata.
        model_config: Configuration used to create the model client.
        mode: Evaluation mode passed to the agents and the coordinator.
        dimension: Optional evaluation dimension.
        debate_rounds: Maximum number of debate rounds.
        timeout: Seconds to wait for the debate to complete.

    Returns:
        The populated debate result dict, or ``{"error": "Debate timed out"}``
        when the debate did not complete in time.
    """
    # Initialize model client based on configuration
    model_client = create_model_client(model_config)

    # Create agent runtime
    agent_runtime = SingleThreadedAgentRuntime()

    # Create completion event and debate result with metadata
    completion_event = asyncio.Event()
    debate_result = {
        "metadata": {
            "id": id,
            "story_title": story_title,
            "annotator": annotator,
            "group_name": group_name,
            "mode": mode,
            "dimension": dimension,
            "debate_rounds": debate_rounds,
            "model_name": model_config.model_name,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "evaluation_content": evaluation_content,
            "context": context
        },
        "initial_evaluations": {},
        "final_evaluations": {},
        "aggregate": {
            "feedback": None,
            "average_score": None,
            "rankings": None
        }
    }

    # Initialize debate logger
    logger = DebateLogger(id, story_title, group_name, annotator, dimension=dimension)
    logger.log_system_event("Debate session started")

    # Create completion handler
    await CompletionHandler.register(
        agent_runtime,
        "completion_handler",
        lambda: CompletionHandler(completion_event, debate_result)
    )

    # Create stakeholder agents
    for stakeholder in stakeholders:
        stakeholder_name = stakeholder['Name']
        # Reduce the display name to lowercase letters, digits, "_" and "-".
        agent_name = re.sub(r'[^a-zA-Z0-9_-]', '_', stakeholder_name.lower())
        agent_name = re.sub(r'_+', '_', agent_name).strip('_')

        await StakeholderAgent.register(
            agent_runtime,
            agent_name,
            # Bind the loop variables as defaults so each factory keeps its own persona.
            lambda s=stakeholder, n=agent_name: StakeholderAgent(
                model_client=model_client,
                name=n,
                persona=s,
                task_description=task_description,
                evaluation_content=evaluation_content,
                context=context,
                response_format=response_format,
                max_debate_rounds=debate_rounds,
                logger=logger,
                mode=mode,
                dimension=dimension
            )
        )
        logger.log_system_event(f"Registered stakeholder agent: {agent_name}")

    # Create coordinator agent
    await Coordinator.register(
        agent_runtime,
        "coordinator",
        lambda: Coordinator(
            model_client=model_client,
            num_stakeholders=len(stakeholders),
            logger=logger,
            debate_rounds=debate_rounds,
            mode=mode
        )
    )
    logger.log_system_event("Registered coordinator agent")

    # Start the evaluation process
    agent_runtime.start()
    completed = False
    try:
        await agent_runtime.publish_message(
            EvaluationRequest(content=evaluation_content),
            topic_id=DefaultTopicId()
        )
        logger.log_system_event("Started evaluation process")

        # Wait for completion or timeout
        await asyncio.wait_for(completion_event.wait(), timeout=timeout)
        completed = True
        logger.log_system_event("Debate completed successfully")

        logger.save_results(debate_result)
        return debate_result
    except asyncio.TimeoutError:
        logger.log_system_event("Debate timed out")
        return {"error": "Debate timed out"}
    finally:
        try:
            if completed:
                await agent_runtime.stop_when_idle()
            else:
                # After a timeout or failure the runtime may never become
                # idle (handlers can still be waiting on the model), so stop
                # it directly instead of waiting for the queue to drain.
                await agent_runtime.stop()
        finally:
            # Always release the model client, even if stopping raised.
            await model_client.close()
            logger.log_system_event("Debate session ended")

def _debate_agent_key(persona_name):
    """Normalise a persona display name into the key used for CSV score columns.

    The name is lower-cased, every character outside ``[a-zA-Z0-9_-]`` becomes
    ``_``, runs of underscores are collapsed and leading/trailing underscores
    are stripped.
    """
    key = re.sub(r'[^a-zA-Z0-9_-]', '_', persona_name.lower())
    return re.sub(r'_+', '_', key).strip('_')


def _safe_artifact_name(value):
    """Return ``str(value)`` with characters outside ``[a-zA-Z0-9_.-]`` replaced by ``_``."""
    return re.sub(r'[^a-zA-Z0-9_\-\.]', '_', str(value))


async def _await_with_index(index, debate):
    """Await *debate* and return ``(index, result)``.

    ``as_completed`` yields in completion order, so each awaitable has to carry
    its own submission index to be matched with its metadata afterwards.
    """
    return index, await debate


def _upload_debate_artifact(eval_item, stakeholder_name, csv_path):
    """Upload the newest log/result files of one debate plus the score CSV to wandb."""
    item_id = _safe_artifact_name(eval_item["id"])
    title = _safe_artifact_name(eval_item["title"])
    group = _safe_artifact_name(stakeholder_name)
    annotator = _safe_artifact_name(eval_item["annotator"])

    artifact = wandb.Artifact(
        f'qwen_qag_debate_logs_and_csv_{title}_{item_id}_{group}', type='output'
    )
    file_stem = f'{item_id}_{title}_{group}_{annotator}'
    patterns = (
        f'qwen_qag_debate_logs/debate_log_{file_stem}_*.txt',
        f'qwen_qag_debate_logs/evaluation_results_{file_stem}_*.json',
    )
    for pattern in patterns:
        matches = glob.glob(pattern)
        if matches:
            # Several runs may have written files for the same debate; keep the newest.
            latest = max(matches, key=os.path.getmtime)
            artifact.add_file(latest)
            wandb.save(latest)
    artifact.add_file(csv_path)
    wandb.log_artifact(artifact)


async def run_debate_by_group(evaluation_list, stakeholders, response_format_scoring, task_description, model_config: ModelConfig):
    """Run one scoring debate per (evaluation item, stakeholder group) and record the scores.

    The input score sheet ``StorySparkQA Experiment.csv`` is copied to
    ``StorySparkQA_Experiment_Qwen.csv`` with one extra column per stakeholder
    group and ``<agent>_initial`` / ``<agent>_final`` columns per persona.
    Debates are launched concurrently in batches of five evaluation items; as
    each debate finishes its scores are written to the matching CSV row
    (matched on ``story_title``, ``section_id`` and ``Generator``), and the
    logs are uploaded to wandb.

    Args:
        evaluation_list: Sequence of dicts providing the keys ``id``,
            ``title``, ``annotator``, ``evaluation`` and ``story``.
        stakeholders: Mapping of group name to a list of persona dicts, each
            with at least a ``'Name'`` entry.
        response_format_scoring: Response-format prompt forwarded to ``run_debate``.
        task_description: Task prompt forwarded to ``run_debate``.
        model_config: Model configuration forwarded to ``run_debate``.

    Returns:
        A list with the result dict of every debate from every batch, ordered
        by evaluation item and then by stakeholder group. Empty when
        ``evaluation_list`` is empty.
    """
    source_csv = "StorySparkQA Experiment.csv"
    output_csv = "StorySparkQA_Experiment_Qwen.csv"

    # Read the CSV file
    csv_score_file = pd.read_csv(source_csv)
    # Extend the header: first one aggregate column per stakeholder group ...
    for group_name in stakeholders:
        if group_name not in csv_score_file.columns:
            csv_score_file[group_name] = None
    # ... then an initial and a final score column for every persona.
    for personas in stakeholders.values():
        for persona in personas:
            agent_name = _debate_agent_key(persona['Name'])
            for column in (f"{agent_name}_initial", f"{agent_name}_final"):
                if column not in csv_score_file.columns:
                    csv_score_file[column] = None
    csv_score_file.to_csv(output_csv, index=False)

    csv_score_file = pd.read_csv(output_csv)
    # Process the evaluation items in order, a batch at a time
    batch_size = 5
    all_results = []
    for batch_start in range(0, len(evaluation_list), batch_size):
        batch_end = min(batch_start + batch_size, len(evaluation_list))
        current_batch = evaluation_list[batch_start:batch_end]
        print(f"\nProcessing batch of evaluation items {batch_start+1} to {batch_end}")

        # Create all tasks for current batch of evaluation items and all stakeholder groups
        all_tasks = []
        all_meta = []

        # First level: evaluation items in the current batch
        for eval_idx, eval_item in enumerate(current_batch, start=batch_start):
            # Second level: every stakeholder group for that evaluation item
            for stakeholder_name, stakeholder_list in stakeholders.items():
                debate = run_debate(
                    id=eval_item["id"],
                    story_title=eval_item["title"],
                    annotator=eval_item["annotator"],
                    stakeholders=stakeholder_list,
                    task_description=task_description,
                    evaluation_content=eval_item["evaluation"],
                    context=eval_item["story"],
                    group_name=stakeholder_name,
                    response_format=response_format_scoring,
                    mode="score",
                    model_config=model_config,
                    debate_rounds=3,
                    timeout=600
                )
                # Tag the debate with its slot so the result can be paired with
                # the right metadata regardless of completion order.
                all_tasks.append(_await_with_index(len(all_meta), debate))
                all_meta.append({
                    "eval_idx": eval_idx,
                    "eval_item": eval_item,
                    "stakeholder_name": stakeholder_name
                })

        # Run all tasks in current batch in parallel
        print(f"Running all debates ({len(all_tasks)} total tasks) in parallel...")
        results = [None] * len(all_tasks)

        # Process results as they complete
        for fut in tqdm_asyncio.as_completed(all_tasks, total=len(all_tasks)):
            idx, result = await fut
            # Store first so that no later `continue` can drop the result.
            results[idx] = result
            meta = all_meta[idx]
            eval_item = meta["eval_item"]
            stakeholder_name = meta["stakeholder_name"]

            print(f"\nDebate Result for {eval_item['title']}-{eval_item['id']} - {stakeholder_name}:")
            if "error" in result:
                print("Error:", result["error"])
                continue

            initial_evaluations = result["initial_evaluations"]
            final_evaluations = result["final_evaluations"]
            agg_eval = result["aggregate"]

            print("Initial Evaluations:")
            for stakeholder, eval_data in initial_evaluations.items():
                print(f"  {stakeholder}: Score = {eval_data.get('score', 'N/A')}")
            print("\nFinal Evaluations:")
            for stakeholder, eval_data in final_evaluations.items():
                print(f"  {stakeholder}: Score = {eval_data.get('score', 'N/A')}")
            print("\nAggregated Evaluation:")
            print(f"  Feedback: {agg_eval.get('feedback', 'N/A')}")
            print(f"  Average Score: {agg_eval.get('average_score', 'N/A')}")

            # Save scores to CSV
            mask = (
                (csv_score_file["story_title"] == eval_item["title"]) &
                (csv_score_file["section_id"].astype(str) == str(eval_item["id"])) &
                (csv_score_file["Generator"] == eval_item["annotator"])
            )
            if not mask.any():
                print(f"No matching row found for story_title={eval_item['title']}, "
                      f"section_id={eval_item['id']}, annotator={eval_item['annotator']}")
                continue
            print(f"Found matching row for {eval_item['title']}, updating scores...")

            # Update the group aggregate score
            if stakeholder_name in csv_score_file.columns:
                csv_score_file.loc[mask, stakeholder_name] = agg_eval.get('average_score')

            # Update initial evaluations
            for stakeholder, eval_data in initial_evaluations.items():
                initial_key = f"{stakeholder}_initial"
                if initial_key in csv_score_file.columns:
                    csv_score_file.loc[mask, initial_key] = eval_data.get('score')

            # Update final evaluations, remembering the last entry for the wandb record
            last_stakeholder, last_final_score = None, None
            for stakeholder, eval_data in final_evaluations.items():
                final_key = f"{stakeholder}_final"
                if final_key in csv_score_file.columns:
                    csv_score_file.loc[mask, final_key] = eval_data.get('score')
                last_stakeholder, last_final_score = stakeholder, eval_data.get('score', None)

            # Save the updated DataFrame
            csv_score_file.to_csv(output_csv, index=False)
            print("Scores saved successfully")
            # NOTE(review): one record per debate carrying only the last final
            # evaluation, as before; confirm whether per-stakeholder records
            # were intended.
            wandb.log({
                "step": meta["eval_idx"] + 1,
                "story_title": eval_item["title"],
                "group": stakeholder_name,
                "stakeholder": last_stakeholder,
                "final_score": last_final_score,
            })
            _upload_debate_artifact(eval_item, stakeholder_name, output_csv)

        all_results.extend(results)
        print(f"Completed all debates for evaluation item {batch_end}")
    return all_results

async def main():
    """Entry coroutine: assemble prompts, personas and model config, then run all debates.

    Reads the stakeholder personas from
    ``Qwen235Opinions2Personas/CreatedPersonas.json`` and the items to evaluate
    from ``StorySparkQA Human Score.csv``, then hands everything to
    ``run_debate_by_group`` using the locally served Qwen model.
    """
    # Stakeholder personas, grouped
    personas_by_group = load_json("Qwen235Opinions2Personas/CreatedPersonas.json")

    # Prompt fragment that tells every agent how to format its scored answer
    scoring_format_prompt = """
Present a json object with the following format: 
```json 
{
    "feedback": "..." (use a few sentences to explain your evaluation, including the strengths and weaknesses of the evaluation content), 
    "score": 1 / 2 / 3 / 4 / 5
}
```
Your score should be on a five-point Likert scale, , with the following standards: 1 - Strongly Disagree; 2 - Disagree; 3 - Neither agree nor disagree; 4 - Agree; 5 - Strongly Agree.

You are expected to make full use of the scale to reflect the quality of the evaluation content. Avoid always giving scores around the middle (2–3). Be bold and discerning in your judgments—do not hesitate to give high scores for excellent items and low scores for poor ones.
    """

    # Prompt fragment describing what is being evaluated
    task_prompt = """
You need to evaluate the quality of AI-generated question-answer pairs from the storybook content. These AI-generated question-answer pairs are designed for the interactive storybook reading activity between parents and children aged 3 to 6, and should be grammatically correct and fluent in English. Parents expect to ask questions that are grounded in the storybook content, but also introduce real-world common knowledge beyond the story content.
    """

    # Example configurations for each supported backend; switch by changing the key below
    available_configs = {
        "openai": ModelConfig(
            provider=ModelProvider.OPENAI,
            model_name="gpt-4",
            model_kwargs={"temperature": 0.7}
        ),
        "claude": ModelConfig(
            provider=ModelProvider.CLAUDE,
            model_name="your model id"
        ),
        # Qwen served through the local sglang endpoint (OpenAI-compatible client)
        "qwen": ModelConfig(
            provider=ModelProvider.OPENAI,
            model_name=MODEL_NAME,
            qwen_client=client
        ),
    }
    chosen_config = available_configs["qwen"]  # or "openai" / "claude"

    # Items to be debated
    human_score_df = pd.read_csv("StorySparkQA Human Score.csv")
    items_to_evaluate = process_evaluation_content(human_score_df)

    await run_debate_by_group(
        evaluation_list=items_to_evaluate,
        stakeholders=personas_by_group,
        response_format_scoring=scoring_format_prompt,
        task_description=task_prompt,
        model_config=chosen_config,
    )

if __name__ == "__main__":
    # Script entry point: run every debate under a single wandb run, then
    # upload the collected logs and the score CSV as one artifact.
    wandb.init(
        project="multi_stakeholder_debate", 
        name="debate_run_qwen",           
        notes="update debate log and csv"
    )
    try:
        # delete the existing debate logs folder so only this run's logs are uploaded
        if os.path.exists('qwen_qag_debate_logs'):
            shutil.rmtree('qwen_qag_debate_logs')
        asyncio.run(main())
        artifact = wandb.Artifact('all_qwen_qag_debate_logs_and_csv', type='output')
        for log_file in glob.glob('qwen_qag_debate_logs/*.txt'):
            artifact.add_file(log_file)
        artifact.add_file('StorySparkQA_Experiment_Qwen.csv')
        wandb.log_artifact(artifact)
        # Only reached on success, so a crashed run is not reported as finished cleanly.
        wandb.finish()
    finally:
        # The sglang server was launched as a subprocess at import time; shut it
        # down on success and on failure alike so it does not keep running.
        terminate_process(server_process)
