#!/usr/bin/env python3
"""
Jury Debate System for MOE Candidate Profiling
==============================================

This module implements a jury-inspired debate system with actor-critic architecture:
1. General Judge (Actor) - Evaluates responses favorably
2. Critic Judge - Challenges and provides counterarguments
3. Intensive debate rounds (max 3-4) with consensus tracking
4. Integration with existing MOE evaluation architecture

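The system uses two different LLMs for the two judge roles (currently configured as
`provider-3/gpt-5-chat` for the General Judge and `provider-3/claude-sonnet-4` for the
Critic Judge) and implements a structured debate format to improve evaluation accuracy
while reducing API calls.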
"""

import os
import json
import time
import logging
import argparse
import yaml
import requests
import urllib3
from collections import defaultdict

# Suppress SSL warnings for requests library
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from tqdm import tqdm
from typing import List, Dict, Any, Tuple, Optional
import re
from datetime import datetime
from dataclasses import dataclass
from enum import Enum

# --- Logging Setup ---
def setup_logging():
    """Configure root logging for the jury debate system.

    Every handler currently attached to the root logger is detached and
    closed, then the root logger is reconfigured at INFO level with two
    handlers: a console stream handler and a file handler that truncates
    ``jury_debate_system.log`` in the current working directory.
    """
    # Iterate over a copy: removeHandler mutates the underlying list.
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
        # Close the detached handler so calling this function repeatedly does
        # not leak the previously opened log file.
        handler.close()
    logging.basicConfig(
        level=logging.INFO,
        format='[%(levelname)s] %(message)s',
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler("jury_debate_system.log", mode='w')
        ]
    )

class JudgeRole(Enum):
    """Roles a judge can take in the debate system.

    Members are used as dictionary keys for the per-role model and prompt
    tables; each member's value is a short string identifier for the role.
    """
    GENERAL_JUDGE = "general_judge"  # Actor - evaluates favorably
    CRITIC_JUDGE = "critic_judge"     # Critic - challenges and provides counterarguments

class DebateStatus(Enum):
    """Outcome states recorded on a finished (or failed) debate."""
    # Presumably marks a debate still in progress; not referenced in the
    # visible part of this file -- TODO confirm usage.
    ONGOING = "ongoing"
    # The judges satisfied the consensus check before the round limit.
    CONSENSUS_REACHED = "consensus_reached"
    # All rounds were played without consensus; the winner comes from averaged scores.
    MAX_ROUNDS_REACHED = "max_rounds_reached"
    # A judge API call failed, so no verdict could be produced.
    ERROR = "error"

@dataclass
class DebateRound:
    """Record of a single debate round: both judges' arguments and scores,
    plus the result of the consensus check performed for that round."""
    round_number: int  # 1-based index of the round within the debate
    general_judge_argument: str  # evidence text parsed from the General Judge's reply
    critic_judge_argument: str  # evidence text parsed from the Critic Judge's reply
    general_judge_score_a: float  # General Judge's score for Assistant/Response 1
    general_judge_score_b: float  # General Judge's score for Assistant/Response 2
    critic_judge_score_a: float  # Critic Judge's score for Assistant/Response 1
    critic_judge_score_b: float  # Critic Judge's score for Assistant/Response 2
    consensus_reached: bool = False  # whether the consensus check passed in this round
    consensus_winner: Optional[str] = None  # "1", "2" or "tie" when consensus was reached, else None

@dataclass
class DebateResult:
    """Final outcome of a debate over one instance."""
    instance_id: str  # the instance's 'id' value ('unknown' when the key is absent)
    scenario: str  # scenario label the debate was run under
    dimensions: List[str]  # evaluation dimensions associated with the debate
    debate_rounds: List[DebateRound]  # rounds completed, in order
    final_winner: str  # "1", "2", "tie", or "error" when a judge call failed
    final_scores: Dict[str, float]  # keyed by "A" and "B"
    consensus_reached: bool  # True when the debate stopped because the judges agreed
    total_rounds: int  # completed rounds; on an API failure, the round in which the failure occurred
    api_calls_made: int  # incremented by 2 per round (one call per judge); retries are not counted
    status: DebateStatus

class JuryDebateSystem:
    """Main class for the jury debate system"""
    
    def __init__(self, api_key: str, base_url: str, max_rounds: int = 4):
        """Store the API settings and prepare the per-role model and prompt tables.

        Args:
            api_key: Bearer token sent with every judge request.
            base_url: Base URL of the chat-completions API; the
                ``/chat/completions`` path is appended when calling it.
            max_rounds: Upper bound on the number of debate rounds per instance.
        """
        self.logger = logging.getLogger(__name__)
        self.api_key, self.base_url = api_key, base_url
        self.max_rounds = max_rounds

        # Model identifier used for each judge role: the General Judge runs on
        # the GPT model, the Critic Judge on the Claude model.
        general_model = "provider-3/gpt-5-chat"
        critic_model = "provider-3/claude-sonnet-4"
        self.judge_models = {
            JudgeRole.GENERAL_JUDGE: general_model,
            JudgeRole.CRITIC_JUDGE: critic_model,
        }

        # Role-specific prompt templates, still containing the per-instance placeholders.
        self.debate_prompts = self._initialize_debate_prompts()
    
    def _initialize_debate_prompts(self) -> Dict[JudgeRole, str]:
        """Initialize debate prompts for different judge roles"""
        base_prompt_template = """[Question]
{question}

[The Start of Assistant 1's Answer]
{response_a}
[The End of Assistant 1's Answer]

[The Start of Assistant 2's Answer]
{response_b}
[The End of Assistant 2's Answer]

[System]
We would like to request your feedback on the performance of two AI assistants in response
to the user question displayed above. Please consider the helpfulness, relevance, accuracy,
and level of detail of their responses. There are a few other referees assigned the same task;
it's your responsibility to discuss with them and think critically before you make your final
judgment. Each assistant receives an overall score on a scale of 1 to 10, where a higher
score indicates better overall performance.

{role_specific_instructions}

Now it's your time to talk, please make your talk short and clear, {judge_name}!

Please first provide a comprehensive explanation of your evaluation, avoiding any potential
bias and ensuring that the order in which the responses were presented does not affect your
judgment. Then, output two lines indicating the scores for Assistant 1 and 2, respectively.

Remember that you are not required to output the same value as other referees!
Output with the following format strictly:
Evaluation evidence: [your explanation here]
The score of Assistant 1: [score only]
The score of Assistant 2: [score only]"""

        general_judge_instructions = """You are now General Public, one of the referees in this task. You are interested in the story
and looking for updates on the investigation. Please think critically by yourself and note
that it's your responsibility to choose one of which is the better first."""

        critic_judge_instructions = """You are now Critic, one of the referees in this task. You will check fluent writing, clear
sentences, and good wording in summary writing. Your job is to question others judgment
to make sure their judgment is well-considered and offer an alternative solution if two
responses are at the same level."""

        return {
            JudgeRole.GENERAL_JUDGE: base_prompt_template.format(
                question="{question}",
                response_a="{response_a}",
                response_b="{response_b}",
                role_specific_instructions=general_judge_instructions,
                judge_name="General Public"
            ),
            JudgeRole.CRITIC_JUDGE: base_prompt_template.format(
                question="{question}",
                response_a="{response_a}",
                response_b="{response_b}",
                role_specific_instructions=critic_judge_instructions,
                judge_name="Critic"
            )
        }
    
    def _call_judge_api(self, role: JudgeRole, prompt: str, max_retries: int = 5) -> Dict:
        """Send ``prompt`` to the model configured for ``role`` and return its reply.

        The request is attempted up to ``max_retries`` times. An HTTP 429 reply
        waits 60 seconds before the next attempt; any other failure waits with
        a capped exponential backoff (5, 10, 20, 30, ... seconds). No wait is
        performed after the final attempt.

        Args:
            role: Judge role whose model should answer.
            prompt: User prompt sent to the model.
            max_retries: Maximum number of attempts.

        Returns:
            On success: ``{"success": True, "content": <reply text>,
            "usage": <usage data or None>, "model": <model id>}``.
            On failure: ``{"success": False, "error": "Max retries exceeded",
            "last_error": <description of the last failure or None>,
            "content": None, "usage": None, "model": <model id>}``.
        """
        model = self.judge_models[role]

        # The request does not change between attempts, so build it once.
        url = f"{self.base_url}/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": model,
            "messages": [
                {"role": "system", "content": "You are an expert judge in a debate system. You MUST follow the output format EXACTLY as specified in the user prompt. The format will be:\n\nEvaluation evidence: [reasoning]\nThe score of Assistant 1: [score]\nThe score of Assistant 2: [score]\n\nScores must be numbers between 1-10 and MUST be different. Do not deviate from this format under any circumstances. Do not add any additional text, explanations, or formatting outside of these 3 lines."},
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.1,  # Lower temperature for more consistent output
            "max_tokens": 1000,
            "response_format": {"type": "text"}
        }

        last_error = None
        for attempt in range(max_retries):
            retries_left = attempt < max_retries - 1
            try:
                with tqdm(total=1, desc=f"API call to {role.value} (attempt {attempt+1}/{max_retries})", leave=False) as pbar:
                    # NOTE(security): verify=False disables TLS certificate
                    # validation while a bearer token is being sent. It is kept
                    # because the original code relies on it to work around
                    # certificate issues; prefer pointing `verify` at a CA bundle.
                    response = requests.post(
                        url,
                        headers=headers,
                        json=payload,
                        timeout=60,
                        verify=False
                    )
                    pbar.update(1)

                # Rate limiting gets a fixed 60 second cooldown instead of the
                # exponential backoff used for other failures.
                if response.status_code == 429:
                    last_error = "HTTP 429: rate limited"
                    if not retries_left:
                        # Sleeping here would only delay the failure result.
                        self.logger.warning(f"Rate limited for {role.value}, no retries left")
                        break
                    self.logger.warning(f"Rate limited for {role.value}, waiting 60 seconds before retry...")
                    time.sleep(60)
                    continue

                response.raise_for_status()
                response_data = response.json()

                return {
                    "success": True,
                    "content": response_data["choices"][0]["message"]["content"],
                    "usage": response_data.get("usage"),
                    "model": model
                }

            except requests.exceptions.HTTPError as e:
                # 429 never reaches this handler: it is intercepted above,
                # before raise_for_status() is called.
                last_error = str(e)
                self.logger.warning(f"HTTP error for {role.value}: {e}")
            except Exception as e:
                # Retry boundary: connection errors, timeouts, malformed JSON
                # and unexpected response shapes are all treated as retryable.
                last_error = f"{type(e).__name__}: {e}"
                self.logger.warning(f"API call attempt {attempt + 1} failed for {role.value}: {e}")

            # Exponential backoff for non-429 errors, capped at 30 seconds.
            if retries_left:
                wait_time = min(30, (2 ** attempt) * 5)
                self.logger.info(f"Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)

        # All attempts failed.
        return {
            "success": False,
            "error": "Max retries exceeded",
            "last_error": last_error,
            "content": None,
            "usage": None,
            "model": model
        }
    
    def _parse_judge_response(self, response: str) -> Tuple[float, float, str]:
        """Parse judge response to extract scores and argument"""
        try:
            # Extract evaluation evidence
            evidence_match = re.search(r"Evaluation evidence:\s*(.+)", response, re.DOTALL)
            evidence = evidence_match.group(1).strip() if evidence_match else ""
            
            # Extract scores with more flexible patterns
            score_patterns = [
                r"The score of Assistant 1:\s*(\d+(?:\.\d+)?)",
                r"Score of Assistant 1:\s*(\d+(?:\.\d+)?)",
                r"Assistant 1 score:\s*(\d+(?:\.\d+)?)",
                r"A1:\s*(\d+(?:\.\d+)?)",
                r"Response 1:\s*(\d+(?:\.\d+)?)",
                r"Assistant 1:\s*(\d+(?:\.\d+)?)",
                r"Score A:\s*(\d+(?:\.\d+)?)"
            ]
            
            score_a = None
            for pattern in score_patterns:
                match = re.search(pattern, response, re.IGNORECASE)
                if match:
                    score_a = float(match.group(1))
                    break
            
            score_b_patterns = [
                r"The score of Assistant 2:\s*(\d+(?:\.\d+)?)",
                r"Score of Assistant 2:\s*(\d+(?:\.\d+)?)",
                r"Assistant 2 score:\s*(\d+(?:\.\d+)?)",
                r"A2:\s*(\d+(?:\.\d+)?)",
                r"Response 2:\s*(\d+(?:\.\d+)?)",
                r"Assistant 2:\s*(\d+(?:\.\d+)?)",
                r"Score B:\s*(\d+(?:\.\d+)?)"
            ]
            
            score_b = None
            for pattern in score_b_patterns:
                match = re.search(pattern, response, re.IGNORECASE)
                if match:
                    score_b = float(match.group(1))
                    break
            
            # Validate scores
            if score_a is None or score_b is None:
                self.logger.warning(f"Failed to extract scores from response: {response[:300]}...")
                # Try to find any numbers in the response
                numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
                if len(numbers) >= 2:
                    score_a = float(numbers[0])
                    score_b = float(numbers[1])
                    self.logger.info(f"Extracted scores using fallback method: A={score_a}, B={score_b}")
                else:
                    return 5.0, 5.0, "Failed to parse scores - no numbers found"
            
            # CRITICAL: Check for invalid equal scores (5.0 vs 5.0)
            if abs(score_a - score_b) < 0.1:
                self.logger.warning(f"Judges gave equal scores: A={score_a}, B={score_b}. This suggests they're not following the format properly.")
                # Try to find any other numbers in the response that might be different
                all_numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
                if len(all_numbers) >= 3:  # Look for additional numbers
                    for i, num in enumerate(all_numbers):
                        for j, other_num in enumerate(all_numbers[i+1:], i+1):
                            if abs(float(num) - float(other_num)) >= 0.1:
                                score_a = float(num)
                                score_b = float(other_num)
                                self.logger.info(f"Found different scores in response: A={score_a}, B={score_b}")
                                break
                        if abs(score_a - score_b) >= 0.1:
                            break
            
            if not (1.0 <= score_a <= 10.0) or not (1.0 <= score_b <= 10.0):
                self.logger.warning(f"Invalid score range: A={score_a}, B={score_b}. Clamping to 1-10 range.")
                score_a = max(1.0, min(10.0, score_a))
                score_b = max(1.0, min(10.0, score_b))
            
            # Final validation: ensure scores are different
            if abs(score_a - score_b) < 0.1:
                self.logger.error(f"Final validation failed: Judges still gave equal scores A={score_a}, B={score_b}")
                # Force different scores based on the evidence
                if "Response 1" in evidence or "Assistant 1" in evidence:
                    score_a = 7.0
                    score_b = 3.0
                else:
                    score_a = 3.0
                    score_b = 7.0
                self.logger.info(f"Forced different scores: A={score_a}, B={score_b}")
            
            return score_a, score_b, evidence
            
        except Exception as e:
            self.logger.warning(f"Failed to parse judge response: {e}")
            return 5.0, 5.0, "Failed to parse response"
    
    def _check_consensus(self, general_score_a: float, general_score_b: float,
                        critic_score_a: float, critic_score_b: float,
                        tolerance: float = 1.0) -> Tuple[bool, Optional[str]]:
        """Check if judges have reached consensus"""
        # Calculate overall scores
        general_winner = "1" if general_score_a > general_score_b else "2" if general_score_b > general_score_a else "tie"
        critic_winner = "1" if critic_score_a > critic_score_b else "2" if critic_score_b > critic_score_a else "tie"
        
        # Check if both judges agree on the winner
        if general_winner == critic_winner and general_winner != "tie":
            return True, general_winner
        
        # Check if scores are very close (within tolerance)
        general_diff = abs(general_score_a - general_score_b)
        critic_diff = abs(critic_score_a - critic_score_b)
        
        # More intelligent consensus checking
        if general_diff <= tolerance and critic_diff <= tolerance:
            # Both judges think it's close, but check if they're being too conservative
            # If both give equal scores (5.0 vs 5.0), this might indicate they're not properly analyzing
            if (abs(general_score_a - 5.0) < 0.1 and abs(general_score_b - 5.0) < 0.1 and 
                abs(critic_score_a - 5.0) < 0.1 and abs(critic_score_b - 5.0) < 0.1):
                # Both judges gave equal scores - this suggests they're not properly analyzing
                # Don't reach consensus, continue debating
                return False, None
            
            # Both judges think it's close, consider it a tie
            return True, "tie"
        
        # Check if judges are significantly disagreeing
        if general_winner != critic_winner and general_winner != "tie" and critic_winner != "tie":
            # Judges disagree on winner - continue debating
            return False, None
        
        return False, None
    
    def _create_debate_prompt(self, role: JudgeRole, instance: Dict, 
                             previous_rounds: List[DebateRound] = None) -> str:
        """Create debate prompt for a specific judge role"""
        base_prompt = self.debate_prompts[role]
        
        # Format the base prompt
        formatted_prompt = base_prompt.format(
            question=instance.get('prompt', ''),
            response_a=instance.get('response_a', ''),
            response_b=instance.get('response_b', '')
        )
        
        # Add debate context if there are previous rounds
        if previous_rounds:
            debate_context = "\n\n[Previous Debate Rounds]\n"
            for i, round_data in enumerate(previous_rounds, 1):
                debate_context += f"\nRound {i}:\n"
                debate_context += f"General Judge: {round_data.general_judge_argument[:200]}...\n"
                debate_context += f"Critic Judge: {round_data.critic_judge_argument[:200]}...\n"
                debate_context += f"Scores - General: A={round_data.general_judge_score_a:.1f}, B={round_data.general_judge_score_b:.1f}\n"
                debate_context += f"Scores - Critic: A={round_data.critic_judge_score_a:.1f}, B={round_data.critic_judge_score_b:.1f}\n"
            
            formatted_prompt += debate_context
            formatted_prompt += "\n\nPlease consider the previous debate rounds and provide your updated evaluation."
        
        return formatted_prompt
    
    def conduct_debate(self, instance: Dict, scenario: str, dimensions: List[str]) -> DebateResult:
        """Run the two-judge debate for a single instance.

        Each round prompts both judges (including a summary of earlier
        rounds), parses their scores and checks for consensus. The debate
        stops at the first consensus or after ``self.max_rounds`` rounds.

        Args:
            instance: Item under evaluation; its ``id`` entry is used for
                logging and the result, and it is passed on to the prompt builder.
            scenario: Scenario label copied into the result.
            dimensions: Dimension names copied into the result.

        Returns:
            A ``DebateResult``. With consensus, the winner is the consensus
            winner and the final scores are the General Judge's scores from the
            last round. Without consensus, the final scores are the mean of both
            judges' per-round averages and the winner is whichever is higher.
            If a judge call fails, or no round could be run at all
            (``max_rounds`` below 1), the result has status ``ERROR``, winner
            ``"error"`` and zero scores.
        """
        instance_id = instance.get('id', 'unknown')
        self.logger.info(f"Starting debate for instance {instance_id} (scenario: {scenario})")

        debate_rounds = []
        api_calls_made = 0
        consensus_reached = False
        final_winner = None

        def error_result(rounds_attempted: int) -> DebateResult:
            """Build the ERROR result from the state accumulated so far."""
            return DebateResult(
                instance_id=instance_id,
                scenario=scenario,
                dimensions=dimensions,
                debate_rounds=debate_rounds,
                final_winner="error",
                final_scores={"A": 0, "B": 0},
                consensus_reached=False,
                total_rounds=rounds_attempted,
                api_calls_made=api_calls_made,
                status=DebateStatus.ERROR
            )

        for round_num in range(1, self.max_rounds + 1):
            self.logger.info(f"Debate Round {round_num}/{self.max_rounds}")

            # Prompts are rebuilt every round so they carry the debate history.
            general_prompt = self._create_debate_prompt(JudgeRole.GENERAL_JUDGE, instance, debate_rounds)
            critic_prompt = self._create_debate_prompt(JudgeRole.CRITIC_JUDGE, instance, debate_rounds)

            general_response = self._call_judge_api(JudgeRole.GENERAL_JUDGE, general_prompt)
            critic_response = self._call_judge_api(JudgeRole.CRITIC_JUDGE, critic_prompt)
            api_calls_made += 2

            if not general_response["success"] or not critic_response["success"]:
                self.logger.error(f"Failed to get judge responses in round {round_num}")
                return error_result(round_num)

            general_score_a, general_score_b, general_argument = self._parse_judge_response(general_response["content"])
            critic_score_a, critic_score_b, critic_argument = self._parse_judge_response(critic_response["content"])

            consensus_reached, consensus_winner = self._check_consensus(
                general_score_a, general_score_b, critic_score_a, critic_score_b
            )

            debate_rounds.append(DebateRound(
                round_number=round_num,
                general_judge_argument=general_argument,
                critic_judge_argument=critic_argument,
                general_judge_score_a=general_score_a,
                general_judge_score_b=general_score_b,
                critic_judge_score_a=critic_score_a,
                critic_judge_score_b=critic_score_b,
                consensus_reached=consensus_reached,
                consensus_winner=consensus_winner
            ))

            self.logger.info(f"Round {round_num} - General: A={general_score_a:.1f}, B={general_score_b:.1f}")
            self.logger.info(f"Round {round_num} - Critic: A={critic_score_a:.1f}, B={critic_score_b:.1f}")
            self.logger.info(f"Consensus reached: {consensus_reached}")

            if consensus_reached:
                final_winner = consensus_winner
                break

        # With max_rounds < 1 the loop never runs; averaging over zero rounds
        # would divide by zero, so report an error result instead.
        if not debate_rounds:
            self.logger.error(
                f"No debate rounds were run for instance {instance_id} (max_rounds={self.max_rounds})"
            )
            return error_result(0)

        if consensus_reached:
            # NOTE(review): only the General Judge's last-round scores are
            # reported here, even when the consensus is a "tie" -- confirm
            # that ignoring the Critic's scores is intended.
            final_score_a = debate_rounds[-1].general_judge_score_a
            final_score_b = debate_rounds[-1].general_judge_score_b
        else:
            # No consensus: average each judge over all rounds, then average the judges.
            rounds_played = len(debate_rounds)
            avg_general_a = sum(r.general_judge_score_a for r in debate_rounds) / rounds_played
            avg_general_b = sum(r.general_judge_score_b for r in debate_rounds) / rounds_played
            avg_critic_a = sum(r.critic_judge_score_a for r in debate_rounds) / rounds_played
            avg_critic_b = sum(r.critic_judge_score_b for r in debate_rounds) / rounds_played

            final_score_a = (avg_general_a + avg_critic_a) / 2
            final_score_b = (avg_general_b + avg_critic_b) / 2

            if final_score_a > final_score_b:
                final_winner = "1"
            elif final_score_b > final_score_a:
                final_winner = "2"
            else:
                final_winner = "tie"

        final_scores = {"A": final_score_a, "B": final_score_b}
        status = DebateStatus.CONSENSUS_REACHED if consensus_reached else DebateStatus.MAX_ROUNDS_REACHED

        self.logger.info(f"Debate completed for instance {instance_id}")
        self.logger.info(f"Final winner: {final_winner}, Consensus: {consensus_reached}")

        return DebateResult(
            instance_id=instance_id,
            scenario=scenario,
            dimensions=dimensions,
            debate_rounds=debate_rounds,
            final_winner=final_winner,
            final_scores=final_scores,
            consensus_reached=consensus_reached,
            total_rounds=len(debate_rounds),
            api_calls_made=api_calls_made,
            status=status
        )

    def conduct_debate_on_scores(self, instance: Dict, dimension_scores: Dict, jury_prompt: str) -> DebateResult:
        """Run the two-judge debate over per-dimension results from the MOE analysis.

        Both judges receive a dimension-aware prompt built from
        ``dimension_scores``. From the second round on, a summary of the
        earlier rounds is appended to each prompt so the judges can react to
        each other instead of answering the identical prompt again.

        Args:
            instance: Item under evaluation; its ``id`` entry is used for
                logging and the result.
            dimension_scores: Mapping of dimension name to that dimension's
                evaluation data; its keys become the result's ``dimensions``.
            jury_prompt: Text placed at the top of both judges' prompts.

        Returns:
            A ``DebateResult`` with scenario ``"moe_dimension_analysis"``. With
            consensus, the winner is the consensus winner and the final scores
            are the General Judge's scores from the last round. Without
            consensus, the final scores are the mean of both judges' per-round
            averages. If a judge call fails, or no round could be run at all
            (``max_rounds`` below 1), the result has status ``ERROR``, winner
            ``"error"`` and zero scores.
        """
        instance_id = instance.get('id', 'unknown')
        scenario = "moe_dimension_analysis"
        dimensions = list(dimension_scores.keys())

        self.logger.info(f"Starting debate on dimension scores for instance {instance_id}")

        # The dimension analysis is fixed for the whole debate, so the base
        # prompts are built once; only the history suffix changes per round.
        general_base_prompt = self._create_dimension_aware_prompt(
            JudgeRole.GENERAL_JUDGE, instance, dimension_scores, jury_prompt
        )
        critic_base_prompt = self._create_dimension_aware_prompt(
            JudgeRole.CRITIC_JUDGE, instance, dimension_scores, jury_prompt
        )

        debate_rounds = []
        api_calls_made = 0
        consensus_reached = False
        final_winner = None

        def with_history(base_prompt: str) -> str:
            """Append a summary of the rounds played so far (same layout as _create_debate_prompt)."""
            if not debate_rounds:
                return base_prompt
            pieces = [base_prompt, "\n\n[Previous Debate Rounds]\n"]
            for position, past in enumerate(debate_rounds, start=1):
                pieces.append(f"\nRound {position}:\n")
                pieces.append(f"General Judge: {past.general_judge_argument[:200]}...\n")
                pieces.append(f"Critic Judge: {past.critic_judge_argument[:200]}...\n")
                pieces.append(f"Scores - General: A={past.general_judge_score_a:.1f}, B={past.general_judge_score_b:.1f}\n")
                pieces.append(f"Scores - Critic: A={past.critic_judge_score_a:.1f}, B={past.critic_judge_score_b:.1f}\n")
            pieces.append("\n\nPlease consider the previous debate rounds and provide your updated evaluation.")
            return "".join(pieces)

        def error_result(rounds_attempted: int) -> DebateResult:
            """Build the ERROR result from the state accumulated so far."""
            return DebateResult(
                instance_id=instance_id,
                scenario=scenario,
                dimensions=dimensions,
                debate_rounds=debate_rounds,
                final_winner="error",
                final_scores={"A": 0, "B": 0},
                consensus_reached=False,
                total_rounds=rounds_attempted,
                api_calls_made=api_calls_made,
                status=DebateStatus.ERROR
            )

        for round_num in range(1, self.max_rounds + 1):
            self.logger.info(f"Debate Round {round_num}/{self.max_rounds}")

            general_response = self._call_judge_api(JudgeRole.GENERAL_JUDGE, with_history(general_base_prompt))
            critic_response = self._call_judge_api(JudgeRole.CRITIC_JUDGE, with_history(critic_base_prompt))
            api_calls_made += 2

            if not general_response["success"] or not critic_response["success"]:
                self.logger.error(f"Failed to get judge responses in round {round_num}")
                # Report the failure instead of falling back to a guessed result.
                return error_result(round_num)

            general_score_a, general_score_b, general_argument = self._parse_judge_response(general_response["content"])
            critic_score_a, critic_score_b, critic_argument = self._parse_judge_response(critic_response["content"])

            consensus_reached, consensus_winner = self._check_consensus(
                general_score_a, general_score_b, critic_score_a, critic_score_b
            )

            debate_rounds.append(DebateRound(
                round_number=round_num,
                general_judge_argument=general_argument,
                critic_judge_argument=critic_argument,
                general_judge_score_a=general_score_a,
                general_judge_score_b=general_score_b,
                critic_judge_score_a=critic_score_a,
                critic_judge_score_b=critic_score_b,
                consensus_reached=consensus_reached,
                consensus_winner=consensus_winner
            ))

            self.logger.info(f"Round {round_num} - General: A={general_score_a:.1f}, B={general_score_b:.1f}")
            self.logger.info(f"Round {round_num} - Critic: A={critic_score_a:.1f}, B={critic_score_b:.1f}")
            self.logger.info(f"Consensus reached: {consensus_reached}")

            if consensus_reached:
                final_winner = consensus_winner
                break

        # With max_rounds < 1 the loop never runs; averaging over zero rounds
        # would divide by zero, so report an error result instead.
        if not debate_rounds:
            self.logger.error(
                f"No debate rounds were run for instance {instance_id} (max_rounds={self.max_rounds})"
            )
            return error_result(0)

        if consensus_reached:
            # NOTE(review): only the General Judge's last-round scores are
            # reported here, even when the consensus is a "tie" -- confirm
            # that ignoring the Critic's scores is intended.
            final_score_a = debate_rounds[-1].general_judge_score_a
            final_score_b = debate_rounds[-1].general_judge_score_b
        else:
            # No consensus: average each judge over all rounds, then average the judges.
            rounds_played = len(debate_rounds)
            avg_general_a = sum(r.general_judge_score_a for r in debate_rounds) / rounds_played
            avg_general_b = sum(r.general_judge_score_b for r in debate_rounds) / rounds_played
            avg_critic_a = sum(r.critic_judge_score_a for r in debate_rounds) / rounds_played
            avg_critic_b = sum(r.critic_judge_score_b for r in debate_rounds) / rounds_played

            final_score_a = (avg_general_a + avg_critic_a) / 2
            final_score_b = (avg_general_b + avg_critic_b) / 2

            if final_score_a > final_score_b:
                final_winner = "1"
            elif final_score_b > final_score_a:
                final_winner = "2"
            else:
                final_winner = "tie"

        final_scores = {"A": final_score_a, "B": final_score_b}
        status = DebateStatus.CONSENSUS_REACHED if consensus_reached else DebateStatus.MAX_ROUNDS_REACHED

        self.logger.info(f"Debate completed for instance {instance_id}")
        self.logger.info(f"Final winner: {final_winner}, Consensus: {consensus_reached}")

        return DebateResult(
            instance_id=instance_id,
            scenario=scenario,
            dimensions=dimensions,
            debate_rounds=debate_rounds,
            final_winner=final_winner,
            final_scores=final_scores,
            consensus_reached=consensus_reached,
            total_rounds=len(debate_rounds),
            api_calls_made=api_calls_made,
            status=status
        )

    def _create_dimension_aware_prompt(self, role: JudgeRole, instance: Dict, dimension_scores: Dict, jury_prompt: str) -> str:
        """Create dimension-aware prompts that actually use the MOE analysis results"""
        
        # Count votes and calculate margin
        votes_1 = sum(1 for score in dimension_scores.values() if score["evaluation"] == "1")
        votes_2 = sum(1 for score in dimension_scores.values() if score["evaluation"] == "2")
        votes_tie = sum(1 for score in dimension_scores.values() if score["evaluation"] == "tie")
        total_votes = len(dimension_scores)
        margin = abs(votes_1 - votes_2)
        margin_percentage = (margin / total_votes * 100) if total_votes > 0 else 0
        
        # Create detailed dimension breakdown
        dimension_breakdown = ""
        for dimension, score_data in dimension_scores.items():
            evaluation = score_data["evaluation"]
            reasoning = score_data.get("reasoning", "No reasoning provided")
            model = score_data.get("model", "Unknown model")
            
            # Truncate reasoning if too long
            if len(reasoning) > 150:
                reasoning = reasoning[:150] + "..."
            
            dimension_breakdown += f"\n• {dimension}: Response {evaluation} wins (Expert: {model})\n  Reasoning: {reasoning}"
        
        # Create role-specific instructions
        if role == JudgeRole.GENERAL_JUDGE:
            role_instructions = f"""You are the General Judge evaluating the overall winner based on expert dimension analysis.

Your task is to:
1. Consider the expert dimension analysis results
2. Evaluate which response is better overall
3. Pay attention to the margin of victory ({margin} dimensions difference)
4. Consider the quality and relevance of winning dimensions
5. Provide a fair and balanced evaluation

IMPORTANT: The expert analysis shows Response 1 won {votes_1} dimensions vs Response 2 won {votes_2} dimensions. 
This is a {margin_percentage:.1f}% margin of victory. You must explain why you agree or disagree with this analysis."""
        
        else:  # Critic Judge
            role_instructions = f"""You are the Critic Judge challenging the expert dimension analysis.

Your task is to:
1. Question the expert dimension analysis results
2. Look for potential biases or errors in the analysis
3. Consider if the margin of victory ({margin} dimensions) is justified
4. Provide alternative perspectives or counterarguments
5. Ensure the evaluation is well-considered

IMPORTANT: The expert analysis shows Response 1 won {votes_1} dimensions vs Response 2 won {votes_2} dimensions.
This is a {margin_percentage:.1f}% margin of victory. You must critically examine if this margin is justified."""
        
        # Combine everything into a comprehensive prompt
        dimension_aware_prompt = f"""{jury_prompt}

{role_instructions}

DETAILED EXPERT DIMENSION ANALYSIS:
{dimension_breakdown}

VOTE SUMMARY:
- Response 1 wins: {votes_1} dimensions
- Response 2 wins: {votes_2} dimensions  
- Ties: {votes_tie} dimensions
- Total dimensions evaluated: {total_votes}
- Margin of victory: {margin} dimensions ({margin_percentage:.1f}%)

CRITICAL QUESTIONS TO CONSIDER:
1. Does the {margin_percentage:.1f}% margin of victory justify declaring a clear winner?
2. Are the winning dimensions more important for this specific scenario?
3. Do the expert reasonings support the overall conclusion?
4. Are there any conflicting signals between dimensions?
5. Is the margin large enough to overcome any potential biases?

IMPORTANT: Based on the expert analysis, Response 1 won {votes_1} dimensions vs Response 2 won {votes_2} dimensions.
This is a {margin_percentage:.1f}% margin of victory. You MUST provide different scores based on this analysis.

CRITICAL OUTPUT FORMAT REQUIREMENTS:
You MUST output EXACTLY these 3 lines in this exact order with NO additional text:

Evaluation evidence: [Your detailed reasoning here, specifically addressing the dimension analysis and margin of victory]
The score of Assistant 1: [ONLY a number between 1-10, must be different from Assistant 2]
The score of Assistant 2: [ONLY a number between 1-10, must be different from Assistant 1]

RULES:
- Scores MUST be different between Assistant 1 and Assistant 2
- Scores MUST be between 1-10
- You MUST explain your reasoning in the evaluation evidence
- You MUST consider the {margin_percentage:.1f}% margin of victory
- Do NOT give equal scores (5.0 vs 5.0) unless there's a genuine tie
- Do NOT add any other text, explanations, or formatting

EXAMPLE OF CORRECT OUTPUT:
Evaluation evidence: Based on the 87.5% margin of victory where Response 1 won 14 dimensions vs Response 2 won 0, Response 1 clearly demonstrates superior quality across all evaluated dimensions.
The score of Assistant 1: 9
The score of Assistant 2: 2

Remember: Response 1 won {votes_1} dimensions, Response 2 won {votes_2} dimensions, with {votes_tie} ties.
The margin of victory is {margin_percentage:.1f}%. Consider this when assigning scores."""

        return dimension_aware_prompt
    
    def _create_fallback_result_from_moe_scores(self, dimension_scores: Dict) -> Dict:
        """Create a fallback result based on MOE dimension scores when judges fail.

        The winner is chosen by simple majority over the per-dimension
        ``"evaluation"`` values: ``"1"`` counts for Response 1, ``"2"`` counts
        for Response 2, and any other value (e.g. ``"tie"``) counts for neither.

        Args:
            dimension_scores: Mapping whose values are dicts carrying an
                ``"evaluation"`` entry.

        Returns:
            A dict with ``winner`` (``"1"``, ``"2"`` or ``"tie"``), placeholder
            ``scores`` of 0.0 for both sides, ``consensus_reached`` True,
            ``total_rounds`` 1, ``api_calls_made`` 0, ``status`` set to the
            consensus-reached value, and ``debate_rounds`` holding a single
            placeholder ``DebateRound``.

        Raises:
            KeyError: If any entry lacks the ``"evaluation"`` key.
        """
        # Only the two win counts influence the outcome, so tally them in a
        # single pass instead of scanning the mapping once per category.
        votes_1 = 0
        votes_2 = 0
        for score in dimension_scores.values():
            evaluation = score["evaluation"]
            if evaluation == "1":
                votes_1 += 1
            elif evaluation == "2":
                votes_2 += 1

        # Determine winner based on majority vote
        if votes_1 > votes_2:
            final_winner = "1"
        elif votes_2 > votes_1:
            final_winner = "2"
        else:
            final_winner = "tie"

        # The result carries a list of debate rounds, so supply one placeholder
        # round even though no judge produced any argument or score.
        dummy_round = DebateRound(
            round_number=1,
            general_judge_argument="Fallback: No judge responses available.",
            critic_judge_argument="Fallback: No judge responses available.",
            general_judge_score_a=0.0,
            general_judge_score_b=0.0,
            critic_judge_score_a=0.0,
            critic_judge_score_b=0.0,
            consensus_reached=True,  # Assume consensus for fallback
            consensus_winner=final_winner
        )

        return {
            "winner": final_winner,
            "scores": {"A": 0.0, "B": 0.0},  # Placeholder scores
            "consensus_reached": True,
            "total_rounds": 1,
            "api_calls_made": 0,
            "status": DebateStatus.CONSENSUS_REACHED.value,
            "debate_rounds": [dummy_round]
        }

    def _create_intelligent_debate_result(self, dimension_scores: Dict, instance: Dict) -> Dict:
        """Synthesize a debate result from the MOE votes when judges ignore the output format.

        The side with more per-dimension wins is declared the winner. Its score
        is ``7.0 + 3.0 * confidence`` where confidence grows linearly with the
        margin percentage and is capped at 0.95; the other side receives
        ``10.0`` minus that score. Equal win counts yield a tie at 5.0 / 5.0.

        Args:
            dimension_scores: Mapping whose values are dicts carrying an
                ``"evaluation"`` entry (``"1"``, ``"2"`` or another value such
                as ``"tie"``).
            instance: The instance under evaluation; not read by this method.

        Returns:
            A dict with ``winner``, ``scores`` (keys ``"A"`` and ``"B"``),
            ``consensus_reached``, ``total_rounds``, ``api_calls_made``,
            ``status`` and a one-element ``debate_rounds`` list.
        """
        # Tally the per-dimension verdicts.
        verdicts = [entry["evaluation"] for entry in dimension_scores.values()]
        votes_1 = verdicts.count("1")
        votes_2 = verdicts.count("2")
        total_votes = len(dimension_scores)
        margin = abs(votes_1 - votes_2)
        margin_percentage = (margin / total_votes * 100) if total_votes > 0 else 0

        # Scores for a decided outcome; both values are ignored on a tie.
        confidence = min(0.95, 0.5 + (margin_percentage / 100) * 0.45)
        winning_score = 7.0 + (confidence * 3.0)
        losing_score = 10.0 - winning_score

        # Pick winner, scores and the narrative in a single branch per outcome.
        if votes_1 > votes_2:
            final_winner = "1"
            score_a, score_b = winning_score, losing_score
            reasoning = f"MOE analysis shows Response 1 dominates with {votes_1} dimension wins vs {votes_2} for Response 2 ({margin_percentage:.1f}% margin). The expert evaluations across multiple dimensions consistently favor Response 1, indicating superior quality and relevance to the user's scenario."
        elif votes_2 > votes_1:
            final_winner = "2"
            score_a, score_b = losing_score, winning_score
            reasoning = f"MOE analysis shows Response 2 dominates with {votes_2} dimension wins vs {votes_1} for Response 1 ({margin_percentage:.1f}% margin). The expert evaluations across multiple dimensions consistently favor Response 2, indicating superior quality and relevance to the user's scenario."
        else:
            final_winner = "tie"
            score_a = 5.0
            score_b = 5.0
            reasoning = f"MOE analysis shows a tie with {votes_1} dimension wins for Response 1 and {votes_2} for Response 2. The expert evaluations are evenly split, indicating comparable quality between both responses."

        # Both judge slots of the single synthetic round carry the same scores.
        synthetic_round = DebateRound(
            round_number=1,
            general_judge_argument=f"Intelligent Analysis: {reasoning}",
            critic_judge_argument=f"MOE Consensus: Based on {total_votes} expert dimension evaluations with {margin_percentage:.1f}% margin of victory.",
            general_judge_score_a=score_a,
            general_judge_score_b=score_b,
            critic_judge_score_a=score_a,
            critic_judge_score_b=score_b,
            consensus_reached=True,
            consensus_winner=final_winner
        )

        intelligent_result = {
            "winner": final_winner,
            "scores": {"A": score_a, "B": score_b},
            "consensus_reached": True,
            "total_rounds": 1,
            "api_calls_made": 0,
            "status": DebateStatus.CONSENSUS_REACHED.value,
            "debate_rounds": [synthetic_round]
        }

        self.logger.info(f"Created intelligent debate result: {final_winner} wins with scores A={score_a:.1f}, B={score_b:.1f}")
        return intelligent_result

class JuryDebateProcessor:
    """Processor class for running jury debates on multiple instances.

    Each instance is passed to a ``JuryDebateSystem`` and the outcome is
    flattened into a plain dict built from built-in containers and the
    debate result's own field values.
    """

    def __init__(self, api_key: str, base_url: str, max_rounds: int = 4):
        """Create the underlying debate system and a module-level logger.

        Args:
            api_key: Forwarded to ``JuryDebateSystem``.
            base_url: Forwarded to ``JuryDebateSystem``.
            max_rounds: Forwarded to ``JuryDebateSystem``.
        """
        self.debate_system = JuryDebateSystem(api_key, base_url, max_rounds)
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _count_consensus(results: List[Dict]) -> int:
        """Count converted results whose debate reached consensus.

        The flag lives under the nested ``"debate_result"`` dict produced by
        ``_convert_debate_result_to_format``, not at the top level.
        """
        return sum(
            1 for r in results
            if r.get("debate_result", {}).get("consensus_reached")
        )

    def process_instances(self, instances: List[Dict], metrics_config: Dict[str, Any],
                         delay: float = 10.0) -> List[Dict]:
        """Process multiple instances through the jury debate system.

        Args:
            instances: Instance dicts; any instance without an ``'id'`` key is
                given its list position (as a string) in place.
            metrics_config: Metrics configuration handed to the dimension
                lookup.
            delay: Seconds to sleep between consecutive instances; no sleep
                follows the last one.

        Returns:
            One converted result dict per instance, in input order.
        """
        results = []

        with tqdm(instances, desc="Processing jury debates", unit="instance") as pbar:
            for i, instance in enumerate(pbar):
                if 'id' not in instance:
                    instance['id'] = str(i)

                pbar.set_description(f"Processing instance {i+1}/{len(instances)}")

                # Classify scenario (reusing from MOE system)
                scenario = self._classify_scenario(instance)
                mapped_scenario = self._map_scenario_to_metrics(scenario)

                # Get dimensions for the scenario
                dimensions = self._get_scenario_dimensions(metrics_config, mapped_scenario)

                # Conduct debate
                debate_result = self.debate_system.conduct_debate(instance, mapped_scenario, dimensions)

                # Convert to result format compatible with existing system
                result = self._convert_debate_result_to_format(debate_result, instance)
                results.append(result)

                # Update progress
                if debate_result.status == DebateStatus.CONSENSUS_REACHED:
                    pbar.set_postfix({"status": "✅", "consensus": self._count_consensus(results)})
                else:
                    pbar.set_postfix({"status": "⏳", "rounds": debate_result.total_rounds})

                # Delay between instances
                if delay > 0 and i < len(instances) - 1:
                    time.sleep(delay)

                # Log progress every 10 instances
                if (i + 1) % 10 == 0:
                    consensus_count = self._count_consensus(results)
                    self.logger.info(f"Processed {i + 1}/{len(instances)} instances ({consensus_count} reached consensus)")

        return results

    def _classify_scenario(self, instance: Dict) -> str:
        """Classify the scenario of an instance.

        Placeholder: always returns ``"seeking_advice"`` regardless of input.
        """
        # This would reuse the scenario classification logic from moe_candidate_profiling.py
        # For now, return a default scenario
        return "seeking_advice"

    def _map_scenario_to_metrics(self, scenario: str) -> str:
        """Map scenario to metrics scenario name.

        Placeholder: returns ``scenario`` unchanged.
        """
        # This would reuse the mapping logic from moe_candidate_profiling.py
        return scenario

    def _get_scenario_dimensions(self, metrics_config: Dict[str, Any], scenario: str) -> List[str]:
        """Get dimensions for a scenario.

        Placeholder: ignores both arguments and returns a fixed list.
        """
        # This would reuse the dimension extraction logic from moe_candidate_profiling.py
        return ["Accuracy", "Relevance", "Completeness"]

    def _convert_debate_result_to_format(self, debate_result: DebateResult, instance: Dict) -> Dict:
        """Convert debate result to format compatible with existing system.

        Args:
            debate_result: Outcome of one debate; its rounds are expanded into
                plain dicts.
            instance: Source instance; ``winner``, ``metadata``, ``model_a``
                and ``model_b`` are copied over, defaulting to ``''``.

        Returns:
            A dict with ``id``, ``scenario``, a nested ``debate_result`` dict,
            the copied instance fields, and ``status`` (``"failed"`` when the
            debate status is ``DebateStatus.ERROR``, otherwise ``"ok"``).
        """
        return {
            "id": debate_result.instance_id,
            "scenario": debate_result.scenario,
            "debate_result": {
                "final_winner": debate_result.final_winner,
                "final_scores": debate_result.final_scores,
                "consensus_reached": debate_result.consensus_reached,
                "total_rounds": debate_result.total_rounds,
                "api_calls_made": debate_result.api_calls_made,
                "status": debate_result.status.value,
                "debate_rounds": [
                    {
                        "round_number": r.round_number,
                        "general_judge_argument": r.general_judge_argument,
                        "critic_judge_argument": r.critic_judge_argument,
                        "general_judge_scores": {"A": r.general_judge_score_a, "B": r.general_judge_score_b},
                        "critic_judge_scores": {"A": r.critic_judge_score_a, "B": r.critic_judge_score_b},
                        "consensus_reached": r.consensus_reached,
                        "consensus_winner": r.consensus_winner
                    }
                    for r in debate_result.debate_rounds
                ]
            },
            "winner": instance.get('winner', ''),
            "metadata": instance.get('metadata', ''),
            "model_a": instance.get('model_a', ''),
            "model_b": instance.get('model_b', ''),
            "status": "ok" if debate_result.status != DebateStatus.ERROR else "failed"
        }

def save_jury_debate_results(results: List[Dict], out_path: str = 'jury_debate_results.json'):
    """Save jury debate results and an aggregate summary, then print the summary.

    The results go to ``out_path``. The summary goes next to it: a trailing
    ``.json`` extension is replaced by ``_summary.json``; for any other path
    ``_summary.json`` is appended, so the summary never overwrites the results
    file.

    Args:
        results: Converted result dicts. ``status`` and the nested
            ``debate_result`` fields ``consensus_reached``, ``total_rounds``
            and ``api_calls_made`` are read; missing fields count as
            falsy / 0.
        out_path: Destination of the full results JSON.
    """
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=2)

    # Create summary
    total_instances = len(results)
    debates = [r.get("debate_result", {}) for r in results]
    consensus_count = sum(1 for d in debates if d.get("consensus_reached"))
    successful_instances = sum(1 for r in results if r.get("status") == "ok")

    summary = {
        "total_instances": total_instances,
        "successful_instances": successful_instances,
        "consensus_reached": consensus_count,
        "consensus_rate": consensus_count / total_instances if total_instances > 0 else 0,
        "average_rounds": sum(d.get("total_rounds", 0) for d in debates) / total_instances if total_instances > 0 else 0,
        "total_api_calls": sum(d.get("api_calls_made", 0) for d in debates)
    }

    # Derive the summary path from the extension only; a plain substring
    # replace leaves paths without '.json' unchanged (clobbering the results)
    # and rewrites every '.json' occurrence elsewhere in the path.
    root, ext = os.path.splitext(out_path)
    if ext != '.json':
        root = out_path
    summary_path = f"{root}_summary.json"
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)

    logging.info(f"Jury debate results saved to {out_path} and {summary_path}")

    # Print summary table
    print("\nJury Debate System Summary:")
    print("=" * 50)
    print(f"Total Instances: {total_instances}")
    print(f"Successful: {successful_instances}")
    # Reuse the guarded rate so an empty result list does not divide by zero.
    print(f"Consensus Reached: {consensus_count} ({summary['consensus_rate'] * 100:.1f}%)")
    print(f"Average Rounds: {summary['average_rounds']:.1f}")
    print(f"Total API Calls: {summary['total_api_calls']}")

def main():
    """Command-line entry point: parse options, run the debates, save the results.

    Loads the metrics YAML and the seeds JSON, optionally truncates the seeds
    to the first ``--nums`` entries, processes them with a
    ``JuryDebateProcessor`` and writes the output files. A keyboard interrupt
    is logged and swallowed; any other exception is logged and re-raised.
    Logging is shut down in every case.
    """
    setup_logging()

    # Declarative option table; registration order determines --help order.
    option_table = (
        ('--output', dict(type=str, default='jury_debate_results.json',
                          help='Output file for debate results')),
        ('--api_key', dict(type=str, required=True, help='API key')),
        ('--base_url', dict(type=str, default='https://api.a4f.co/v1',
                            help='Base URL for API calls')),
        ('--seeds_path', dict(type=str, default='./seeds.json',
                              help='Path to seeds.json')),
        ('--metrics_path', dict(type=str, default='./metrics.yaml',
                                help='Path to metrics.yaml')),
        ('--nums', dict(type=int, default=50,
                        help='Number of instances to process (default: 50)')),
        ('--delay', dict(type=float, default=10.0,
                         help='Delay between API calls in seconds (default: 10.0)')),
        ('--max_rounds', dict(type=int, default=4,
                              help='Maximum debate rounds (default: 4)')),
    )
    parser = argparse.ArgumentParser(description="Jury Debate System for MOE Candidate Profiling")
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    try:
        logging.info("Loading metrics configuration...")
        with open(args.metrics_path, 'r') as metrics_file:
            metrics_config = yaml.safe_load(metrics_file)

        logging.info("Loading seeds data...")
        with open(args.seeds_path, 'r') as seeds_file:
            seeds = json.load(seeds_file)

        # A non-positive --nums means "process everything".
        limit = args.nums
        if limit > 0:
            seeds = seeds[:limit]

        logging.info(f"Processing {len(seeds)} instances using Jury Debate System...")

        processor = JuryDebateProcessor(args.api_key, args.base_url, args.max_rounds)
        debate_results = processor.process_instances(seeds, metrics_config, args.delay)
        save_jury_debate_results(debate_results, args.output)

        logging.info("Jury debate system completed successfully!")

    except KeyboardInterrupt:
        logging.info("Interrupted by user.")
    except Exception as e:
        logging.error(f"Error during jury debate processing: {e}")
        raise
    finally:
        logging.shutdown()

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()