#!/usr/bin/env python3
"""
Integrated MOE and Jury Debate System
=====================================

This script integrates both the Mixture of Experts (MOE) candidate profiling system
and the Jury Debate system, allowing for:

1. Parallel execution of both approaches
2. Comparison of results between MOE and Jury Debate
3. Hybrid approach using Jury Debate as validation for MOE results
4. Comprehensive analysis and reporting

Usage:
    python integrated_moe_jury_system.py --api_key YOUR_KEY --nums 100
"""

import os
import json
import time
import logging
import argparse
import yaml
import asyncio
import aiohttp
from collections import defaultdict
from tqdm import tqdm
from typing import List, Dict, Any, Tuple, Optional
import re
from datetime import datetime
from dataclasses import dataclass
from enum import Enum

import sys,os 
sys.path.append(os.path.dirname(__file__))

# Import the existing systems
from .MOE_Pipeline_Standalone import (
    process_instances_moe_parallel, save_moe_results,
    load_metrics_config, load_seeds_data, map_scenario_to_metrics,
    get_scenario_dimensions_with_descriptions, 
    # classify_scenario_async,  # COMMENTED OUT - no longer using scenario classification
    DualAPIKeyManager
)
from ..jury.jury_debate_system import JuryDebateSystem, JuryDebateProcessor, DebateResult, DebateStatus

# --- Logging Setup ---
def setup_logging():
    """Reset the root logger and send INFO+ records to the console and a log file.

    Every handler currently attached to the root logger is detached *and
    closed* before the new configuration is installed, so calling this
    function more than once does not leak the file opened by an earlier call.

    The log file ``integrated_moe_jury_system.log`` is opened relative to the
    current working directory with ``mode='w'``, i.e. it is truncated on every
    call.
    """
    root_logger = logging.root

    # Walk a copy: removeHandler() mutates the list being iterated.
    for stale_handler in root_logger.handlers[:]:
        root_logger.removeHandler(stale_handler)
        # Detaching alone leaves the handler's stream/file descriptor open.
        stale_handler.close()

    logging.basicConfig(
        level=logging.INFO,
        format='[%(levelname)s] %(message)s',
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler("integrated_moe_jury_system.log", mode='w'),
        ],
    )

class ComparisonMode(Enum):
    """Strategies for combining the MOE pipeline with the Jury Debate system."""

    # Both systems are run independently of each other.
    PARALLEL = "parallel"
    # Jury Debate is used to validate the MOE results.
    HYBRID = "hybrid"
    # Jury Debate is consulted only for uncertain cases.
    VALIDATION = "validation"

@dataclass
class ComparisonResult:
    """Side-by-side record of the MOE and Jury Debate verdicts for one instance."""

    # Identifier shared by the MOE record and the jury record.
    instance_id: str
    # Scenario label taken from the MOE record.
    scenario: str
    # Winner reported by each system.
    moe_winner: str
    jury_winner: str
    # True when both systems named the same winner.
    agreement: bool
    # Confidence scores computed for each system's verdict.
    moe_confidence: float
    jury_confidence: float
    # Raw tallies/scores backing each verdict.
    moe_scores: Dict[str, Any]
    jury_scores: Dict[str, Any]
    # Verdict finally selected and the system it came from.
    final_winner: str
    method_used: str

class IntegratedSystem:
    """Main class for the integrated MOE and Jury Debate system"""
    
    def __init__(self, api_key_1: str, api_key_2: str, base_url: str, max_rounds: int = 4):
        self.api_key_1 = api_key_1
        self.api_key_2 = api_key_2
        self.base_url = base_url
        self.max_rounds = max_rounds
        self.logger = logging.getLogger(__name__)
        
        # Initialize both systems
        self.jury_system = JuryDebateSystem(api_key_1, base_url, max_rounds)
        self.jury_processor = JuryDebateProcessor(api_key_1, base_url, max_rounds)
    
    async def run_moe_analysis(self, instances: List[Dict], metrics_config: Dict[str, Any], 
                              workers: int = 10, max_retries: int = 3) -> List[Dict]:
        """Run MOE analysis on instances"""
        self.logger.info("Starting MOE analysis...")
        
        # Create dual API key manager
        api_key_manager = DualAPIKeyManager(self.api_key_1, self.api_key_2)
        
        # Create aiohttp session with SSL context to handle certificate issues
        import ssl
        ssl_context = ssl.create_default_context()
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE
        
        connector = aiohttp.TCPConnector(limit=workers, limit_per_host=workers, ssl=ssl_context)
        timeout = aiohttp.ClientTimeout(total=90)
        
        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            # COMMENTED OUT: No longer using scenario classification
            # Instead, we'll modify the MOE processing to use scenarios directly from seeds.json
            
            # Create a modified version of instances with scenarios from seeds.json
            modified_instances = []
            for instance in instances:
                # Get scenario directly from seeds.json
                scenario = instance.get('scenario', 'default')
                if not scenario:
                    scenario = 'default'
                
                # Map scenario to metrics.yaml format if needed
                mapped_scenario = map_scenario_to_metrics(scenario)
                
                # Create modified instance with scenario info
                modified_instance = instance.copy()
                modified_instance['scenario'] = mapped_scenario
                modified_instance['original_scenario'] = scenario
                modified_instances.append(modified_instance)
            
            # Now run MOE analysis with pre-classified scenarios
            moe_results = await process_instances_moe_parallel(
                modified_instances, metrics_config, api_key_manager, self.base_url, 
                max_retries, workers
            )
        
        self.logger.info(f"MOE analysis completed: {len(moe_results)} instances processed")
        return moe_results
    
    def run_jury_debate_on_moe_scores(self, moe_results: List[Dict], instances: List[Dict]) -> List[Dict]:
        """Run Jury Debate on the dimension scores from MOE analysis
        
        Timing Strategy:
        - Default delay between instances: 2 seconds (30 RPM potential)
        - Exponential backoff for 429 errors: 2, 4, 8, 16, 32 seconds
        - Max 5 retries for rate limit errors
        - Non-rate-limit errors are not retried
        """
        self.logger.info("Starting Jury Debate on MOE dimension scores...")
        
        jury_results = []
        
        for i, moe_result in enumerate(moe_results):
            if moe_result.get("status") != "ok":
                jury_results.append({
                    "id": moe_result.get("id", f"instance_{i}"),
                    "status": "failed",
                    "error": "MOE analysis failed"
                })
                continue
            
            # Get the corresponding instance
            if i < len(instances):
                instance = instances[i]
            else:
                self.logger.warning(f"Instance index {i} out of range for instances list")
                continue
                
            instance_id = moe_result.get("id", f"instance_{i}")
            
            # Extract dimension scores from MOE results
            dimension_scores = self._extract_dimension_scores(moe_result)
            
            if not dimension_scores:
                jury_results.append({
                    "id": instance_id,
                    "status": "failed",
                    "error": "No valid dimension scores found"
                })
                continue
            
            # Create jury debate prompt based on dimension scores
            jury_prompt = self._create_jury_prompt_from_dimensions(
                instance, dimension_scores, moe_result.get("scenario", "unknown")
            )
            
            # Conduct jury debate with exponential backoff for 429 errors
            max_retries = 5
            base_delay = 2  # Start with 2 seconds
            
            for attempt in range(max_retries):
                try:
                    debate_result = self.jury_system.conduct_debate_on_scores(
                        instance, dimension_scores, jury_prompt
                    )
                    
                    # Check if jury debate failed
                    if debate_result.status == DebateStatus.ERROR:
                        jury_results.append({
                            "id": instance_id,
                            "status": "failed",
                            "error": "Jury debate failed - no fallback available"
                        })
                        break
                    
                    jury_results.append({
                        "id": instance_id,
                        "scenario": moe_result.get("scenario", "unknown"),
                        "moe_result": moe_result,
                        "jury_debate": debate_result,
                        "status": "ok"
                    })
                    
                    self.logger.info(f"Jury debate completed for instance {instance_id}")
                    break  # Success, exit retry loop
                    
                except Exception as e:
                    error_str = str(e).lower()
                    
                    # Check if it's a 429 rate limit error
                    if "429" in error_str or "rate limit" in error_str or "too many requests" in error_str:
                        if attempt < max_retries - 1:
                            # Calculate exponential backoff delay: 2, 4, 8, 16, 32 seconds
                            delay = base_delay * (2 ** attempt)
                            self.logger.warning(f"Rate limit (429) for instance {instance_id}. Retry {attempt + 1}/{max_retries} after {delay} seconds...")
                            time.sleep(delay)
                            continue
                        else:
                            # Max retries reached for rate limit
                            self.logger.error(f"Max retries ({max_retries}) reached for rate limit on instance {instance_id}")
                            jury_results.append({
                                "id": instance_id,
                                "status": "failed",
                                "error": f"Rate limit exceeded after {max_retries} retries"
                            })
                    else:
                        # Non-rate-limit error, don't retry
                        self.logger.error(f"Error in jury debate for instance {instance_id}: {e}")
                        jury_results.append({
                            "id": instance_id,
                            "status": "failed",
                            "error": f"Jury debate error: {str(e)}"
                        })
                        break
            
            # Add small delay between instances to prevent overwhelming the API
            # This is separate from the exponential backoff for 429 errors
            if i < len(moe_results) - 1:  # Don't delay after the last instance
                time.sleep(2.0)  # 2 second delay between instances
        
        self.logger.info(f"Jury Debate on MOE scores completed: {len(jury_results)} instances processed")
        return jury_results
    
    def _extract_dimension_scores(self, moe_result: Dict) -> Dict[str, Dict]:
        """Extract dimension scores from MOE results"""
        dimension_scores = {}
        
        # Debug logging
        self.logger.debug(f"Extracting dimension scores from MOE result: {moe_result.keys()}")
        
        results = moe_result.get("results", {})
        self.logger.debug(f"MOE results structure: {list(results.keys()) if results else 'No results'}")
        
        for dimension, result in results.items():
            expert_evaluation = result.get("expert_evaluation")
            if expert_evaluation in ['1', '2', 'tie']:
                # Extract reasoning from API response if available
                api_response = result.get("api_response", "")
                reasoning = ""
                
                # Try to extract reasoning from API response
                if api_response and isinstance(api_response, str):
                    # Look for reasoning in the API response
                    if '<think>' in api_response and '</think>' in api_response:
                        think_start = api_response.find('<think>') + 7
                        think_end = api_response.find('</think>')
                        if think_end > think_start:
                            reasoning = api_response[think_start:think_end].strip()
                    else:
                        # If no think tags, use the first part of the response
                        reasoning = api_response[:300] + "..." if len(api_response) > 300 else api_response
                
                dimension_scores[dimension] = {
                    "evaluation": expert_evaluation,
                    "reasoning": reasoning,
                    "model": result.get("expert_model", ""),
                    "api_response": api_response
                }
                
                self.logger.debug(f"Extracted dimension {dimension}: {expert_evaluation} from {result.get('expert_model', 'unknown')}")
        
        self.logger.info(f"Extracted {len(dimension_scores)} valid dimension scores")
        return dimension_scores
    
    def _create_jury_prompt_from_dimensions(self, instance: Dict, dimension_scores: Dict, scenario: str) -> str:
        """Create jury debate prompt based on dimension scores"""
        
        # Create dimension summary
        dimension_summary = ""
        for dimension, score_data in dimension_scores.items():
            evaluation = score_data["evaluation"]
            reasoning = score_data["reasoning"]
            model = score_data["model"]
            
            # Truncate reasoning if too long
            if len(reasoning) > 200:
                reasoning = reasoning[:200] + "..."
            
            dimension_summary += f"\n- {dimension}: Response {evaluation} is better (Expert: {model})\n  Reasoning: {reasoning}"
        
        # Count votes
        votes_1 = sum(1 for score in dimension_scores.values() if score["evaluation"] == "1")
        votes_2 = sum(1 for score in dimension_scores.values() if score["evaluation"] == "2")
        votes_tie = sum(1 for score in dimension_scores.values() if score["evaluation"] == "tie")
        
        # Calculate confidence indicators
        total_votes = len(dimension_scores)
        margin = abs(votes_1 - votes_2)
        margin_percentage = (margin / total_votes * 100) if total_votes > 0 else 0
        
        prompt = f"""You are a jury panel evaluating the results of expert dimension analysis.

Scenario: {scenario}

Original Question: {instance.get('prompt', '')}

Response 1: {instance.get('response_a', '')}

Response 2: {instance.get('response_b', '')}

EXPERT DIMENSION ANALYSIS RESULTS:
{dimension_summary}

VOTE SUMMARY:
- Response 1 wins: {votes_1} dimensions
- Response 2 wins: {votes_2} dimensions  
- Ties: {votes_tie} dimensions
- Total dimensions evaluated: {total_votes}
- Margin of victory: {margin} dimensions ({margin_percentage:.1f}%)

Your task is to debate and reach consensus on the overall winner based on the expert dimension analysis.

Consider:
1. Which response won more dimensions and by what margin?
2. Are the winning dimensions more important for this specific scenario?
3. Do the expert reasonings support the overall conclusion?
4. Are there any conflicting signals between dimensions?
5. Does the margin of victory suggest a clear winner or is it close?

Please provide your evaluation and reasoning for the overall winner. If the margin is small, consider the quality of the winning dimensions and their relevance to the scenario."""

        return prompt
    
    def _create_fallback_debate_result(self, instance: Dict, dimension_scores: Dict, jury_prompt: str) -> Dict:
        """Create a fallback debate result when the jury system method is not available"""
        
        # Count votes
        votes_1 = sum(1 for score in dimension_scores.values() if score["evaluation"] == "1")
        votes_2 = sum(1 for score in dimension_scores.values() if score["evaluation"] == "2")
        votes_tie = sum(1 for score in dimension_scores.values() if score["evaluation"] == "tie")
        
        # Determine winner based on votes
        if votes_1 > votes_2:
            final_winner = "1"
            consensus_reached = True
        elif votes_2 > votes_1:
            final_winner = "2"
            consensus_reached = True
        else:
            final_winner = "tie"
            consensus_reached = True
        
        # Create simple scores
        total_votes = len(dimension_scores)
        score_1 = (votes_1 / total_votes * 10) if total_votes > 0 else 5.0
        score_2 = (votes_2 / total_votes * 10) if total_votes > 0 else 5.0
        
        return {
            "final_winner": final_winner,
            "final_scores": {"A": score_1, "B": score_2},
            "consensus_reached": consensus_reached,
            "total_rounds": 1,
            "status": "completed",
            "method": "fallback_vote_counting"
        }
    
    def calculate_moe_confidence(self, moe_result: Dict) -> float:
        """Calculate confidence score for MOE result"""
        if moe_result.get("status") != "ok":
            return 0.0
        
        # Calculate confidence based on:
        # 1. Number of successful dimension evaluations
        # 2. Agreement among dimensions
        # 3. Margin of victory
        
        expert_results = moe_result.get("expert_results", {})
        successful_dims = sum(1 for dim_result in expert_results.values() 
                            if dim_result.get("success", False))
        total_dims = len(expert_results)
        
        if total_dims == 0:
            return 0.0
        
        # Base confidence from success rate
        success_rate = successful_dims / total_dims
        
        # Margin of victory
        wins_1 = moe_result.get("expert_wins_1", 0)
        wins_2 = moe_result.get("expert_wins_2", 0)
        ties = moe_result.get("expert_ties", 0)
        
        if wins_1 + wins_2 + ties == 0:
            margin_score = 0.0
        else:
            max_wins = max(wins_1, wins_2)
            total_votes = wins_1 + wins_2 + ties
            margin_score = max_wins / total_votes if total_votes > 0 else 0.0
        
        # Combined confidence score
        confidence = (success_rate * 0.6) + (margin_score * 0.4)
        return confidence
    
    def calculate_jury_confidence(self, jury_result: Dict) -> float:
        """Calculate confidence score for Jury Debate result"""
        
        # Handle different jury result formats
        if hasattr(jury_result, 'jury_debate') and hasattr(jury_result.jury_debate, 'consensus_reached'):
            # It's a DebateResult object wrapped in a dictionary
            debate_result = jury_result.jury_debate
            consensus_reached = debate_result.consensus_reached
            total_rounds = debate_result.total_rounds
            final_winner = debate_result.final_winner
        elif hasattr(jury_result, 'consensus_reached'):
            # It's a DebateResult object directly
            consensus_reached = jury_result.consensus_reached
            total_rounds = jury_result.total_rounds
            final_winner = jury_result.final_winner
        else:
            # It's a dictionary with jury_debate key containing DebateResult
            jury_debate_obj = jury_result.get("jury_debate")
            if hasattr(jury_debate_obj, 'consensus_reached'):
                # The jury_debate key contains a DebateResult object
                consensus_reached = jury_debate_obj.consensus_reached
                total_rounds = jury_debate_obj.total_rounds
                final_winner = jury_debate_obj.final_winner
            else:
                # It's a regular dictionary
                debate_result = jury_result.get("jury_debate", {})
                consensus_reached = debate_result.get("consensus_reached", False)
                total_rounds = debate_result.get("total_rounds", 0)
                final_winner = debate_result.get("final_winner", "unknown")
        
        max_rounds = self.max_rounds
        
        # Consensus bonus
        consensus_score = 1.0 if consensus_reached else 0.5
        
        # Round efficiency (fewer rounds = higher confidence)
        round_efficiency = 1.0 - (total_rounds / max_rounds) if max_rounds > 0 else 0.0
        
        # Agreement with MOE dimension analysis (if available)
        # For now, we'll use a default value since we don't have MOE result in this context
        agreement_with_moe = 0.5  # Default neutral value
        
        # Combined confidence score
        confidence = (consensus_score * 0.4) + (round_efficiency * 0.3) + (agreement_with_moe * 0.3)
        return confidence
    
    def compare_results(self, moe_results: List[Dict], jury_results: List[Dict]) -> List[ComparisonResult]:
        """Compare MOE and Jury Debate results"""
        comparison_results = []
        
        # Create lookup dictionaries
        moe_lookup = {result["id"]: result for result in moe_results}
        jury_lookup = {result["id"]: result for result in jury_results}
        
        for instance_id in moe_lookup.keys():
            moe_result = moe_lookup.get(instance_id)
            jury_result = jury_lookup.get(instance_id)
            
            if not moe_result or not jury_result:
                continue
            
            # Debug logging
            self.logger.info(f"Processing comparison for instance {instance_id}")
            self.logger.info(f"Jury result type: {type(jury_result)}")
            self.logger.info(f"Jury result keys: {jury_result.keys() if hasattr(jury_result, 'keys') else 'No keys'}")
            
            # Extract winners
            moe_winner = moe_result.get("overall_winner", "unknown")
            
            # Handle different jury result formats
            if hasattr(jury_result, 'jury_debate') and hasattr(jury_result.jury_debate, 'final_winner'):
                # It's a DebateResult object wrapped in a dictionary
                debate_result = jury_result.jury_debate
                jury_winner = debate_result.final_winner
                jury_debate_data = {
                    "final_winner": debate_result.final_winner,
                    "final_scores": debate_result.final_scores,
                    "consensus_reached": debate_result.consensus_reached,
                    "total_rounds": debate_result.total_rounds
                }
            elif hasattr(jury_result, 'final_winner'):
                # It's a DebateResult object directly
                jury_winner = jury_result.final_winner
                jury_debate_data = {
                    "final_winner": jury_result.final_winner,
                    "final_scores": jury_result.final_scores,
                    "consensus_reached": jury_result.consensus_reached,
                    "total_rounds": jury_result.total_rounds
                }
            else:
                # It's a dictionary with jury_debate key containing DebateResult
                jury_debate_obj = jury_result.get("jury_debate")
                if hasattr(jury_debate_obj, 'final_winner'):
                    # The jury_debate key contains a DebateResult object
                    jury_winner = jury_debate_obj.final_winner
                    jury_debate_data = {
                        "final_winner": jury_debate_obj.final_winner,
                        "final_scores": jury_debate_obj.final_scores,
                        "consensus_reached": jury_debate_obj.consensus_reached,
                        "total_rounds": jury_debate_obj.total_rounds
                    }
                else:
                    # It's a regular dictionary
                    jury_debate_data = jury_result.get("jury_debate", {})
                    jury_winner = jury_debate_data.get("final_winner", "unknown")
            
            # Calculate confidence scores
            moe_confidence = self.calculate_moe_confidence(moe_result)
            jury_confidence = self.calculate_jury_confidence(jury_result)
            
            # Check agreement
            agreement = moe_winner == jury_winner
            
            # Determine final winner based on confidence
            if moe_confidence > jury_confidence:
                final_winner = moe_winner
                method_used = "MOE"
            else:
                final_winner = jury_winner
                method_used = "Jury"
            
            comparison_result = ComparisonResult(
                instance_id=instance_id,
                scenario=moe_result.get("scenario", "unknown"),
                moe_winner=moe_winner,
                jury_winner=jury_winner,
                agreement=agreement,
                moe_confidence=moe_confidence,
                jury_confidence=jury_confidence,
                moe_scores={
                    "wins_1": moe_result.get("expert_wins_1", 0),
                    "wins_2": moe_result.get("expert_wins_2", 0),
                    "ties": moe_result.get("expert_ties", 0)
                },
                jury_scores=jury_debate_data.get("final_scores", {}),
                final_winner=final_winner,
                method_used=method_used
            )
            
            comparison_results.append(comparison_result)
        
        return comparison_results
    
    async def run_hybrid_analysis(self, instances: List[Dict], metrics_config: Dict[str, Any],
                                 confidence_threshold: float = 0.7, workers: int = 10) -> List[Dict]:
        """Run hybrid analysis using MOE first, then Jury Debate for low-confidence cases"""
        self.logger.info("Starting hybrid analysis...")
        
        # Step 1: Run MOE analysis
        moe_results = await self.run_moe_analysis(instances, metrics_config, workers)
        
        # Step 2: Identify low-confidence cases
        low_confidence_instances = []
        high_confidence_results = []
        
        for i, moe_result in enumerate(moe_results):
            confidence = self.calculate_moe_confidence(moe_result)
            
            if confidence < confidence_threshold:
                low_confidence_instances.append(instances[i])
                self.logger.info(f"Instance {moe_result.get('id', 'unknown')} has low confidence ({confidence:.2f}), will use Jury Debate")
            else:
                high_confidence_results.append(moe_result)
        
        # Step 3: Run Jury Debate for low-confidence cases
        jury_results = []
        if low_confidence_instances:
            self.logger.info(f"Running Jury Debate for {len(low_confidence_instances)} low-confidence instances...")
            jury_results = self.run_jury_debate_on_moe_scores(moe_results, instances)
        
        # Step 4: Combine results
        final_results = high_confidence_results + jury_results
        
        self.logger.info(f"Hybrid analysis completed: {len(high_confidence_results)} MOE + {len(jury_results)} Jury Debate")
        
        return final_results
    
    async def run_parallel_analysis(self, instances: List[Dict], metrics_config: Dict[str, Any],
                                   workers: int = 10) -> Tuple[List[Dict], List[Dict], List[ComparisonResult]]:
        """Run both MOE and Jury Debate analyses in parallel"""
        self.logger.info("Starting parallel analysis...")
        
        # Step 1: Run MOE analysis
        moe_results = await self.run_moe_analysis(instances, metrics_config, workers)
        
        # Step 2: Run Jury Debate on MOE scores
        jury_results = self.run_jury_debate_on_moe_scores(moe_results, instances)
        
        # Step 3: Compare results
        comparison_results = self.compare_results(moe_results, jury_results)
        
        self.logger.info(f"Parallel analysis completed: {len(moe_results)} MOE + {len(jury_results)} Jury Debate")
        
        return moe_results, jury_results, comparison_results

def save_integrated_results(moe_results: List[Dict], jury_results: List[Dict], 
                          comparison_results: List[ComparisonResult], 
                          out_path: str = 'integrated_results.json'):
    """Write the integrated results and a summary to disk, then print a report.

    Two JSON files are written: ``out_path`` with the MOE results, the jury
    results (converted to JSON-serializable dicts), the comparison records and
    a summary; and a sibling ``*_summary.json`` holding only the summary.  The
    summary file name is derived from the end of ``out_path`` only, so it can
    never equal ``out_path`` itself.

    All rates and percentages are reported as 0 when ``comparison_results`` is
    empty instead of dividing by zero.

    Args:
        moe_results: MOE result dicts (must be JSON-serializable).
        jury_results: Jury result dicts; debate objects are converted.
        comparison_results: Per-instance comparison records.
        out_path: Destination of the full results file.
    """
    # Convert jury results to JSON-serializable format
    converted_jury_results = convert_jury_results_to_dict(jury_results)

    # Convert comparison results to dict format
    comparison_fields = (
        "instance_id", "scenario", "moe_winner", "jury_winner", "agreement",
        "moe_confidence", "jury_confidence", "moe_scores", "jury_scores",
        "final_winner", "method_used",
    )
    comparison_dicts = [
        {field: getattr(result, field) for field in comparison_fields}
        for result in comparison_results
    ]

    compared = len(comparison_results)

    def _share(amount):
        """Return ``amount`` per compared instance, or 0 when nothing was compared."""
        return amount / compared if compared else 0

    agreement_count = len([r for r in comparison_results if r.agreement])

    # Save all results
    integrated_results = {
        "moe_results": moe_results,
        "jury_results": converted_jury_results,
        "comparison_results": comparison_dicts,
        "summary": {
            "total_instances": len(moe_results),
            "moe_successful": len([r for r in moe_results if r.get("status") == "ok"]),
            "jury_successful": len([r for r in converted_jury_results if r.get("status") == "ok"]),
            "agreement_rate": _share(agreement_count),
            "moe_avg_confidence": _share(sum(r.moe_confidence for r in comparison_results)),
            "jury_avg_confidence": _share(sum(r.jury_confidence for r in comparison_results)),
            "moe_preferred": len([r for r in comparison_results if r.method_used == "MOE"]),
            "jury_preferred": len([r for r in comparison_results if r.method_used == "Jury"])
        }
    }

    with open(out_path, 'w') as f:
        json.dump(integrated_results, f, indent=2)

    # Derive the summary path from the file extension only; a blanket
    # str.replace would also rewrite ".json" inside directory names and would
    # return out_path unchanged (overwriting the results) when it has no ".json".
    if out_path.endswith('.json'):
        summary_path = out_path[:-len('.json')] + '_summary.json'
    else:
        summary_path = out_path + '_summary.json'
    with open(summary_path, 'w') as f:
        json.dump(integrated_results["summary"], f, indent=2)

    logging.info(f"Integrated results saved to {out_path} and {summary_path}")

    # Print comprehensive analysis similar to moe_candidate_profiling_500_winners.py
    print_comprehensive_analysis(moe_results, jury_results, comparison_results)

    # Print summary
    print("\nIntegrated MOE and Jury Debate System Summary:")
    print("=" * 60)
    summary = integrated_results["summary"]
    print(f"Total Instances: {summary['total_instances']}")
    print(f"MOE Successful: {summary['moe_successful']}")
    print(f"Jury Successful: {summary['jury_successful']}")
    print(f"Agreement Rate: {summary['agreement_rate']:.1%}")
    print(f"MOE Avg Confidence: {summary['moe_avg_confidence']:.2f}")
    print(f"Jury Avg Confidence: {summary['jury_avg_confidence']:.2f}")
    print(f"MOE Preferred: {summary['moe_preferred']}")
    print(f"Jury Preferred: {summary['jury_preferred']}")

    # Print agreement analysis
    disagreement_count = compared - agreement_count

    print(f"\nAgreement Analysis:")
    print(f"  Agree: {agreement_count} ({_share(agreement_count)*100:.1f}%)")
    print(f"  Disagree: {disagreement_count} ({_share(disagreement_count)*100:.1f}%)")

    # Print confidence comparison
    moe_higher = len([r for r in comparison_results if r.moe_confidence > r.jury_confidence])
    jury_higher = len([r for r in comparison_results if r.jury_confidence > r.moe_confidence])
    # "Equal" means within 0.01, so it can overlap with the two counts above.
    equal = len([r for r in comparison_results if abs(r.moe_confidence - r.jury_confidence) < 0.01])

    print(f"\nConfidence Comparison:")
    print(f"  MOE Higher: {moe_higher} ({_share(moe_higher)*100:.1f}%)")
    print(f"  Jury Higher: {jury_higher} ({_share(jury_higher)*100:.1f}%)")
    print(f"  Equal: {equal} ({_share(equal)*100:.1f}%)")

def convert_debate_result_to_dict(debate_result):
    """Flatten a DebateResult-like object into a plain dict for JSON output.

    Any value without a ``final_winner`` attribute (for instance something
    that is already a dict) is returned unchanged.
    """
    # Guard clause: only objects that look like a debate result are converted.
    if not hasattr(debate_result, 'final_winner'):
        return debate_result

    payload = {}
    for field_name in ("final_winner", "final_scores", "consensus_reached", "total_rounds"):
        payload[field_name] = getattr(debate_result, field_name)

    # Enum-style statuses expose ``.value``; anything else is stringified.
    raw_status = debate_result.status
    if hasattr(raw_status, 'value'):
        payload["status"] = raw_status.value
    else:
        payload["status"] = str(raw_status)
    return payload

def convert_jury_results_to_dict(jury_results):
    """Return shallow copies of the jury results that are safe to JSON-dump.

    Each entry is copied so the caller's records are not modified; when an
    entry's ``'jury_debate'`` value carries a ``final_winner`` attribute it is
    replaced by the dict produced by ``convert_debate_result_to_dict``.
    """
    serializable = []
    for entry in jury_results:
        record = entry.copy()
        debate = record.get('jury_debate')
        # A missing key yields None, which has no ``final_winner`` attribute.
        if hasattr(debate, 'final_winner'):
            record['jury_debate'] = convert_debate_result_to_dict(debate)
        serializable.append(record)
    return serializable

def print_comprehensive_analysis(moe_results: List[Dict], jury_results: List[Dict], comparison_results: List[ComparisonResult]):
    """Print the full report: a banner followed by the four numbered sections.

    Delegates to the per-section printers in order: MOE analysis, Jury
    analysis, integration analysis, and the instance-level table.
    """
    banner_rule = "=" * 120
    print(f"\n{banner_rule}")
    print("COMPREHENSIVE INTEGRATED SYSTEM ANALYSIS")
    print(banner_rule)

    print_moe_analysis(moe_results)                # section 1
    print_jury_analysis(jury_results)              # section 2
    print_integration_analysis(comparison_results)  # section 3
    print_instance_analysis(moe_results, jury_results, comparison_results)  # section 4

def print_moe_analysis(moe_results: List[Dict]):
    """Print detailed MOE system analysis with a per-dimension table.

    Only instances whose ``"status"`` is ``"ok"`` contribute to the accuracy
    figures.  For each of them the ``"overall_winner"`` is compared with the
    ground-truth ``"winner"`` (after mapping ``model_a``/``model_b`` to
    ``"1"``/``"2"``), and every entry of ``"results"`` that has a non-None
    ``"correct"`` flag is tallied per dimension.  Dimensions whose matching
    ``"expert_results"`` entry reports ``success == False`` are counted as
    failed calls and left out of the dimension mean.

    Args:
        moe_results: Per-instance MOE result dicts.  May be empty, in which
            case all rates are reported as 0.
    """
    print("\n1. MOE SYSTEM ANALYSIS")
    print("-" * 50)

    # Per-dimension tallies, created lazily on first touch.
    dimension_stats = defaultdict(lambda: {"correct": 0, "total": 0, "accuracy": 0.0, "failed_calls": 0})

    # Instance-level overall accuracy counters.
    instance_overall_correct = 0
    instance_overall_total = 0

    for instance_result in moe_results:
        if instance_result.get("status") != "ok":
            continue

        expert_overall = instance_result.get("overall_winner")
        ground_truth_overall = instance_result.get("winner")

        # Map ground truth format: model_a -> "1", model_b -> "2"
        if ground_truth_overall == "model_a":
            ground_truth_overall = "1"
        elif ground_truth_overall == "model_b":
            ground_truth_overall = "2"

        # Instances lacking either side of the comparison are not counted.
        if expert_overall and ground_truth_overall:
            instance_overall_total += 1
            if expert_overall == ground_truth_overall:
                instance_overall_correct += 1

        for dimension, result in instance_result.get("results", {}).items():
            if result.get("correct") is not None:
                dimension_stats[dimension]["total"] += 1
                if result["correct"]:
                    dimension_stats[dimension]["correct"] += 1

            # A missing expert entry or missing "success" key counts as success.
            expert_result = instance_result.get("expert_results", {}).get(dimension, {})
            if not expert_result.get("success", True):
                dimension_stats[dimension]["failed_calls"] += 1

    # Accuracy per dimension (stays 0.0 when nothing was scored).
    for stats in dimension_stats.values():
        if stats["total"] > 0:
            stats["accuracy"] = stats["correct"] / stats["total"]

    instance_overall_accuracy = instance_overall_correct / instance_overall_total if instance_overall_total > 0 else 0

    # Dimension-level statistics table
    print("\nDimension-Level Statistics:")
    print(f"{'Dimension':<25} {'Accuracy':<10} {'Correct':<8} {'Total':<6} {'Failed':<7} {'Expert Model':<40}")
    print("-" * 100)

    for dimension, stats in sorted(dimension_stats.items()):
        accuracy_pct = stats["accuracy"] * 100
        expert_model = get_expert_model_for_dimension(dimension)
        failed_calls = stats.get("failed_calls", 0)
        print(f"{dimension:<25} {accuracy_pct:>8.1f}% {stats['correct']:>8} {stats['total']:>6} {failed_calls:>7} {expert_model:<40}")

    print("-" * 100)
    instance_accuracy_pct = instance_overall_accuracy * 100

    # Mean accuracy over dimensions that were scored and had no failed API calls.
    valid_dimensions = [stats for stats in dimension_stats.values() if stats.get("failed_calls", 0) == 0 and stats["total"] > 0]
    dimension_mean = sum(stats["accuracy"] for stats in valid_dimensions) / len(valid_dimensions) if valid_dimensions else 0

    successful = sum(1 for r in moe_results if r.get("status") == "ok")
    total = len(moe_results)
    # Guard the ratio: an empty result list previously raised ZeroDivisionError.
    success_rate_pct = successful / total * 100 if total > 0 else 0.0
    total_dimensions = len(dimension_stats)
    failed_dimension_names = [dim for dim, stats in dimension_stats.items() if stats.get("failed_calls", 0) > 0]
    failed_dimensions = len(failed_dimension_names)

    print(f"{'INSTANCE OVERALL':<25} {instance_accuracy_pct:>8.1f}% ({instance_overall_correct}/{instance_overall_total})")
    print(f"{'DIMENSION MEAN':<25} {dimension_mean*100:>8.1f}% (excludes {failed_dimensions} failed API dimensions)")
    print(f"Success Rate: {successful}/{total} ({success_rate_pct:.1f}%)")
    print(f"Dimensions: {len(valid_dimensions)}/{total_dimensions} valid (excluded {failed_dimensions} with API failures)")

    # Consistency warning listing every dimension with at least one failed call.
    if failed_dimension_names:
        print(f"\n⚠️  Dimensions with API call failures: {', '.join(failed_dimension_names)}")
        print("   These dimensions are excluded from the DIMENSION MEAN calculation")

    # Dimension evaluation summary
    print("\nDimension Evaluation Summary:")
    print(f"Total Dimensions Evaluated: {len(dimension_stats)}")
    print(f"Successful Evaluations: {sum(stats['total'] for stats in dimension_stats.values())}")
    print(f"Failed Evaluations: {sum(stats.get('failed_calls', 0) for stats in dimension_stats.values())}")

def get_expert_model_for_dimension(dimension: str) -> str:
    """Look up which expert model is mapped to ``dimension``.

    The mapping is imported lazily from the MOE pipeline module; if that
    import fails, or the dimension has no entry, ``"Unknown"`` is returned.
    """
    fallback = "Unknown"
    try:
        from .MOE_Pipeline_Standalone import DIMENSION_TO_MODEL_MAPPING as model_by_dimension
    except ImportError:
        return fallback
    return model_by_dimension.get(dimension, fallback)

def print_jury_analysis(jury_results: List[Dict]):
    """Print detailed Jury system analysis.

    Reports how many jury evaluations have ``"status" == "ok"`` and, when at
    least one succeeded, consensus / round / method statistics gathered from
    each successful entry's ``"jury_debate"`` object.  Entries whose
    ``"jury_debate"`` value has no ``consensus_reached`` attribute (e.g. a
    plain dict) contribute nothing to those statistics.

    Args:
        jury_results: Per-instance jury result dicts.  May be empty (as in
            hybrid mode, where no separate jury pass is run); the success
            rate is then reported as 0.0% instead of raising.
    """
    print("\n2. JURY SYSTEM ANALYSIS")
    print("-" * 50)

    successful_jury = [r for r in jury_results if r.get("status") == "ok"]
    total_count = len(jury_results)
    success_count = len(successful_jury)
    failed_count = total_count - success_count
    # Guard the ratio: an empty list previously raised ZeroDivisionError.
    success_rate_pct = success_count / total_count * 100 if total_count > 0 else 0.0

    print(f"Total Jury Evaluations: {total_count}")
    print(f"Successful: {success_count}")
    print(f"Failed: {failed_count}")
    print(f"Success Rate: {success_rate_pct:.1f}%")

    if not successful_jury:
        return

    consensus_reached = 0
    total_rounds = 0
    method_used = {"intelligent_fallback": 0, "real_judges": 0}

    for result in successful_jury:
        jury_debate = result.get("jury_debate", {})
        if not hasattr(jury_debate, 'consensus_reached'):
            continue
        if jury_debate.consensus_reached:
            consensus_reached += 1
        total_rounds += jury_debate.total_rounds

        # Heuristic: the fallback path is detected by the word "intelligent"
        # appearing in the debate object's string form.
        if "intelligent" in str(jury_debate).lower():
            method_used["intelligent_fallback"] += 1
        else:
            method_used["real_judges"] += 1

    print("\nJury Debate Statistics:")
    print(f"Consensus Reached: {consensus_reached}/{success_count} ({consensus_reached/success_count*100:.1f}%)")
    print(f"Average Rounds: {total_rounds/success_count:.1f}")
    print("Method Used:")
    print(f"  - Intelligent Fallback: {method_used['intelligent_fallback']} ({method_used['intelligent_fallback']/success_count*100:.1f}%)")
    print(f"  - Real Judges: {method_used['real_judges']} ({method_used['real_judges']/success_count*100:.1f}%)")

def print_integration_analysis(comparison_results: List[ComparisonResult]):
    """Print how the MOE and Jury verdicts relate across all comparisons.

    Covers agreement counts, which method was selected (``method_used`` equal
    to ``"MOE"`` or ``"Jury"``), which side reported the higher confidence,
    and the mean confidence of each system.  Prints a short notice and
    returns when there is nothing to compare.
    """
    print("\n3. INTEGRATION ANALYSIS")
    print("-" * 50)

    if not comparison_results:
        print("No comparison results available")
        return

    n_results = len(comparison_results)

    def pct(count):
        # Share of all comparison results, as a percentage.
        return count / n_results * 100

    # Agreement analysis
    agreed = sum(1 for item in comparison_results if item.agreement)
    disagreed = n_results - agreed
    print(f"System Agreement: {agreed}/{n_results} ({pct(agreed):.1f}%)")
    print(f"System Disagreement: {disagreed}/{n_results} ({pct(disagreed):.1f}%)")

    # Method preference analysis
    picked_moe = sum(1 for item in comparison_results if item.method_used == "MOE")
    picked_jury = sum(1 for item in comparison_results if item.method_used == "Jury")
    print("\nMethod Preference:")
    print(f"MOE Preferred: {picked_moe} ({pct(picked_moe):.1f}%)")
    print(f"Jury Preferred: {picked_jury} ({pct(picked_jury):.1f}%)")

    # Confidence analysis.  "Higher" is a strict comparison while "Equal"
    # uses a 0.01 tolerance, so one result can land in both buckets.
    moe_ahead = jury_ahead = near_equal = 0
    for item in comparison_results:
        if item.moe_confidence > item.jury_confidence:
            moe_ahead += 1
        if item.jury_confidence > item.moe_confidence:
            jury_ahead += 1
        if abs(item.moe_confidence - item.jury_confidence) < 0.01:
            near_equal += 1
    print("\nConfidence Analysis:")
    print(f"MOE Higher Confidence: {moe_ahead} ({pct(moe_ahead):.1f}%)")
    print(f"Jury Higher Confidence: {jury_ahead} ({pct(jury_ahead):.1f}%)")
    print(f"Equal Confidence: {near_equal} ({pct(near_equal):.1f}%)")

    # Average confidence scores
    mean_moe = sum(item.moe_confidence for item in comparison_results) / n_results
    mean_jury = sum(item.jury_confidence for item in comparison_results) / n_results
    print("\nAverage Confidence Scores:")
    print(f"MOE: {mean_moe:.3f}")
    print(f"Jury: {mean_jury:.3f}")

def print_instance_analysis(moe_results: List[Dict], jury_results: List[Dict], comparison_results: List[ComparisonResult]):
    """Print one table row per comparison result.

    Args:
        moe_results: Accepted for signature symmetry with the other section
            printers; not read here.
        jury_results: Likewise not read here.
        comparison_results: Objects exposing ``instance_id``, ``scenario``,
            ``moe_winner``, ``jury_winner``, ``agreement``,
            ``moe_confidence``, ``jury_confidence`` and ``method_used``.

    Text cells are passed through ``str()`` before truncation/padding so that
    a ``None`` winner or a non-string instance ID is rendered instead of
    raising ``TypeError``; string values print exactly as before.
    """
    print("\n4. INSTANCE-LEVEL ANALYSIS")
    print("-" * 50)

    if not comparison_results:
        print("No comparison results available")
        return

    print(f"{'Instance ID':<15} {'Scenario':<20} {'MOE Winner':<12} {'Jury Winner':<12} {'Agreement':<10} {'MOE Conf':<10} {'Jury Conf':<10} {'Method':<10}")
    print("-" * 100)

    for result in comparison_results:
        # Truncate to one character less than the column width so columns stay separated.
        instance_id = str(result.instance_id)[:14]
        scenario = str(result.scenario)[:19]
        moe_winner = str(result.moe_winner)
        jury_winner = str(result.jury_winner)
        method = str(result.method_used)
        agreement = "✅" if result.agreement else "❌"

        print(f"{instance_id:<15} {scenario:<20} {moe_winner:<12} {jury_winner:<12} {agreement:<10} {result.moe_confidence:<10.3f} {result.jury_confidence:<10.3f} {method:<10}")

def main():
    """Command-line entry point for the integrated MOE + Jury Debate system.

    Parses the CLI arguments, loads the metrics configuration and seed
    instances, runs the analysis for the selected ``--mode`` and passes the
    results to ``save_integrated_results``.  When invoked with no arguments
    at all it prints the help text and returns.

    ``KeyboardInterrupt`` is logged and swallowed; any other exception is
    logged together with its traceback and re-raised.  ``logging.shutdown()``
    always runs once argument parsing has succeeded.
    """
    setup_logging()

    class _HelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
                         argparse.RawDescriptionHelpFormatter):
        """Append defaults to option help while keeping the epilog's line breaks."""

    parser = argparse.ArgumentParser(
        description="Integrated MOE and Jury Debate System",
        formatter_class=_HelpFormatter,
        # The multi-line example relies on the raw formatter above; the plain
        # defaults formatter would re-wrap it into a single paragraph.
        epilog=(
            "Examples:\n"
            "  python D-MOE-EVAL/main.py --api_key_1 YOUR_KEY \\\n"
            "    --base_url https://api.your-provider.com \\\n"
            "    --seeds_path D-MOE-EVAL/datasets/md_eval/seeds.json \\\n"
            "    --metrics_path D-MOE-EVAL/datasets/md_eval/metrics.yaml \\\n"
            "    --mode parallel --nums 50\n\n"
            "  python D-MOE-EVAL/main.py --api_key_1 KEY1 --api_key_2 KEY2 --workers 19 --max_rounds 4 --delay 2\n"
        )
    )
    parser.add_argument('--output', type=str, default='integrated_results.json', 
                       help='Output file for integrated results')
    parser.add_argument('--api_key_1', type=str, required=True, 
                       help='First API key (provide via CLI or env)')
    parser.add_argument('--api_key_2', type=str, default='', 
                       help='Second API key (optional)')
    parser.add_argument('--base_url', type=str, default='', 
                       help='Base URL for API calls (set via CLI or env)')
    parser.add_argument('--seeds_path', type=str, default='seeds.json', 
                       help='Path to seeds.json')
    parser.add_argument('--metrics_path', type=str, default='metrics.yaml', 
                       help='Path to metrics.yaml')
    parser.add_argument('--nums', type=int, default=50, 
                       help='Number of instances to process (default: 50)')
    parser.add_argument('--mode', type=str, choices=['parallel', 'hybrid', 'validation'], 
                       default='parallel', help='Analysis mode')
    parser.add_argument('--confidence_threshold', type=float, default=0.7, 
                       help='Confidence threshold for hybrid mode (default: 0.7)')
    parser.add_argument('--workers', type=int, default=19, 
                       help='Number of parallel workers for MOE (default: 19 for dual API keys)')
    parser.add_argument('--max_rounds', type=int, default=4, 
                       help='Maximum debate rounds for Jury system (default: 4)')
    # NOTE(review): --delay is parsed but never read anywhere in this function;
    # confirm whether it should be forwarded to the jury system.
    parser.add_argument('--delay', type=float, default=2, 
                       help='Delay between Jury API calls in seconds (default: 2.0)')
    parser.add_argument('--resume', type=int, default=0, 
                       help='Resume processing from instance number (0-indexed, default: 0)')
    parser.add_argument('--debug', action='store_true', 
                       help='Enable debug logging')
    
    # If run without arguments, show quick usage and full help instead of the
    # terse "required argument" error argparse would produce.
    if len(sys.argv) == 1:
        print("\nNo arguments provided. Showing usage and examples.\n")
        parser.print_help()
        return

    args = parser.parse_args()
    
    try:
        # Setup debug logging if requested
        if args.debug:
            logging.getLogger().setLevel(logging.DEBUG)
            logging.info("Debug mode enabled - detailed logging will be shown")
        
        # Load configuration
        logging.info("Loading metrics configuration...")
        metrics_config = load_metrics_config(args.metrics_path)
        logging.info(f"Loaded metrics config with {len(metrics_config)} scenarios")
        
        logging.info("Loading seeds data...")
        instances = load_seeds_data(args.seeds_path, args.nums, args.resume)
        if args.resume > 0:
            logging.info(f"Resuming from instance {args.resume} (0-indexed)")
        logging.info(f"Loaded {len(instances)} instances from {args.seeds_path}")
        
        # Validate instances data
        if not instances:
            logging.error("No instances loaded from seeds file!")
            return
        
        # Log the shape of the first instance as a quick sanity check
        first_instance = instances[0]
        logging.info(f"First instance keys: {list(first_instance.keys())}")
        logging.info(f"First instance ID: {first_instance.get('id', 'NO_ID')}")
        logging.info(f"First instance scenario: {first_instance.get('scenario', 'NO_SCENARIO')}")
        logging.info(f"First instance prompt length: {len(first_instance.get('prompt', ''))}")
        logging.info(f"First instance response_a length: {len(first_instance.get('response_a', ''))}")
        logging.info(f"First instance response_b length: {len(first_instance.get('response_b', ''))}")
        
        # Check if scenarios are available in seeds.json
        scenarios_in_seeds = [inst.get('scenario') for inst in instances if inst.get('scenario')]
        unique_scenarios = set(scenarios_in_seeds)
        logging.info(f"Scenarios found in seeds.json: {len(scenarios_in_seeds)}/{len(instances)}")
        logging.info(f"Unique scenarios: {list(unique_scenarios)[:10]}...")  # Show first 10
        
        # The default is applied only when NO instance carries a scenario.
        if not scenarios_in_seeds:
            logging.warning("No scenarios found in seeds.json! Using 'default' for all instances.")
            for instance in instances:
                if not instance.get('scenario'):
                    instance['scenario'] = 'default'
        
        logging.info(f"Processing {len(instances)} instances using {args.mode} mode with dual API keys...")
        if args.resume > 0:
            logging.info(f"Instance range: {args.resume} to {args.resume + len(instances) - 1} (0-indexed)")
        logging.info("NOTE: Using scenarios directly from seeds.json (no classification needed)")
        
        # Initialize the integrated system with both API keys, base URL and round limit
        integrated_system = IntegratedSystem(args.api_key_1, args.api_key_2, args.base_url, args.max_rounds)
        
        # Run analysis based on mode
        if args.mode == "parallel":
            moe_results, jury_results, comparison_results = asyncio.run(
                integrated_system.run_parallel_analysis(instances, metrics_config, args.workers)
            )
            
        elif args.mode == "hybrid":
            moe_results = asyncio.run(
                integrated_system.run_hybrid_analysis(instances, metrics_config, 
                                                   args.confidence_threshold, args.workers)
            )
            # Hybrid mode yields a single result list; no separate jury or
            # comparison lists are produced here.
            jury_results = []
            comparison_results = []
            
        elif args.mode == "validation":
            # Run MOE first, then Jury Debate for validation
            moe_results = asyncio.run(
                integrated_system.run_moe_analysis(instances, metrics_config, args.workers)
            )
            jury_results = integrated_system.run_jury_debate_on_moe_scores(moe_results, instances)
            comparison_results = integrated_system.compare_results(moe_results, jury_results)
        
        # Validate results before saving
        logging.info(f"MOE results: {len(moe_results)} instances")
        logging.info(f"Jury results: {len(jury_results)} instances")
        if comparison_results:
            logging.info(f"Comparison results: {len(comparison_results)} instances")
        
        # Check for successful results
        moe_successful = [r for r in moe_results if r.get("status") == "ok"]
        jury_successful = [r for r in jury_results if r.get("status") == "ok"]
        
        logging.info(f"MOE successful: {len(moe_successful)}/{len(moe_results)}")
        logging.info(f"Jury successful: {len(jury_successful)}/{len(jury_results)}")
        
        # Save results with resume information in filename if resuming
        output_file = args.output
        if args.resume > 0:
            # Add resume information to output filename
            base_name, ext = os.path.splitext(args.output)
            output_file = f"{base_name}_resume_{args.resume}{ext}"
            # Nothing has been written yet at this point, so say "will be".
            logging.info(f"Results will be saved to {output_file} (resume from instance {args.resume})")
        
        save_integrated_results(moe_results, jury_results, comparison_results, output_file)
        
        logging.info("Integrated system completed successfully!")
        
    except KeyboardInterrupt:
        logging.info("Interrupted by user.")
    except Exception as e:
        logging.error(f"Error during integrated analysis: {e}")
        import traceback
        logging.error(f"Traceback: {traceback.format_exc()}")
        raise
    finally:
        logging.shutdown()

# Script entry point: run the CLI only when executed directly, not on import.
# NOTE(review): the module's top-level imports are package-relative
# (`from .MOE_Pipeline_Standalone import ...`), so it presumably has to be
# launched as part of its package (e.g. `python -m ...`) — confirm.
if __name__ == "__main__":
    main() 