"""Main component evaluator that coordinates traditional metrics and LLM-as-judge evaluation."""

import sys
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
import json

# Append the directory two levels above this file's folder (used here as the
# project root) to sys.path; this runs as an import-time side effect.
root = Path(__file__).parent.parent.parent
sys.path.append(str(root))

from .config import ModelConfigLoader, EvaluationConfig
from .utils import EvaluationStateManager, DataLoader, EvaluationCache
from .metrics import TextSimilarityMetrics, ClassificationMetrics, StructuredMetrics, TimingMetrics, DrivingSafetyMetrics, SceneEvaluationMetrics
from .llm_judge import LLMJudgeEvaluator


class ComponentEvaluator:
    """Main component evaluator coordinating traditional metrics and LLM-as-judge."""
    
    def __init__(self, detailed_metrics: bool = True):
        """Build the evaluator together with all of its collaborators.

        Args:
            detailed_metrics: Display preference stored on the instance;
                True (the default) selects detailed metric breakdowns,
                False selects the summary display.
        """
        # Display configuration
        self.detailed_metrics = detailed_metrics

        # Configuration, persisted state, cache and data access
        self.models_config = ModelConfigLoader.get_all_models()
        self.state_manager = EvaluationStateManager()
        self.cache = EvaluationCache()
        self.data_loader = DataLoader()

        # General-purpose metric calculators
        self.text_metrics = TextSimilarityMetrics()
        self.classification_metrics = ClassificationMetrics()
        self.structured_metrics = StructuredMetrics()
        self.timing_metrics = TimingMetrics()

        # Domain-specific metric calculators
        self.driving_safety_metrics = DrivingSafetyMetrics()
        self.scene_evaluation_metrics = SceneEvaluationMetrics()

        # The LLM judge is created on first use by _get_llm_judge()
        self.llm_judge = None

        if detailed_metrics:
            detail_mode = "with detailed metric breakdowns"
        else:
            detail_mode = "with summary display"
        print(f"✅ Component evaluator initialized {detail_mode}")
    
    def _get_llm_judge(self) -> LLMJudgeEvaluator:
        """Get LLM judge evaluator (lazy initialization)."""
        if self.llm_judge is None:
            try:
                self.llm_judge = LLMJudgeEvaluator()
            except Exception as e:
                print(f"⚠️  Failed to initialize LLM judge: {e}")
                self.llm_judge = None
        return self.llm_judge
    
    def _format_traditional_metrics(self, component: str, metrics: Dict[str, float], detailed: bool = False) -> str:
        """Format traditional metrics for display with optional detailed breakdown.
        
        Args:
            component: Component name
            metrics: Traditional metrics dictionary
            detailed: Whether to show detailed breakdown of all metrics
            
        Returns:
            Formatted metrics string
        """
        if not metrics:
            return "No metrics"
        
        if detailed:
            return self._format_detailed_metrics(component, metrics)
        
        # Existing summary format (backward compatibility)
        if component == 'annotation':
            # Show BLEU, ROUGE-L, Semantic Similarity
            bleu = metrics.get('bleu', 0)
            rouge = metrics.get('rouge_l', 0) 
            semantic = metrics.get('semantic_similarity', 0)
            avg = (bleu + rouge + semantic) / 3 if any([bleu, rouge, semantic]) else 0
            return f"BLEU={bleu:.2f}, ROUGE-L={rouge:.2f}, Semantic={semantic:.2f} (Avg: {avg:.2f})"
            
        elif component == 'scene':
            # Show basic metrics + enhanced count
            bleu = metrics.get('bleu', 0)
            rouge = metrics.get('rouge_l', 0)
            semantic = metrics.get('semantic_similarity', 0) 
            coverage = metrics.get('scene_coverage', 0)
            
            # Count enhanced metrics
            temporal_metrics = len([k for k in metrics.keys() if 'temporal' in k or 'order' in k])
            coherence_metrics = len([k for k in metrics.keys() if 'coherence' in k or 'narrative' in k])
            critical_metrics = len([k for k in metrics.keys() if 'critical' in k and 'scene' in k])
            
            total_enhanced = temporal_metrics + coherence_metrics + critical_metrics
            
            avg = (bleu + rouge + semantic + coverage) / 4 if any([bleu, rouge, semantic, coverage]) else 0
            return f"BLEU={bleu:.2f}, ROUGE-L={rouge:.2f}, Semantic={semantic:.2f}, Coverage={coverage:.2f} (Avg: {avg:.2f}) +{total_enhanced} enhanced metrics"
            
        elif component in ['violation', 'accident']:
            # Show classification metrics + safety count
            precision = metrics.get('precision', 0)
            recall = metrics.get('recall', 0)
            f1 = metrics.get('f1', 0)
            accuracy = metrics.get('accuracy', 0)
            semantic = metrics.get('semantic_similarity', 0)
            reasoning = metrics.get('reasoning_quality', 0)
            
            # Count safety metrics
            safety_metrics = len([k for k in metrics.keys() if k.startswith('safety_')])
            
            avg = (precision + recall + f1 + accuracy + semantic) / 5 if any([precision, recall, f1, accuracy, semantic]) else 0
            return f"P={precision:.2f}, R={recall:.2f}, F1={f1:.2f}, Acc={accuracy:.2f}, Sem={semantic:.2f}, Reas={reasoning:.2f} (Avg: {avg:.2f}) +{safety_metrics} safety metrics"
            
        elif component == 'assessment':
            # Show enhanced assessment metrics + comprehensive count
            score_mae = metrics.get('safety_score_mae', 0)
            risk_acc = metrics.get('risk_level_accuracy', 0)
            eval_sim = metrics.get('evaluation_similarity', 0)
            advice_sim = metrics.get('advice_similarity', 0)
            
            # Count comprehensive safety metrics
            safety_metrics = len([k for k in metrics.keys() if k.startswith('safety_')])
            coverage_metrics = len([k for k in metrics.keys() if 'coverage' in k or 'similarity' in k])
            
            avg = (risk_acc + eval_sim + advice_sim) / 3 if any([risk_acc, eval_sim, advice_sim]) else 0
            return f"ScoreMAE={score_mae:.2f}, RiskAcc={risk_acc:.2f}, EvalSim={eval_sim:.2f}, AdviceSim={advice_sim:.2f} (Avg: {avg:.2f}) +{safety_metrics} safety, +{coverage_metrics} coverage metrics"
        
        # Fallback for unknown components
        return f"Avg: {self._calculate_traditional_average([metrics]):.2f} ({len(metrics)} total metrics)"

    def _format_detailed_metrics(self, component: str, metrics: Dict[str, float]) -> str:
        """Format detailed breakdown of all computed metrics by category.
        
        Args:
            component: Component name
            metrics: Traditional metrics dictionary
            
        Returns:
            Detailed formatted metrics string with categories
        """
        if not metrics:
            return "No detailed metrics available"
        
        # Categorize metrics
        categories = {
            'Basic': [],
            'Semantic': [],
            'Safety': [], 
            'Temporal': [],
            'Coherence': [],
            'Coverage': [],
            'Assessment': [],
            'Other': []
        }
        
        for metric_name, value in metrics.items():
            if not isinstance(value, (int, float)):
                continue
                
            # Categorize metrics based on naming patterns
            if metric_name in ['bleu', 'rouge_l', 'precision', 'recall', 'f1', 'accuracy', 'scene_coverage']:
                categories['Basic'].append((metric_name, value))
            elif 'semantic' in metric_name or 'reasoning' in metric_name:
                categories['Semantic'].append((metric_name, value))
            elif metric_name.startswith('safety_') or 'critical' in metric_name:
                categories['Safety'].append((metric_name, value))
            elif 'temporal' in metric_name or 'order' in metric_name or 'causality' in metric_name:
                categories['Temporal'].append((metric_name, value))
            elif 'coherence' in metric_name or 'narrative' in metric_name or 'transition' in metric_name:
                categories['Coherence'].append((metric_name, value))
            elif 'coverage' in metric_name or 'similarity' in metric_name:
                categories['Coverage'].append((metric_name, value))
            elif any(x in metric_name for x in ['score_mae', 'risk_level', 'evaluation', 'advice', 'strengths', 'weaknesses']):
                categories['Assessment'].append((metric_name, value))
            else:
                categories['Other'].append((metric_name, value))
        
        # Build detailed output
        output_lines = []
        total_metrics = sum(len(cat_metrics) for cat_metrics in categories.values())
        
        output_lines.append(f"📊 DETAILED METRICS BREAKDOWN ({total_metrics} total)")
        
        for category, cat_metrics in categories.items():
            if not cat_metrics:
                continue
                
            # Add category header with icon
            category_icons = {
                'Basic': '📈', 'Semantic': '🧠', 'Safety': '🛡️',
                'Temporal': '⏰', 'Coherence': '🔗', 'Coverage': '📋',
                'Assessment': '🎯', 'Other': '📌'
            }
            
            icon = category_icons.get(category, '📌')
            output_lines.append(f"   {icon} {category} ({len(cat_metrics)} metrics):")
            
            # Sort metrics by value (descending) to show best performers first
            sorted_metrics = sorted(cat_metrics, key=lambda x: x[1], reverse=True)
            
            for metric_name, value in sorted_metrics:
                # Clean up metric name for display
                display_name = metric_name.replace('_', ' ').title()
                if display_name.startswith('Safety '):
                    display_name = display_name[7:]  # Remove "Safety " prefix for cleaner display
                
                output_lines.append(f"      • {display_name}: {value:.3f}")
        
        return '\n'.join(output_lines)

    def show_detailed_metrics_for_model(self, component: str, model: str, results: Dict[str, Any]):
        """Show detailed metrics breakdown for a specific model.
        
        Args:
            component: Component name
            model: Model name
            results: Model evaluation results
        """
        if not results or not results.get('traditional_metrics'):
            print(f"No detailed metrics available for {model}")
            return
        
        print(f"\n📋 DETAILED METRICS FOR {model}")
        print("=" * 50)
        
        # Calculate average metrics
        avg_metrics = {}
        for metrics_dict in results['traditional_metrics']:
            for key, value in metrics_dict.items():
                if isinstance(value, (int, float)):
                    if key not in avg_metrics:
                        avg_metrics[key] = []
                    avg_metrics[key].append(value)
        
        final_metrics = {k: sum(v) / len(v) for k, v in avg_metrics.items() if v}
        
        if final_metrics:
            detailed_breakdown = self._format_detailed_metrics(component, final_metrics)
            print(detailed_breakdown)
        else:
            print("No metrics to display")
    
    
    
    def _format_llm_judge_metrics(self, component: str, metrics: Dict[str, Any]) -> str:
        """Format LLM judge metrics for display based on component type.
        
        Args:
            component: Component name  
            metrics: LLM judge metrics dictionary
            
        Returns:
            Formatted metrics string
        """
        if not metrics:
            return "No metrics"
        
        if component == 'annotation':
            # Show Accuracy, Completeness, Clarity
            accuracy = metrics.get('accuracy_score', 0)
            completeness = metrics.get('completeness_score', 0)
            clarity = metrics.get('clarity_score', 0)
            overall = metrics.get('overall_quality', 0)
            return f"Accuracy={accuracy}, Completeness={completeness}, Clarity={clarity} (Avg: {overall:.1f})"
            
        elif component == 'scene':
            # Show Extraction Quality, Temporal Coherence, Safety Relevance
            extraction = metrics.get('extraction_quality', 0)
            temporal = metrics.get('temporal_coherence', 0)
            safety = metrics.get('safety_relevance', 0)
            overall = metrics.get('overall_quality', 0)
            return f"Extraction={extraction}, Temporal={temporal}, Safety={safety} (Avg: {overall:.1f})"
            
        elif component == 'violation':
            # Show Detection Accuracy, Explanation Quality, Legal Consistency
            detection = metrics.get('detection_accuracy', 0)
            explanation = metrics.get('explanation_quality', 0)
            legal = metrics.get('legal_consistency', 0)
            overall = metrics.get('overall_quality', 0)
            return f"Detection={detection}, Explanation={explanation}, Legal={legal} (Avg: {overall:.1f})"
            
        elif component == 'accident':
            # Show Risk Assessment, Consequence Prediction, Context Understanding
            risk = metrics.get('risk_assessment_accuracy', 0)
            consequence = metrics.get('consequence_prediction', 0)
            context = metrics.get('context_understanding', 0)
            overall = metrics.get('overall_quality', 0)
            return f"RiskAssess={risk}, Consequence={consequence}, Context={context} (Avg: {overall:.1f})"
            
        elif component == 'assessment':
            # Show Assessment Accuracy, Advice Actionability, Score Justification
            assessment = metrics.get('assessment_accuracy', 0)
            advice = metrics.get('advice_actionability', 0)
            justification = metrics.get('score_justification', 0)
            overall = metrics.get('overall_quality', 0)
            return f"Assessment={assessment}, Advice={advice}, Justification={justification} (Avg: {overall:.1f})"
        
        # Fallback for unknown components
        overall = metrics.get('overall_quality', 0)
        return f"Overall: {overall:.1f}"
    
    def _display_metric_leaders(self, component: str, results: Dict[str, Any]):
        """Display metric leaders for the component with enhanced domain-specific metrics.
        
        Args:
            component: Component name
            results: Evaluation results by model
        """
        if not results:
            return
        
        # Collect all metrics across models
        traditional_leaders = {}
        llm_judge_leaders = {}
        
        for model, result in results.items():
            if not result:
                continue
                
            # Process traditional metrics
            if result.get('traditional_metrics'):
                avg_metrics = {}
                for metrics_dict in result['traditional_metrics']:
                    for key, value in metrics_dict.items():
                        if isinstance(value, (int, float)):
                            if key not in avg_metrics:
                                avg_metrics[key] = []
                            avg_metrics[key].append(value)
                
                # Average and store best performers
                for metric, values in avg_metrics.items():
                    avg_value = sum(values) / len(values)
                    if metric not in traditional_leaders or avg_value > traditional_leaders[metric][1]:
                        traditional_leaders[metric] = (model, avg_value)
            
            # Process LLM judge metrics
            if result.get('llm_judge_scores'):
                avg_llm_metrics = {}
                for metrics_dict in result['llm_judge_scores']:
                    if isinstance(metrics_dict, dict):
                        for key, value in metrics_dict.items():
                            if isinstance(value, (int, float)):
                                if key not in avg_llm_metrics:
                                    avg_llm_metrics[key] = []
                                avg_llm_metrics[key].append(value)
                
                # Average and store best performers
                for metric, values in avg_llm_metrics.items():
                    avg_value = sum(values) / len(values)
                    if metric not in llm_judge_leaders or avg_value > llm_judge_leaders[metric][1]:
                        llm_judge_leaders[metric] = (model, avg_value)
        
        if traditional_leaders or llm_judge_leaders:
            print(f"\n🏆 METRIC LEADERS")
            
            # Enhanced traditional metrics leaders with categories
            if traditional_leaders:
                print(f"📊 Traditional Metrics:")
                
                # Define metric categories for organized display
                metric_categories = {
                    'Basic Performance': {
                        'annotation': ['bleu', 'rouge_l', 'semantic_similarity'],
                        'scene': ['bleu', 'rouge_l', 'semantic_similarity', 'scene_coverage'],
                        'violation': ['precision', 'recall', 'f1', 'accuracy'],
                        'accident': ['precision', 'recall', 'f1', 'accuracy'], 
                        'assessment': ['safety_score_mae', 'risk_level_accuracy']
                    },
                    'Semantic & Reasoning': ['semantic_similarity', 'semantic_f1', 'reasoning_quality'],
                    'Domain-Specific Safety': [k for k in traditional_leaders.keys() if k.startswith('safety_')],
                    'Temporal & Order': [k for k in traditional_leaders.keys() if 'temporal' in k or 'order' in k],
                    'Scene Coherence': [k for k in traditional_leaders.keys() if 'coherence' in k or 'narrative' in k],
                    'Critical Detection': [k for k in traditional_leaders.keys() if 'critical' in k],
                    'Content Coverage': [k for k in traditional_leaders.keys() if 'coverage' in k and not k.startswith('safety_')]
                }
                
                # Show basic performance metrics first
                basic_metrics = metric_categories['Basic Performance'].get(component, [])
                for metric_key in basic_metrics:
                    if metric_key in traditional_leaders:
                        model, value = traditional_leaders[metric_key]
                        display_name = metric_key.replace('_', ' ').title()
                        if metric_key in ['safety_score_mae']:  # Lower is better
                            print(f"  • Best {display_name}: {model} ({value:.3f})")
                        else:
                            print(f"  • Best {display_name}: {model} ({value:.3f})")
                
                # Show enhanced metric categories
                for category, metrics_list in metric_categories.items():
                    if category == 'Basic Performance':
                        continue  # Already shown
                    
                    # Get component-specific metrics or use the full list
                    if isinstance(metrics_list, dict):
                        relevant_metrics = []
                    else:
                        relevant_metrics = metrics_list
                    
                    category_leaders = [(k, v) for k, v in traditional_leaders.items() 
                                      if k in relevant_metrics and k not in basic_metrics]
                    
                    if category_leaders:
                        category_icons = {
                            'Semantic & Reasoning': '🧠',
                            'Domain-Specific Safety': '🛡️', 
                            'Temporal & Order': '⏰',
                            'Scene Coherence': '🔗',
                            'Critical Detection': '🚨',
                            'Content Coverage': '📋'
                        }
                        
                        icon = category_icons.get(category, '📌')
                        print(f"  {icon} {category}:")
                        
                        # Sort by value (descending) and show top 3
                        sorted_leaders = sorted(category_leaders, key=lambda x: x[1][1], reverse=True)
                        for metric_key, (model, value) in sorted_leaders[:3]:
                            display_name = metric_key.replace('_', ' ').title()
                            if display_name.startswith('Safety '):
                                display_name = display_name[7:]  # Clean up safety prefix
                            print(f"    • {display_name}: {model} ({value:.3f})")
            
            # Enhanced LLM judge metrics leaders  
            if llm_judge_leaders:
                print(f"🤖 LLM Judge Metrics:")
                if component == 'annotation':
                    metrics_to_show = [('accuracy_score', 'Accuracy'), ('completeness_score', 'Completeness'), ('clarity_score', 'Clarity')]
                elif component == 'scene':
                    metrics_to_show = [('extraction_quality', 'Extraction'), ('temporal_coherence', 'Temporal'), ('safety_relevance', 'Safety')]
                elif component == 'violation':
                    metrics_to_show = [('detection_accuracy', 'Detection'), ('explanation_quality', 'Explanation'), ('legal_consistency', 'Legal')]
                elif component == 'accident':
                    metrics_to_show = [('risk_assessment_accuracy', 'Risk Assess'), ('consequence_prediction', 'Consequence'), ('context_understanding', 'Context')]
                elif component == 'assessment':
                    metrics_to_show = [('assessment_accuracy', 'Assessment'), ('advice_actionability', 'Advice'), ('score_justification', 'Justification')]
                else:
                    metrics_to_show = [(k, k.replace('_', ' ').title()) for k in llm_judge_leaders.keys() if k != 'overall_quality']
                
                for metric_key, display_name in metrics_to_show:
                    if metric_key in llm_judge_leaders:
                        model, value = llm_judge_leaders[metric_key]
                        print(f"  • Best {display_name}: {model} ({value:.0f})")
                
                # Show overall quality leader
                if 'overall_quality' in llm_judge_leaders:
                    model, value = llm_judge_leaders['overall_quality']
                    print(f"  • Best Overall: {model} ({value:.1f})")
            
            # Summary of enhanced metrics coverage
            total_traditional = len(traditional_leaders)
            safety_metrics = len([k for k in traditional_leaders.keys() if k.startswith('safety_')])
            enhanced_metrics = len([k for k in traditional_leaders.keys() 
                                  if any(x in k for x in ['temporal', 'coherence', 'critical', 'narrative'])])
            
            if safety_metrics > 0 or enhanced_metrics > 0:
                print(f"\n📈 Enhanced Metrics Summary:")
                print(f"   • Total metrics evaluated: {total_traditional}")
                if safety_metrics > 0:
                    print(f"   🛡️  Domain-specific safety metrics: {safety_metrics}")
                if enhanced_metrics > 0:
                    print(f"   🎯 Advanced evaluation metrics: {enhanced_metrics}")
    
    def display_resume_info(self, component: str):
        """Show what will be resumed vs. re-evaluated.
        
        Args:
            component: Component name
        """
        models = self.models_config[component]
        completed = [m for m in models if self.state_manager.is_evaluation_complete(component, m)]
        pending = [m for m in models if m not in completed]
        
        print(f"📋 {len(models)} models loaded for {component} component")
        
        if completed:
            print(f"✅ Found {len(completed)} completed evaluations")
            print(f"⏳ Will evaluate {len(pending)} remaining models")
            if len(completed) > 0:
                print(f"   💡 Use --overwrite to re-evaluate completed models")
                # Show first few completed models
                shown_completed = completed[:3]
                for model in shown_completed:
                    print(f"     ✅ {model}")
                if len(completed) > 3:
                    print(f"     ... and {len(completed) - 3} more")
        else:
            print(f"🆕 Starting fresh evaluation of {len(models)} models")
        
        if pending:
            print(f"\n⏳ Models to evaluate:")
            for model in pending:
                print(f"     🔄 {model}")
    
    def _evaluate_traditional_metrics(self, component: str, ground_truth: Dict[str, Any], 
                                    system_output: Dict[str, Any], video_id: str, 
                                    model: str) -> Optional[Dict[str, float]]:
        """Evaluate using traditional metrics with domain-specific enhancements.
        
        Args:
            component: Component name
            ground_truth: Ground truth data
            system_output: System output data
            video_id: Video identifier
            model: Model identifier
            
        Returns:
            Traditional metrics results or None if failed
        """
        try:
            if component == 'annotation':
                # Text similarity metrics for annotation
                gt_text = ground_truth.get('ground_truth', {}).get('annotation', '')
                sys_text = system_output.get('content', '')
                
                if not gt_text or not sys_text:
                    return {'bleu': 0.0, 'rouge_l': 0.0, 'semantic_similarity': 0.0}
                
                return self.text_metrics.evaluate_text_pair(gt_text, sys_text)
            
            elif component == 'scene':
                # Enhanced scene extraction evaluation with domain-specific metrics
                gt_scenes = ground_truth.get('ground_truth', {}).get('scenes', [])
                sys_scenes = system_output.get('content', [])
                
                if not isinstance(sys_scenes, list):
                    sys_scenes = []
                
                # Basic coverage and text similarity
                coverage = min(len(sys_scenes) / len(gt_scenes), 1.0) if gt_scenes else 0.0
                
                gt_text = ' '.join(gt_scenes) if gt_scenes else ''
                sys_text = ' '.join(sys_scenes) if sys_scenes else ''
                
                text_metrics = self.text_metrics.evaluate_text_pair(gt_text, sys_text)
                text_metrics['scene_coverage'] = coverage
                
                # Add domain-specific scene metrics
                try:
                    scene_metrics = self.scene_evaluation_metrics.calculate_comprehensive_scene_metrics(
                        sys_scenes, gt_scenes
                    )
                    text_metrics.update(scene_metrics)
                except Exception as e:
                    print(f"⚠️  Scene domain metrics failed: {e}")
                
                return text_metrics
            
            elif component in ['violation', 'accident']:
                # Enhanced classification metrics with semantic similarity
                if component == 'violation':
                    gt_items = ground_truth.get('ground_truth', {}).get('violations', [])
                    sys_items = system_output.get('content', [])
                    detection_type = 'violation'
                else:
                    gt_items = ground_truth.get('ground_truth', {}).get('accidents', [])
                    sys_items = system_output.get('content', [])
                    detection_type = 'accident'
                
                if not isinstance(gt_items, list) or not isinstance(sys_items, list):
                    return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'accuracy': 0.0, 'semantic_similarity': 0.0}
                
                # Use enhanced semantic classification metrics
                metrics = self.classification_metrics.calculate_semantic_classification_metrics(
                    gt_items, sys_items, detection_type
                )
                
                # Add domain-specific safety metrics if this is part of a complete assessment
                try:
                    if component == 'violation':
                        # Get accidents and assessment data for safety context
                        gt_accidents = ground_truth.get('ground_truth', {}).get('accidents', [])
                        gt_assessment = ground_truth.get('ground_truth', {}).get('assessment', {})
                        
                        safety_metrics = self.driving_safety_metrics.calculate_safety_criticality_score(
                            sys_items, gt_accidents  # Use predicted violations with GT accidents for context
                        )
                        
                        # Add safety criticality to metrics
                        for key, value in safety_metrics.items():
                            metrics[f'safety_{key}'] = value
                            
                    elif component == 'accident':
                        # Get violations and assessment for complete safety evaluation  
                        gt_violations = ground_truth.get('ground_truth', {}).get('violations', [])
                        gt_assessment = ground_truth.get('ground_truth', {}).get('assessment', {})
                        
                        safety_metrics = self.driving_safety_metrics.calculate_safety_criticality_score(
                            gt_violations, sys_items  # Use GT violations with predicted accidents
                        )
                        
                        # Add safety criticality to metrics
                        for key, value in safety_metrics.items():
                            metrics[f'safety_{key}'] = value
                            
                except Exception as e:
                    print(f"⚠️  Safety domain metrics failed for {component}: {e}")
                
                return metrics
            
            elif component == 'assessment':
                # Enhanced assessment evaluation with comprehensive metrics
                gt_assessment = ground_truth.get('ground_truth', {}).get('assessment', {})
                sys_assessment = system_output.get('content', {})
                
                if not isinstance(gt_assessment, dict) or not isinstance(sys_assessment, dict):
                    return {'safety_score_mae': 0.0, 'risk_level_accuracy': 0.0, 'evaluation_similarity': 0.0}
                
                # Use enhanced assessment metrics
                metrics = self.structured_metrics.calculate_assessment_metrics(gt_assessment, sys_assessment)
                
                # Add comprehensive domain-specific safety evaluation
                try:
                    gt_violations = ground_truth.get('ground_truth', {}).get('violations', [])
                    gt_accidents = ground_truth.get('ground_truth', {}).get('accidents', [])
                    
                    # Calculate comprehensive safety evaluation quality
                    safety_metrics = self.driving_safety_metrics.calculate_comprehensive_safety_metrics(
                        gt_violations, gt_accidents, sys_assessment  # Use GT data with predicted assessment
                    )
                    
                    # Add comprehensive safety metrics
                    for key, value in safety_metrics.items():
                        metrics[f'safety_{key}'] = value
                    
                except Exception as e:
                    print(f"⚠️  Comprehensive safety metrics failed: {e}")
                
                return metrics
            
            else:
                print(f"⚠️  Unknown component: {component}")
                return None
        
        except Exception as e:
            print(f"❌ Traditional metrics evaluation failed for {component}/{model}/{video_id}: {e}")
            return None
    
    def evaluate_model_component(self, component: str, model: str, 
                                overwrite: bool = False) -> Optional[Dict[str, Any]]:
        """Evaluate a single model on a component.
        
        Args:
            component: Component name
            model: Model identifier
            overwrite: Whether to overwrite cached results
            
        Returns:
            Evaluation results or None if failed
        """
        if not overwrite and self.state_manager.is_evaluation_complete(component, model):
            print(f"⏭️  {model} - using cached result")
            # Load cached results from individual video evaluations
            return self._load_cached_model_results(component, model)
        
        print(f"🔄 Evaluating {model}...")
        
        # Load ground truth files
        ground_truth_files = self.data_loader.load_ground_truth_files()
        if not ground_truth_files:
            print(f"❌ No ground truth files found")
            return None
        
        model_results = {
            'model': model,
            'component': component,
            'video_results': {},
            'traditional_metrics': [],
            'llm_judge_scores': [],
            'timing_metrics': []
        }
        
        success_count = 0
        total_count = len(ground_truth_files)
        
        for video_id, gt_data in ground_truth_files:
            try:
                # Load system output
                sys_data = self.data_loader.load_system_output(component, model, video_id)
                if not sys_data:
                    print(f"    ⚠️  No system output for {video_id}, skipping")
                    continue
                
                # Check cache first (if not overwriting)
                if not overwrite:
                    cached_traditional = self.cache.get_cached_result(component, model, video_id, "traditional")
                    cached_llm_judge = self.cache.get_cached_result(component, model, video_id, "llm_judge")
                    
                    if cached_traditional and cached_llm_judge:
                        # Extract timing from current system output even if other metrics are cached
                        generation_time = self.timing_metrics.extract_generation_time(sys_data)
                        
                        model_results['video_results'][video_id] = {
                            'traditional': cached_traditional['result'],
                            'llm_judge': cached_llm_judge['result'],
                            'timing': {
                                'generation_time': generation_time
                            }
                        }
                        
                        # Add to aggregate collections
                        if cached_traditional['result']:
                            model_results['traditional_metrics'].append(cached_traditional['result'])
                        if cached_llm_judge['result']:
                            model_results['llm_judge_scores'].append(cached_llm_judge['result'])
                        if generation_time is not None:
                            model_results['timing_metrics'].append(generation_time)
                        
                        success_count += 1
                        continue
                
                # Extract timing data
                generation_time = self.timing_metrics.extract_generation_time(sys_data)
                
                # Evaluate traditional metrics
                traditional_result = self._evaluate_traditional_metrics(
                    component, gt_data, sys_data, video_id, model
                )
                
                # Evaluate with LLM judge
                llm_judge = self._get_llm_judge()
                llm_judge_result = None
                
                if llm_judge:
                    llm_judge_result = llm_judge.evaluate_component(
                        component, gt_data, sys_data, video_id, model
                    )
                
                # Store results
                video_result = {
                    'traditional': traditional_result,
                    'llm_judge': llm_judge_result,
                    'timing': {
                        'generation_time': generation_time
                    }
                }
                
                model_results['video_results'][video_id] = video_result
                
                # Cache results
                if traditional_result:
                    self.cache.cache_result(component, model, video_id, traditional_result, "traditional")
                    model_results['traditional_metrics'].append(traditional_result)
                
                if llm_judge_result:
                    self.cache.cache_result(component, model, video_id, llm_judge_result, "llm_judge")
                    model_results['llm_judge_scores'].append(llm_judge_result)
                
                # Store timing data
                if generation_time is not None:
                    model_results['timing_metrics'].append(generation_time)
                
                success_count += 1
                
            except Exception as e:
                print(f"    ❌ Failed to evaluate {video_id}: {e}")
                continue
        
        completion_rate = success_count / total_count * 100 if total_count > 0 else 0
        print(f"    ✅ Completed: {success_count}/{total_count} videos ({completion_rate:.1f}%)")
        
        # Add timing summary to results
        if model_results['timing_metrics']:
            timing_stats = self.timing_metrics.calculate_timing_stats(model_results['timing_metrics'])
            model_results['timing_summary'] = timing_stats
            
            # Display timing summary
            mean_time = timing_stats['mean_time']
            total_time = timing_stats['total_time']
            print(f"    ⏱️  Timing: {self.timing_metrics.format_time(mean_time)} avg, {self.timing_metrics.format_time(total_time)} total")
        
        # Add detailed metrics summary display
        if model_results['traditional_metrics']:
            # Calculate average traditional metrics for display
            avg_metrics = {}
            for metrics_dict in model_results['traditional_metrics']:
                for key, value in metrics_dict.items():
                    if isinstance(value, (int, float)):
                        if key not in avg_metrics:
                            avg_metrics[key] = []
                        avg_metrics[key].append(value)
            
            # Average the metrics
            final_metrics = {k: sum(v) / len(v) for k, v in avg_metrics.items() if v}
            if final_metrics:
                traditional_display = self._format_traditional_metrics(component, final_metrics, self.detailed_metrics)
                if self.detailed_metrics:
                    print(f"    📊 Traditional Metrics:")
                    # Print detailed breakdown with indentation
                    for line in traditional_display.split('\n'):
                        print(f"    {line}")
                else:
                    print(f"    📊 Traditional: {traditional_display}")
        
        if model_results['llm_judge_scores']:
            # Calculate average LLM judge metrics for display
            avg_llm_metrics = {}
            for metrics_dict in model_results['llm_judge_scores']:
                if isinstance(metrics_dict, dict):
                    for key, value in metrics_dict.items():
                        if isinstance(value, (int, float)):
                            if key not in avg_llm_metrics:
                                avg_llm_metrics[key] = []
                            avg_llm_metrics[key].append(value)
            
            # Average the metrics
            final_llm_metrics = {k: sum(v) / len(v) for k, v in avg_llm_metrics.items() if v}
            if final_llm_metrics:
                llm_display = self._format_llm_judge_metrics(component, final_llm_metrics)
                print(f"    🤖 LLM Judge: {llm_display}")
        
        return model_results if success_count > 0 else None

    def _load_cached_model_results(self, component: str, model: str) -> Optional[Dict[str, Any]]:
        """Load cached evaluation results for a model-component pair.
        
        Args:
            component: Component name
            model: Model identifier
            
        Returns:
            Cached evaluation results in the same format as fresh evaluation
        """
        # Load ground truth files to get list of videos
        ground_truth_files = self.data_loader.load_ground_truth_files()
        if not ground_truth_files:
            return None
        
        model_results = {
            'model': model,
            'component': component,
            'video_results': {},
            'traditional_metrics': [],
            'llm_judge_scores': [],
            'timing_metrics': []
        }
        
        success_count = 0
        
        for video_id, _ in ground_truth_files:
            # Load cached results
            cached_traditional = self.cache.get_cached_result(component, model, video_id, "traditional")
            cached_llm_judge = self.cache.get_cached_result(component, model, video_id, "llm_judge")
            
            if cached_traditional and cached_llm_judge:
                # Load system output for timing data extraction
                sys_data = self.data_loader.load_system_output(component, model, video_id)
                generation_time = None
                if sys_data:
                    generation_time = self.timing_metrics.extract_generation_time(sys_data)
                    if generation_time is not None:
                        model_results['timing_metrics'].append(generation_time)
                
                model_results['video_results'][video_id] = {
                    'traditional': cached_traditional['result'],
                    'llm_judge': cached_llm_judge['result']
                }
                
                # Add to aggregate collections
                if cached_traditional['result']:
                    model_results['traditional_metrics'].append(cached_traditional['result'])
                if cached_llm_judge['result']:
                    model_results['llm_judge_scores'].append(cached_llm_judge['result'])
                
                success_count += 1
        
        if success_count == 0:
            return None
        
        # Calculate timing summary from extracted timing data
        timing_summary = None
        if model_results['timing_metrics']:
            timing_summary = self.timing_metrics.calculate_timing_stats(model_results['timing_metrics'])
            model_results['timing_summary'] = timing_summary
        
        return model_results
    
    def evaluate_component(self, component: str, overwrite: bool = False) -> Dict[str, Any]:
        """Evaluate all models for a component.
        
        Args:
            component: Component name to evaluate
            overwrite: Whether to overwrite cached results
            
        Returns:
            Dictionary of evaluation results by model
        """
        print(f"\n{'='*60}")
        print(f"EVALUATING {component.upper()} COMPONENT")
        print(f"{'='*60}")
        
        if not overwrite:
            self.display_resume_info(component)
        else:
            models = self.models_config[component]
            print(f"⚠️  Overwrite mode: re-evaluating all {len(models)} models")
        
        models = self.models_config[component]
        results = {}
        
        for i, model in enumerate(models, 1):
            print(f"\n[{i}/{len(models)}] Model: {model}")
            
            try:
                result = self.evaluate_model_component(component, model, overwrite)
                if result:
                    results[model] = result
                else:
                    print(f"❌ Failed to evaluate {model}")
                    
            except Exception as e:
                print(f"❌ Error evaluating {model}: {e}")
                continue
        
        # Summary
        successful_models = len([r for r in results.values() if r is not None])
        print(f"\n{'='*60}")
        print(f"{component.upper()} COMPONENT SUMMARY")
        print(f"{'='*60}")
        print(f"✅ Successfully evaluated: {successful_models}/{len(models)} models")
        
        if successful_models < len(models):
            failed_models = [m for m in models if m not in results or results[m] is None]
            print(f"❌ Failed models: {', '.join(failed_models)}")
        
        # Metric leaders analysis
        self._display_metric_leaders(component, results)
        
        # Timing comparison across models
        model_timings = {}
        for model, result in results.items():
            if result and result.get('timing_metrics'):
                model_timings[model] = result['timing_metrics']
        
        if model_timings:
            timing_comparison = self.timing_metrics.compare_model_timing(model_timings)
            if timing_comparison.get('fastest_model') and timing_comparison.get('slowest_model'):
                fastest = timing_comparison['fastest_model']
                slowest = timing_comparison['slowest_model']
                speed_ratio = timing_comparison['speed_ratio']
                
                print(f"\n⏱️  TIMING COMPARISON")
                print(f"🚀 Fastest: {fastest['model']} ({self.timing_metrics.format_time(fastest['mean_time'])} avg)")
                print(f"🐌 Slowest: {slowest['model']} ({self.timing_metrics.format_time(slowest['mean_time'])} avg)")
                print(f"📈 Speed ratio: {speed_ratio:.1f}x")
        
        return results

    def load_evaluation_results(self, component: str) -> Dict[str, Any]:
        """Load evaluation results from cache for a specific component.
        
        Args:
            component: Component name
            
        Returns:
            Dictionary mapping model names to their evaluation results
        """
        print(f"🔍 Loading cached evaluation results for {component} component...")
        
        models = self.models_config.get(component, [])
        ground_truth_files = self.data_loader.load_ground_truth_files()
        
        results = {}
        
        for model in models:
            print(f"  📂 Loading {model}...")
            
            # Check if this model has complete evaluation
            if not self.state_manager.is_evaluation_complete(component, model):
                print(f"    ⚠️  Incomplete evaluation - skipping")
                results[model] = None
                continue
            
            # Load all cached results for this model
            traditional_metrics = []
            llm_judge_scores = []
            timing_metrics = []
            video_results = {}
            
            for video_id, _ in ground_truth_files:
                # Load traditional metrics
                trad_result = self.cache.get_cached_result(component, model, video_id, "traditional")
                if trad_result and trad_result.get('result'):
                    traditional_metrics.append(trad_result['result'])
                
                # Load LLM judge scores
                llm_result = self.cache.get_cached_result(component, model, video_id, "llm_judge")
                if llm_result and llm_result.get('result'):
                    llm_judge_scores.append(llm_result['result'])
                
                # Store individual video results
                if trad_result or llm_result:
                    video_results[video_id] = {
                        'traditional': trad_result['result'] if trad_result else None,
                        'llm_judge': llm_result['result'] if llm_result else None
                    }
            
            if traditional_metrics or llm_judge_scores:
                # Calculate timing summary from available data
                timing_summary = None
                if timing_metrics:
                    timing_summary = self.timing_metrics.calculate_timing_stats(timing_metrics)
                
                results[model] = {
                    'traditional_metrics': traditional_metrics,
                    'llm_judge_scores': llm_judge_scores,
                    'timing_summary': timing_summary,
                    'video_results': video_results
                }
                print(f"    ✅ Loaded {len(video_results)} video evaluations")
            else:
                print(f"    ❌ No cached results found")
                results[model] = None
        
        successful_models = len([r for r in results.values() if r is not None])
        print(f"✅ Loaded results for {successful_models}/{len(models)} models")
        
        return results