"""RAGAS-based evaluation framework for DriveGuard workflow.

This module adapts RAGAS metrics for evaluating driving safety assessment quality.
"""

import json
import pandas as pd
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass

# RAGAS imports (install with: pip install ragas)
try:
    from ragas import evaluate
    from ragas.metrics import (
        answer_relevancy,
        answer_correctness,
        context_precision,
        context_recall,
        faithfulness
    )
    from datasets import Dataset
    RAGAS_AVAILABLE = True
except ImportError:
    RAGAS_AVAILABLE = False
    print("RAGAS not installed. Install with: pip install ragas")


@dataclass
class DriveGuardEvaluationSample:
    """Single evaluation sample for DriveGuard workflow.

    Pairs the ground truth for one video with the outputs the system produced
    for that same video. The ground-truth fields are required; every system
    field defaults to ``None`` until it is filled in.
    """
    # Identifier and location of the video this sample describes.
    video_id: str
    video_path: str
    # Reference ("expected") values. ``load_ground_truth`` fills these from the
    # "annotation", "scenes", "violations", "accidents" and "assessment"
    # entries of a ground-truth JSON file.
    ground_truth_annotation: str
    ground_truth_scenes: List[str]
    # The annotation template uses the keys "scene", "violation"
    # ("found"/"not_found") and "reason" for each violation entry.
    ground_truth_violations: List[Dict[str, str]]
    # The annotation template uses the keys "scene", "accident"
    # ("found"/"not_found") and "consequence" for each accident entry.
    ground_truth_accidents: List[Dict[str, str]]
    # The annotation template uses the keys "safety_score", "risk_level",
    # "overall_evaluation", "strengths", "weaknesses" and "improvement_advice".
    ground_truth_assessment: Dict[str, Any]
    
    # System outputs mirroring the ground-truth fields above. ``None`` means
    # "not available"; consumers in this module skip a sample when the system
    # output they need is missing or empty.
    system_annotation: Optional[str] = None
    system_scenes: Optional[List[str]] = None
    system_violations: Optional[List[Dict[str, str]]] = None
    system_accidents: Optional[List[Dict[str, str]]] = None
    system_assessment: Optional[Dict[str, Any]] = None


class DriveGuardEvaluationDataset:
    """In-memory collection of DriveGuard evaluation samples.

    Besides holding the samples, the class creates and saves ground-truth JSON
    templates for manual annotation, loads annotated files back into
    ``DriveGuardEvaluationSample`` objects, and converts the samples that have
    a system assessment into a RAGAS-compatible dataset.

    Signature annotations that name ``Dataset`` or
    ``DriveGuardEvaluationSample`` are written as strings so that defining
    this class never requires those names to be resolvable. ``Dataset`` in
    particular is undefined when the optional RAGAS import at the top of the
    module failed, and an unquoted annotation would then raise ``NameError``
    at class-definition time.
    """

    def __init__(self, evaluation_dir: Path):
        """Initialize an empty dataset rooted at ``evaluation_dir``.

        Args:
            evaluation_dir (Path): Directory containing evaluation data.
                Converted with ``Path(...)``, so a string is accepted too.
        """
        self.evaluation_dir = Path(evaluation_dir)
        self.samples: List[DriveGuardEvaluationSample] = []

    def create_sample_template(self, video_id: str, video_path: str) -> Dict:
        """Create a template for manual annotation.

        Args:
            video_id (str): Unique identifier for the video.
            video_path (str): Path to the video file.

        Returns:
            Dict: Template with placeholder ground truth and a description of
            the evaluation criteria. The ``annotation`` placeholder
            ``"MANUAL_ANNOTATION_REQUIRED"`` is what ``save_template`` uses to
            recognise a file that has not been edited yet.
        """
        return {
            "video_id": video_id,
            "video_path": video_path,
            "ground_truth": {
                "annotation": "MANUAL_ANNOTATION_REQUIRED",
                "scenes": [
                    "Example: Ego vehicle approaches intersection with stop sign",
                    "Example: Ego vehicle fails to come to complete stop",
                    "Example: Ego vehicle proceeds through intersection"
                ],
                "violations": [
                    {
                        "scene": "Scene description",
                        "violation": "found/not_found",
                        "reason": "Specific violation description"
                    }
                ],
                "accidents": [
                    {
                        "scene": "Scene description",
                        "accident": "found/not_found",
                        "consequence": "Potential accident description"
                    }
                ],
                "assessment": {
                    "safety_score": 6,
                    "risk_level": "medium",
                    "overall_evaluation": "Manual evaluation of driving behavior",
                    "strengths": ["List of positive behaviors"],
                    "weaknesses": ["List of problematic behaviors"],
                    "improvement_advice": ["List of recommendations"]
                }
            },
            "evaluation_criteria": {
                "annotation_quality": "How accurate is the scene description?",
                "scene_extraction": "Are all important scenes identified?",
                "violation_detection": "Are traffic violations correctly identified?",
                "accident_assessment": "Are accident risks properly evaluated?",
                "safety_scoring": "Is the safety score appropriate?",
                "advice_relevance": "Are recommendations actionable and relevant?"
            }
        }

    def save_template(self, video_id: str, video_path: str) -> Path:
        """Save a ground truth template file for manual annotation.

        An existing file is kept untouched when its ``ground_truth.annotation``
        is non-empty and differs from the placeholder. A file that is still a
        pristine template, cannot be parsed, or has an unexpected structure is
        (re)written with a fresh template.

        Args:
            video_id (str): Unique identifier for the video; also the file stem.
            video_path (str): Path to the video file.

        Returns:
            Path: Path to the template file (existing or newly written).
        """
        # Create evaluation directory if it doesn't exist
        self.evaluation_dir.mkdir(parents=True, exist_ok=True)

        template_path = self.evaluation_dir / f"{video_id}.json"

        if template_path.exists():
            try:
                with open(template_path, 'r', encoding='utf-8') as f:
                    existing_data = json.load(f)
                annotation = existing_data.get('ground_truth', {}).get('annotation', '')
            except (json.JSONDecodeError, UnicodeDecodeError, KeyError, AttributeError):
                # AttributeError covers valid JSON with the wrong shape (for
                # example a top-level list or ``"ground_truth": null``), which
                # would otherwise escape and abort instead of being recreated.
                print(f"Found corrupted ground truth file, will recreate: {template_path}")
            else:
                if annotation and annotation != "MANUAL_ANNOTATION_REQUIRED":
                    print(f"Skipping existing manually edited ground truth: {template_path}")
                    print("File contains manual annotations and will not be overwritten")
                    return template_path

        # Create and save new template
        template = self.create_sample_template(video_id, video_path)
        with open(template_path, 'w', encoding='utf-8') as f:
            json.dump(template, f, indent=2, ensure_ascii=False)

        print(f"Ground truth template saved: {template_path}")
        print("Please fill in the MANUAL_ANNOTATION_REQUIRED fields with expert annotations")

        return template_path

    def load_ground_truth(self, ground_truth_file: Path) -> "DriveGuardEvaluationSample":
        """Load ground truth data from JSON file.

        Args:
            ground_truth_file (Path): Path to ground truth JSON file.

        Returns:
            DriveGuardEvaluationSample: Sample with only the ground-truth
            fields populated; the system fields keep their ``None`` defaults.
            The sample is not added to ``self.samples``.

        Raises:
            KeyError: If a required key is missing from the file.
        """
        with open(ground_truth_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        gt = data['ground_truth']
        return DriveGuardEvaluationSample(
            video_id=data['video_id'],
            video_path=data['video_path'],
            ground_truth_annotation=gt['annotation'],
            ground_truth_scenes=gt['scenes'],
            ground_truth_violations=gt['violations'],
            ground_truth_accidents=gt['accidents'],
            ground_truth_assessment=gt['assessment']
        )

    def add_sample(self, sample: "DriveGuardEvaluationSample"):
        """Add an evaluation sample to the dataset."""
        self.samples.append(sample)

    def create_ragas_dataset(self) -> "Dataset":
        """Create a RAGAS-compatible dataset from evaluation samples.

        Samples whose ``system_assessment`` is ``None`` are skipped.

        Returns:
            Dataset: Dataset with the columns ``question``, ``answer``,
            ``contexts`` and ``ground_truth``.

        Raises:
            ImportError: If RAGAS is not installed.
        """
        if not RAGAS_AVAILABLE:
            raise ImportError("RAGAS not available. Install with: pip install ragas")

        data = {
            'question': [],
            'answer': [],
            'contexts': [],
            'ground_truth': []
        }

        for sample in self.samples:
            if sample.system_assessment is None:
                continue

            # Question: the analysis request; answer: the system's assessment;
            # contexts: the intermediate system outputs; ground truth: the
            # expected assessment in the same textual format as the answer.
            data['question'].append(
                f"Analyze the driving behavior in video {sample.video_id} and provide a safety assessment."
            )
            data['answer'].append(self._format_assessment_for_ragas(sample.system_assessment))
            data['contexts'].append(self._format_contexts_for_ragas(sample))
            data['ground_truth'].append(self._format_assessment_for_ragas(sample.ground_truth_assessment))

        return Dataset.from_dict(data)

    def _format_assessment_for_ragas(self, assessment: Dict[str, Any]) -> str:
        """Render an assessment dict as six labelled, newline-separated lines.

        Raises:
            KeyError: If any of the six expected keys is missing.
        """
        lines = [
            f"Safety Score: {assessment['safety_score']}/10",
            f"Risk Level: {assessment['risk_level']}",
            f"Overall Evaluation: {assessment['overall_evaluation']}",
            f"Strengths: {'; '.join(assessment['strengths'])}",
            f"Weaknesses: {'; '.join(assessment['weaknesses'])}",
            f"Improvement Advice: {'; '.join(assessment['improvement_advice'])}",
        ]
        return "\n".join(lines)

    def _format_contexts_for_ragas(self, sample: "DriveGuardEvaluationSample") -> List[str]:
        """Format context information for RAGAS evaluation.

        Returns one string per available system output (annotation, scenes,
        violations, accidents), in that order. Missing or empty outputs are
        omitted, so the list may be empty.
        """
        contexts = []

        if sample.system_annotation:
            contexts.append(f"Video Annotation: {sample.system_annotation}")

        if sample.system_scenes:
            contexts.append(f"Extracted Scenes: {'; '.join(sample.system_scenes)}")

        if sample.system_violations:
            violations_text = "; ".join(
                f"{v['scene']}: {v['violation']} - {v.get('reason', '')}"
                for v in sample.system_violations
            )
            contexts.append(f"Traffic Violations: {violations_text}")

        if sample.system_accidents:
            accidents_text = "; ".join(
                f"{a['scene']}: {a['accident']} - {a.get('consequence', '')}"
                for a in sample.system_accidents
            )
            contexts.append(f"Accident Risks: {accidents_text}")

        return contexts


class DriveGuardRAGASEvaluator:
    """RAGAS-based evaluator for DriveGuard workflow."""
    
    def __init__(self, dataset: DriveGuardEvaluationDataset):
        """Initialize the evaluator with a dataset."""
        self.dataset = dataset
        
        if not RAGAS_AVAILABLE:
            raise ImportError("RAGAS not available. Install with: pip install ragas")
    
    def evaluate(self, metrics: Optional[List] = None) -> Dict[str, float]:
        """Evaluate the DriveGuard workflow using RAGAS metrics.
        
        Args:
            metrics (List, optional): List of RAGAS metrics to use.
            
        Returns:
            Dict[str, float]: Evaluation results.
        """
        if metrics is None:
            metrics = [
                faithfulness,        # How faithful is the assessment to the retrieved information
                answer_relevancy,    # How relevant is the safety assessment
                answer_correctness,  # How correct is the assessment compared to ground truth
                context_precision,   # How precise is the retrieved context
                context_recall       # How complete is the retrieved context
            ]
        
        # Create RAGAS dataset
        ragas_dataset = self.dataset.create_ragas_dataset()
        
        # Run evaluation
        result = evaluate(
            dataset=ragas_dataset,
            metrics=metrics
        )
        
        return result
    
    def _analyze_individual_components(self) -> str:
        """Analyze individual components of the DriveGuard system.
        
        Returns:
            str: Formatted component analysis report.
        """
        analysis = []
        
        # Component 1: Video Annotation Analysis
        annotation_analysis = self._analyze_annotation_component()
        analysis.append(f"### 1. Video Annotation (dashcam_annotation.py)\n{annotation_analysis}")
        
        # Component 2: Scene Extraction Analysis  
        scene_analysis = self._analyze_scene_extraction_component()
        analysis.append(f"### 2. Scene Extraction (scene_extraction.py)\n{scene_analysis}")
        
        # Component 3: Traffic Rule Checker Analysis
        rule_analysis = self._analyze_traffic_rule_component()
        analysis.append(f"### 3. Traffic Rule Checker (traffic_rule_checker.py)\n{rule_analysis}")
        
        # Component 4: Accident Retriever Analysis
        accident_analysis = self._analyze_accident_retriever_component()
        analysis.append(f"### 4. Accident Retriever (traffic_accident_retriever.py)\n{accident_analysis}")
        
        # Component 5: Driving Mentor Analysis
        mentor_analysis = self._analyze_driving_mentor_component()
        analysis.append(f"### 5. Driving Mentor (driving_suggestion.py)\n{mentor_analysis}")
        
        return "\n\n".join(analysis)
    
    def _analyze_annotation_component(self) -> str:
        """Analyze video annotation component quality."""
        total_samples = len(self.dataset.samples)
        if total_samples == 0:
            return "**Status**: No data available for analysis"
        
        # Analyze annotation quality
        annotation_scores = []
        completeness_scores = []
        detail_scores = []
        safety_focus_scores = []
        issues = []
        
        for sample in self.dataset.samples:
            if sample.system_annotation and sample.ground_truth_annotation:
                sys_text = sample.system_annotation.lower()
                gt_text = sample.ground_truth_annotation.lower()
                
                sys_words = len(sample.system_annotation.split())
                gt_words = len(sample.ground_truth_annotation.split())
                
                # 1. Coverage score based on word count ratio
                coverage = min(sys_words / max(gt_words, 1), 1.0)
                annotation_scores.append(coverage)
                
                # 2. Completeness - check for key driving elements
                required_elements = ["speed", "lane", "vehicle", "traffic", "road"]
                sys_elements = sum(1 for elem in required_elements if elem in sys_text)
                gt_elements = sum(1 for elem in required_elements if elem in gt_text)
                completeness = sys_elements / max(gt_elements, 1) if gt_elements > 0 else 1.0
                completeness_scores.append(min(completeness, 1.0))
                
                # 3. Detail level - sentence complexity and specificity
                sys_sentences = sample.system_annotation.count('.') + sample.system_annotation.count('!')
                detail_level = min(sys_sentences / 10.0, 1.0)  # Normalize to expected ~10 sentences
                detail_scores.append(detail_level)
                
                # 4. Safety focus - mentions of safety-related terms
                safety_terms = ["safe", "dangerous", "violation", "risk", "hazard", "collision", "accident", "brake", "signal"]
                safety_mentions = sum(1 for term in safety_terms if term in sys_text)
                safety_focus = min(safety_mentions / 5.0, 1.0)  # Normalize to expected ~5 safety terms
                safety_focus_scores.append(safety_focus)
                
                # Issue detection
                if "speed" not in sys_text and "speed" in gt_text:
                    issues.append("Missing speed analysis")
                if "lane" not in sys_text and "lane" in gt_text:
                    issues.append("Missing lane information")
                if "pedestrian" not in sys_text and "pedestrian" in gt_text:
                    issues.append("Missing pedestrian details")
                if "signal" not in sys_text and ("light" in gt_text or "sign" in gt_text):
                    issues.append("Missing traffic signal/sign analysis")
                if sys_sentences < 3:
                    issues.append("Annotation too brief")
        
        # Calculate averages
        avg_coverage = sum(annotation_scores) / len(annotation_scores) if annotation_scores else 0
        avg_completeness = sum(completeness_scores) / len(completeness_scores) if completeness_scores else 0
        avg_detail = sum(detail_scores) / len(detail_scores) if detail_scores else 0
        avg_safety_focus = sum(safety_focus_scores) / len(safety_focus_scores) if safety_focus_scores else 0
        
        overall_score = (avg_coverage + avg_completeness + avg_detail + avg_safety_focus) / 4
        status = "🟢 Good" if overall_score >= 0.7 else "🟡 Needs Improvement" if overall_score >= 0.5 else "🔴 Poor"
        
        result = f"**Status**: {status} (Overall: {overall_score:.1%})\n"
        result += f"**Function**: Converts dashcam video to detailed driving behavior description\n"
        result += f"**Performance Metrics**:\n"
        result += f"  - Content Coverage: {avg_coverage:.1%}\n"
        result += f"  - Element Completeness: {avg_completeness:.1%}\n"  
        result += f"  - Detail Level: {avg_detail:.1%}\n"
        result += f"  - Safety Focus: {avg_safety_focus:.1%}\n"
        result += f"**Samples Analyzed**: {len(annotation_scores)}\n"
        
        if issues:
            common_issues = list(set(issues))[:4]  # Top 4 unique issues
            result += f"**Common Issues**: {', '.join(common_issues)}\n"
        
        return result
    
    def _analyze_scene_extraction_component(self) -> str:
        """Analyze scene extraction component quality."""
        total_samples = len(self.dataset.samples)
        if total_samples == 0:
            return "**Status**: No data available for analysis"
        
        extraction_scores = []
        precision_scores = []
        recall_scores = []
        specificity_scores = []
        granularity_scores = []
        coherence_scores = []
        
        for sample in self.dataset.samples:
            if sample.system_scenes and sample.ground_truth_scenes:
                sys_scenes = len(sample.system_scenes)
                gt_scenes = len(sample.ground_truth_scenes)
                
                # 1. Precision and Recall
                precision = min(sys_scenes / max(gt_scenes, 1), 1.0)
                recall = min(gt_scenes / max(sys_scenes, 1), 1.0) if sys_scenes > 0 else 0
                precision_scores.append(precision)
                recall_scores.append(recall)
                
                f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
                extraction_scores.append(f1_score)
                
                # 2. Scene specificity (how specific vs. generic are the scenes)
                sys_text = ' '.join(sample.system_scenes).lower()
                specific_terms = ['ego', 'vehicle', 'lane', 'speed', 'signal', 'pedestrian', 'intersection']
                specific_count = sum(1 for term in specific_terms if term in sys_text)
                specificity = min(specific_count / len(specific_terms), 1.0)
                specificity_scores.append(specificity)
                
                # 3. Granularity (appropriate level of detail)
                avg_scene_length = sum(len(scene.split()) for scene in sample.system_scenes) / max(sys_scenes, 1)
                granularity = min(max(avg_scene_length - 5, 0) / 10.0, 1.0)  # Target 5-15 words per scene
                granularity_scores.append(granularity)
                
                # 4. Coherence (logical consistency between scenes)
                scene_keywords = []
                for scene in sample.system_scenes:
                    keywords = ['ego', 'vehicle', 'lane', 'traffic', 'road', 'speed']
                    scene_keywords.append(sum(1 for kw in keywords if kw in scene.lower()))
                
                coherence = 1.0 if len(set(scene_keywords)) <= 2 else 0.5  # Consistent keyword usage
                coherence_scores.append(coherence)
        
        # Calculate averages
        avg_f1 = sum(extraction_scores) / len(extraction_scores) if extraction_scores else 0
        avg_precision = sum(precision_scores) / len(precision_scores) if precision_scores else 0
        avg_recall = sum(recall_scores) / len(recall_scores) if recall_scores else 0
        avg_specificity = sum(specificity_scores) / len(specificity_scores) if specificity_scores else 0
        avg_granularity = sum(granularity_scores) / len(granularity_scores) if granularity_scores else 0
        avg_coherence = sum(coherence_scores) / len(coherence_scores) if coherence_scores else 0
        
        overall_score = (avg_f1 + avg_specificity + avg_granularity + avg_coherence) / 4
        status = "🟢 Good" if overall_score >= 0.7 else "🟡 Needs Improvement" if overall_score >= 0.5 else "🔴 Poor"
        
        result = f"**Status**: {status} (Overall: {overall_score:.1%})\n"
        result += f"**Function**: Extracts discrete traffic scenes from complex video annotations\n"
        result += f"**Performance Metrics**:\n"
        result += f"  - F1 Score: {avg_f1:.1%}\n"
        result += f"  - Precision: {avg_precision:.1%}\n"
        result += f"  - Recall: {avg_recall:.1%}\n"
        result += f"  - Scene Specificity: {avg_specificity:.1%}\n"
        result += f"  - Granularity: {avg_granularity:.1%}\n"
        result += f"  - Coherence: {avg_coherence:.1%}\n"
        result += f"**Samples Processed**: {len(extraction_scores)}\n"
        
        return result
    
    def _analyze_traffic_rule_component(self) -> str:
        """Analyze traffic rule checker component."""
        total_samples = len(self.dataset.samples)
        if total_samples == 0:
            return "**Status**: No data available for analysis"
        
        violation_accuracy = []
        found_violations = 0
        missed_violations = 0
        false_positives = 0
        true_positives = 0
        precision_scores = []
        recall_scores = []
        reasoning_quality_scores = []
        
        for sample in self.dataset.samples:
            if sample.system_violations and sample.ground_truth_violations:
                # Compare violation detection
                sys_found = sum(1 for v in sample.system_violations if v.get('violation') == 'found')
                gt_found = sum(1 for v in sample.ground_truth_violations if v.get('violation') == 'found')
                sys_not_found = sum(1 for v in sample.system_violations if v.get('violation') == 'not_found')
                gt_not_found = sum(1 for v in sample.ground_truth_violations if v.get('violation') == 'not_found')
                
                # Calculate confusion matrix elements
                tp = min(sys_found, gt_found)  # True positives
                fp = max(0, sys_found - gt_found)  # False positives
                fn = max(0, gt_found - sys_found)  # False negatives
                
                true_positives += tp
                found_violations += sys_found
                false_positives += fp
                missed_violations += fn
                
                # Precision and Recall
                precision = tp / max(sys_found, 1) if sys_found > 0 else 1.0
                recall = tp / max(gt_found, 1) if gt_found > 0 else 1.0
                precision_scores.append(precision)
                recall_scores.append(recall)
                
                # Overall accuracy
                accuracy = 1.0 if sys_found == gt_found else max(0.0, 1.0 - abs(sys_found - gt_found) / max(gt_found, 1))
                violation_accuracy.append(accuracy)
                
                # Reasoning quality (check if reasons are provided when violations found)
                sys_violations_with_reasons = [v for v in sample.system_violations 
                                             if v.get('violation') == 'found' and v.get('reason') 
                                             and len(v.get('reason', '').split()) >= 5]
                reasoning_quality = len(sys_violations_with_reasons) / max(sys_found, 1) if sys_found > 0 else 1.0
                reasoning_quality_scores.append(reasoning_quality)
        
        # Calculate metrics
        avg_accuracy = sum(violation_accuracy) / len(violation_accuracy) if violation_accuracy else 0
        avg_precision = sum(precision_scores) / len(precision_scores) if precision_scores else 0
        avg_recall = sum(recall_scores) / len(recall_scores) if recall_scores else 0
        avg_reasoning = sum(reasoning_quality_scores) / len(reasoning_quality_scores) if reasoning_quality_scores else 0
        
        f1_score = 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall) if (avg_precision + avg_recall) > 0 else 0
        
        overall_score = (avg_accuracy + f1_score + avg_reasoning) / 3
        status = "🟢 Good" if overall_score >= 0.8 else "🟡 Needs Improvement" if overall_score >= 0.6 else "🔴 Poor"
        
        result = f"**Status**: {status} (Overall: {overall_score:.1%})\n"
        result += f"**Function**: Identifies traffic rule violations in driving scenes\n"
        result += f"**Performance Metrics**:\n"
        result += f"  - Accuracy: {avg_accuracy:.1%}\n"
        result += f"  - Precision: {avg_precision:.1%}\n"
        result += f"  - Recall: {avg_recall:.1%}\n"
        result += f"  - F1 Score: {f1_score:.1%}\n"
        result += f"  - Reasoning Quality: {avg_reasoning:.1%}\n"
        result += f"**Detection Summary**: {true_positives} correct, {false_positives} false alarms, {missed_violations} missed\n"
        
        return result
    
    def _analyze_accident_retriever_component(self) -> str:
        """Analyze accident retriever component."""
        total_samples = len(self.dataset.samples)
        if total_samples == 0:
            return "**Status**: No data available for analysis"
        
        relevance_scores = []
        coverage_scores = []
        specificity_scores = []
        context_quality_scores = []
        
        for sample in self.dataset.samples:
            if sample.system_accidents and sample.ground_truth_accidents:
                sys_text = str(sample.system_accidents).lower()
                gt_text = str(sample.ground_truth_accidents).lower()
                
                # 1. Content relevance (keyword overlap)
                sys_words = set(sys_text.split())
                gt_words = set(gt_text.split())
                relevance = len(sys_words & gt_words) / len(gt_words) if len(gt_words) > 0 else 0
                relevance_scores.append(relevance)
                
                # 2. Coverage (how much of expected content is covered)
                important_terms = ['accident', 'collision', 'crash', 'risk', 'hazard', 'danger', 'injury']
                sys_important = sum(1 for term in important_terms if term in sys_text)
                gt_important = sum(1 for term in important_terms if term in gt_text)
                coverage = sys_important / max(gt_important, 1) if gt_important > 0 else 1.0
                coverage_scores.append(min(coverage, 1.0))
                
                # 3. Specificity (specific vs generic accident descriptions)
                specific_terms = ['vehicle', 'lane', 'speed', 'intersection', 'merge', 'pedestrian', 'signal']
                specificity_count = sum(1 for term in specific_terms if term in sys_text)
                specificity = min(specificity_count / 4.0, 1.0)  # Expect at least 4 specific terms
                specificity_scores.append(specificity)
                
                # 4. Context quality (actionable vs vague descriptions)
                sys_length = len(sys_text.split())
                context_quality = min(sys_length / 50.0, 1.0)  # Expect ~50 words for good context
                context_quality_scores.append(context_quality)
        
        # Calculate averages
        avg_relevance = sum(relevance_scores) / len(relevance_scores) if relevance_scores else 0
        avg_coverage = sum(coverage_scores) / len(coverage_scores) if coverage_scores else 0
        avg_specificity = sum(specificity_scores) / len(specificity_scores) if specificity_scores else 0
        avg_context_quality = sum(context_quality_scores) / len(context_quality_scores) if context_quality_scores else 0
        
        overall_score = (avg_relevance + avg_coverage + avg_specificity + avg_context_quality) / 4
        status = "🟢 Good" if overall_score >= 0.6 else "🟡 Needs Improvement" if overall_score >= 0.4 else "🔴 Poor"
        
        result = f"**Status**: {status} (Overall: {overall_score:.1%})\n"
        result += f"**Function**: Retrieves relevant accident scenarios for risk assessment\n"
        result += f"**Performance Metrics**:\n"
        result += f"  - Content Relevance: {avg_relevance:.1%}\n"
        result += f"  - Topic Coverage: {avg_coverage:.1%}\n"
        result += f"  - Specificity: {avg_specificity:.1%}\n"
        result += f"  - Context Quality: {avg_context_quality:.1%}\n"
        result += f"**Retrievals Analyzed**: {len(relevance_scores)}\n"
        
        return result
    
    def _analyze_driving_mentor_component(self) -> str:
        """Analyze driving mentor component."""
        total_samples = len(self.dataset.samples)
        if total_samples == 0:
            return "**Status**: No data available for analysis"
        
        safety_score_agreement = []
        risk_level_agreement = []
        completeness_scores = []
        actionability_scores = []
        consistency_scores = []
        
        for sample in self.dataset.samples:
            if sample.system_assessment and sample.ground_truth_assessment:
                # 1. Safety score agreement
                sys_score = sample.system_assessment.get('safety_score', 5)
                gt_score = sample.ground_truth_assessment.get('safety_score', 5)
                score_diff = abs(sys_score - gt_score) / 10.0
                safety_score_agreement.append(1.0 - score_diff)
                
                # 2. Risk level agreement
                sys_risk = sample.system_assessment.get('risk_level', 'medium')
                gt_risk = sample.ground_truth_assessment.get('risk_level', 'medium')
                risk_agreement = 1.0 if sys_risk == gt_risk else 0.5 if abs(['low', 'medium', 'high', 'critical'].index(sys_risk) - ['low', 'medium', 'high', 'critical'].index(gt_risk)) == 1 else 0.0
                risk_level_agreement.append(risk_agreement)
                
                # 3. Assessment completeness (presence of key elements)
                required_fields = ['strengths', 'weaknesses', 'improvement_advice']
                sys_fields = sum(1 for field in required_fields if sample.system_assessment.get(field))
                completeness = sys_fields / len(required_fields)
                completeness_scores.append(completeness)
                
                # 4. Actionability (quality of improvement advice)
                advice = sample.system_assessment.get('improvement_advice', [])
                actionable_count = sum(1 for item in advice if len(str(item).split()) >= 5)  # At least 5 words
                actionability = min(actionable_count / 3.0, 1.0) if advice else 0.0  # Expect ~3 actionable items
                actionability_scores.append(actionability)
                
                # 5. Internal consistency (weaknesses align with low safety score)
                weaknesses = sample.system_assessment.get('weaknesses', [])
                if sys_score <= 5 and len(weaknesses) >= 2:
                    consistency = 1.0
                elif sys_score > 7 and len(weaknesses) <= 1:
                    consistency = 1.0
                else:
                    consistency = 0.5
                consistency_scores.append(consistency)
        
        # Calculate averages
        avg_safety_agreement = sum(safety_score_agreement) / len(safety_score_agreement) if safety_score_agreement else 0
        avg_risk_agreement = sum(risk_level_agreement) / len(risk_level_agreement) if risk_level_agreement else 0
        avg_completeness = sum(completeness_scores) / len(completeness_scores) if completeness_scores else 0
        avg_actionability = sum(actionability_scores) / len(actionability_scores) if actionability_scores else 0
        avg_consistency = sum(consistency_scores) / len(consistency_scores) if consistency_scores else 0
        
        overall_score = (avg_safety_agreement + avg_risk_agreement + avg_completeness + avg_actionability + avg_consistency) / 5
        status = "🟢 Good" if overall_score >= 0.8 else "🟡 Needs Improvement" if overall_score >= 0.6 else "🔴 Poor"
        
        # Calculate average score difference for display
        avg_score_diff = sum(abs(sample.system_assessment.get('safety_score', 5) - sample.ground_truth_assessment.get('safety_score', 5)) 
                           for sample in self.dataset.samples 
                           if sample.system_assessment and sample.ground_truth_assessment) / max(total_samples, 1)
        
        result = f"**Status**: {status} (Overall: {overall_score:.1%})\n"
        result += f"**Function**: Synthesizes analysis into comprehensive safety assessment\n"
        result += f"**Performance Metrics**:\n"
        result += f"  - Safety Score Agreement: {avg_safety_agreement:.1%}\n"
        result += f"  - Risk Level Agreement: {avg_risk_agreement:.1%}\n"
        result += f"  - Assessment Completeness: {avg_completeness:.1%}\n"
        result += f"  - Advice Actionability: {avg_actionability:.1%}\n"
        result += f"  - Internal Consistency: {avg_consistency:.1%}\n"
        result += f"**Avg Score Difference**: {avg_score_diff:.1f}/10 points\n"
        
        return result
    
    def generate_report(self, results: Dict[str, float], output_path: Optional[Path] = None) -> str:
        """Generate an evaluation report.
        
        Args:
            results (Dict[str, float]): Evaluation results from RAGAS.
            output_path (Path, optional): Path to save the report.
            
        Returns:
            str: Formatted evaluation report.
        """
        # Generate individual component analysis
        component_analysis = self._analyze_individual_components()
        
        report = f"""
# DriveGuard Workflow Evaluation Report

## RAGAS Evaluation Results

### Overall Metrics
- **Faithfulness**: {results.get('faithfulness', 'N/A'):.3f}
  - Measures how grounded the safety assessment is in the retrieved context
- **Answer Relevancy**: {results.get('answer_relevancy', 'N/A'):.3f}
  - Measures how relevant the safety assessment is to the driving analysis
- **Answer Correctness**: {results.get('answer_correctness', 'N/A'):.3f}
  - Measures how correct the assessment is compared to ground truth
- **Context Precision**: {results.get('context_precision', 'N/A'):.3f}
  - Measures precision of retrieved driving scenes and analysis
- **Context Recall**: {results.get('context_recall', 'N/A'):.3f}
  - Measures completeness of retrieved driving information

## Individual Component Analysis

{component_analysis}

### Dataset Statistics
- **Total Samples**: {len(self.dataset.samples)}
- **Evaluation Date**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}

### Recommendations
Based on the evaluation results:
"""
        
        # Add recommendations based on scores
        if results.get('faithfulness', 0) < 0.7:
            report += "- **Improve Faithfulness**: The system may be hallucinating or not grounding assessments properly in the video analysis.\n"
        
        if results.get('answer_relevancy', 0) < 0.7:
            report += "- **Improve Relevancy**: Safety assessments may be too generic or not specific to the driving scenarios.\n"
        
        if results.get('answer_correctness', 0) < 0.7:
            report += "- **Improve Accuracy**: System assessments don't align well with expert evaluations.\n"
        
        if results.get('context_precision', 0) < 0.7:
            report += "- **Improve Context Quality**: Scene extraction and analysis may include irrelevant information.\n"
        
        if results.get('context_recall', 0) < 0.7:
            report += "- **Improve Context Completeness**: Important driving behaviors or risks may be missed.\n"
        
        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(report)
            print(f"Evaluation report saved to: {output_path}")
        
        return report


# Convenience functions for easy usage
def create_evaluation_templates(video_paths: List[str], evaluation_dir: str = "data/evaluation/ground_truth"):
    """Save one ground-truth template per video and report where each went.

    Videos receive sequential identifiers (``video_000``, ``video_001``, ...)
    following their order in ``video_paths``. Each template is produced by
    ``DriveGuardEvaluationDataset.save_template`` and the returned location
    is printed.

    Args:
        video_paths (List[str]): List of video file paths.
        evaluation_dir (str): Directory the evaluation dataset is created
            with (intended as the place templates are saved).
    """
    template_store = DriveGuardEvaluationDataset(Path(evaluation_dir))

    for position, video_path in enumerate(video_paths):
        # Zero-padded to three digits so identifiers sort in input order.
        video_id = "video_" + format(position, "03d")
        saved_to = template_store.save_template(video_id, video_path)
        print(f"Created template for {video_path}: {saved_to}")


def run_ragas_evaluation(ground_truth_dir: str, system_outputs_dir: str) -> Dict[str, float]:
    """Evaluate the workflow with RAGAS, then write and print the report.

    A dataset is created for ``ground_truth_dir``, handed to a
    ``DriveGuardRAGASEvaluator``, and the resulting report is saved to
    ``data/evaluation/report/evaluation_report.md`` and echoed to stdout.

    NOTE(review): loading of ground truth and system outputs is still a
    placeholder here, and ``system_outputs_dir`` is not referenced by this
    function yet.

    Args:
        ground_truth_dir (str): Directory containing ground truth files.
        system_outputs_dir (str): Directory containing system output files.

    Returns:
        Dict[str, float]: Evaluation results.
    """
    # Load ground truth and system outputs
    # (This would need to be implemented based on your file structure)
    evaluator = DriveGuardRAGASEvaluator(
        DriveGuardEvaluationDataset(Path(ground_truth_dir))
    )
    scores = evaluator.evaluate()

    # Persist the report to its fixed location, then show it on stdout.
    report_file = Path("data/evaluation/report/evaluation_report.md")
    print(evaluator.generate_report(scores, report_file))

    return scores