"""Run RAGAS evaluation for DriveGuard workflow."""

import sys
import json
import hashlib
import time
from pathlib import Path

# Add project root to path
root = Path(__file__).parent.parent
sys.path.append(str(root))

from evaluation.ragas_evaluation_setup import DriveGuardRAGASEvaluator
import importlib.util
import os

# Import from numbered file using importlib.
# The sibling script is named "1_prepare_evaluation_data.py"; a module name that
# starts with a digit is not a valid identifier for a regular `import` statement,
# so the module is loaded from its file path instead.
spec = importlib.util.spec_from_file_location("prepare_evaluation_data", 
                                             os.path.join(os.path.dirname(__file__), "1_prepare_evaluation_data.py"))
prepare_evaluation_data = importlib.util.module_from_spec(spec)
# exec_module runs the script's top-level code to populate the module object.
spec.loader.exec_module(prepare_evaluation_data)
# Re-export the one helper this runner needs under a plain module-level name.
load_evaluation_dataset = prepare_evaluation_data.load_evaluation_dataset
from src.utils.log import logger


def get_sample_hash(sample) -> str:
    """Fingerprint one evaluation sample so unchanged samples can be cached.

    Args:
        sample: DriveGuardEvaluationSample object.

    Returns:
        str: First 16 hex characters of the SHA256 digest computed over the
            sample's ground-truth and system-output fields.
    """
    hashed_fields = (
        'video_id',
        'ground_truth_annotation',
        'ground_truth_scenes',
        'ground_truth_violations',
        'ground_truth_accidents',
        'ground_truth_assessment',
        'system_annotation',
        'system_scenes',
        'system_violations',
        'system_accidents',
        'system_assessment',
    )
    payload = {name: getattr(sample, name) for name in hashed_fields}

    # sort_keys keeps the serialization independent of key order; default=str
    # is the fallback for values the json encoder cannot handle natively.
    serialized = json.dumps(payload, sort_keys=True, ensure_ascii=False, default=str)
    digest = hashlib.sha256(serialized.encode('utf-8'))
    return digest.hexdigest()[:16]


def get_dataset_hash(dataset) -> str:
    """Fingerprint a whole dataset from the fingerprints of its samples.

    Args:
        dataset: Evaluation dataset object exposing a ``samples`` iterable.

    Returns:
        str: First 16 hex characters of the SHA256 digest of the sorted,
            "|"-joined "<video_id>:<sample hash>" entries.
    """
    # Sorting makes the result independent of the order samples were loaded in.
    tagged_hashes = sorted(
        f"{sample.video_id}:{get_sample_hash(sample)}" for sample in dataset.samples
    )
    joined = "|".join(tagged_hashes)
    return hashlib.sha256(joined.encode('utf-8')).hexdigest()[:16]


def load_evaluation_cache(cache_dir: Path) -> dict:
    """Load RAGAS evaluation cache from disk.

    The cache is only an optimization, so any problem reading it (missing,
    unreadable, not valid UTF-8, not valid JSON, or not a JSON object) is
    logged and answered with a fresh, empty cache instead of an exception.

    Args:
        cache_dir (Path): Directory containing cache files.

    Returns:
        dict: Cache data with dataset hashes and results. A fresh cache has
            the keys "created_at", "evaluations" and "samples".
    """
    cache_file = cache_dir / "ragas_evaluation_cache.json"
    if cache_file.exists():
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                cached = json.load(f)
        except (OSError, ValueError):
            # OSError covers a vanished/unreadable file; ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError.
            logger.warning(f"Invalid cache file {cache_file}, starting fresh")
        else:
            if isinstance(cached, dict):
                return cached
            # Valid JSON but not an object: callers rely on dict methods.
            logger.warning(f"Invalid cache file {cache_file}, starting fresh")

    return {
        "created_at": time.time(),
        "evaluations": {},
        "samples": {}  # Cache for individual samples
    }


def get_cached_sample_result(cache: dict, video_id: str, sample_hash: str):
    """Look up the cached evaluation result of one sample.

    Args:
        cache (dict): Cache data.
        video_id (str): Video ID of the sample.
        sample_hash (str): Hash of the sample content.

    Returns:
        dict: The stored "ragas_data" when an entry for ``video_id`` exists and
            its stored hash equals ``sample_hash``; None otherwise.
    """
    try:
        entry = cache.get("samples", {})[video_id]
    except KeyError:
        return None

    # A different hash means the sample changed since it was evaluated.
    if entry.get("sample_hash") != sample_hash:
        return None
    return entry.get("ragas_data")


def create_partial_dataset_from_cache_and_new_samples(dataset, cache: dict, cache_dir: Path):
    """Split a dataset into cache-served samples and samples still to evaluate.

    Args:
        dataset: Full evaluation dataset.
        cache (dict): Cache data.
        cache_dir (Path): Cache directory (accepted but not used here).

    Returns:
        tuple: (new_samples_dataset, cached_results, sample_mapping) where
            new_samples_dataset holds only the uncached samples,
            cached_results maps video_id -> cached result, and
            sample_mapping maps index in new_samples_dataset -> video_id.
    """
    pending = []
    reused = {}
    hit_count = 0

    for candidate in dataset.samples:
        fingerprint = get_sample_hash(candidate)
        hit = get_cached_sample_result(cache, candidate.video_id, fingerprint)

        if hit is None:
            # No usable cache entry: this sample has to be evaluated.
            pending.append(candidate)
            logger.debug(f"Cache miss for {candidate.video_id}")
            continue

        reused[candidate.video_id] = hit
        hit_count += 1
        logger.debug(f"Cache hit for {candidate.video_id}")

    logger.info(f"Cache analysis: {hit_count} hits, {len(pending)} misses")

    # Build a dataset that contains only the samples that need evaluation.
    from evaluation.ragas_evaluation_setup import DriveGuardEvaluationDataset
    subset = DriveGuardEvaluationDataset(dataset.evaluation_dir)
    subset.samples = pending

    index_to_video_id = {
        position: item.video_id for position, item in enumerate(pending)
    }
    return subset, reused, index_to_video_id


def cache_sample_result(cache: dict, video_id: str, sample_hash: str, ragas_data: dict) -> None:
    """Store (or overwrite) the evaluation result of one sample in the cache.

    Args:
        cache (dict): Cache data; modified in place.
        video_id (str): Video ID of the sample.
        sample_hash (str): Hash of the sample content.
        ragas_data (dict): RAGAS evaluation data for the sample.
    """
    entry = {
        "sample_hash": sample_hash,
        "ragas_data": ragas_data,
        "evaluated_at": time.time(),
    }
    # Create the per-sample section on first use.
    cache.setdefault("samples", {})[video_id] = entry


def combine_cached_results(sample_results: dict) -> dict:
    """Combine individual sample results into aggregate metrics.

    Each metric is averaged over the samples that have a numeric (int/float)
    value for it; non-numeric values are skipped. NaN scores are left out of
    the mean so that one failed sample does not turn the whole metric into
    NaN. A metric whose numeric values are all NaN is reported as NaN, so the
    metric key stays present in the result.

    Args:
        sample_results (dict): Dictionary of video_id -> sample RAGAS results.

    Returns:
        dict: Aggregated RAGAS metrics (metric name -> mean value). Empty when
            ``sample_results`` is empty.
    """
    import math  # local import keeps this block self-contained

    if not sample_results:
        return {}

    # metric name -> usable (non-NaN) values; the key is registered even when
    # a value is NaN so all-NaN metrics are not silently dropped.
    usable_values = {}
    for sample_result in sample_results.values():
        for metric, value in sample_result.items():
            if not isinstance(value, (int, float)):
                continue
            bucket = usable_values.setdefault(metric, [])
            if isinstance(value, float) and math.isnan(value):
                continue
            bucket.append(value)

    return {
        metric: (sum(values) / len(values)) if values else float("nan")
        for metric, values in usable_values.items()
    }


def save_evaluation_cache(cache: dict, cache_dir: Path) -> None:
    """Save RAGAS evaluation cache to disk.

    Saving is best-effort: any failure is logged as a warning and not raised.
    The cache is serialized completely before anything is written, then
    written to a temporary file that is renamed over the real cache file, so
    a failed save leaves the previous cache file untouched instead of
    truncated.

    Args:
        cache (dict): Cache data to save. Its "updated_at" key is set to the
            current time as a side effect.
        cache_dir (Path): Directory to save cache files.
    """
    cache_file = cache_dir / "ragas_evaluation_cache.json"
    tmp_file = cache_dir / "ragas_evaluation_cache.json.tmp"

    cache["updated_at"] = time.time()

    try:
        cache_dir.mkdir(parents=True, exist_ok=True)

        # Serialize first: a non-serializable value must fail before any file
        # is opened for writing.
        payload = json.dumps(cache, indent=2, ensure_ascii=False)

        with open(tmp_file, 'w', encoding='utf-8') as f:
            f.write(payload)
        # Swap the finished file into place in a single rename.
        tmp_file.replace(cache_file)
        logger.debug(f"Evaluation cache saved to {cache_file}")
    except Exception as e:
        logger.warning(f"Failed to save evaluation cache: {e}")


def get_cached_results(cache: dict, dataset_hash: str) -> tuple:
    """Fetch the cached full-dataset evaluation for a dataset hash.

    Args:
        cache (dict): Cache data.
        dataset_hash (str): Hash of current dataset.

    Returns:
        tuple: (results_dict, report_content) from the cached entry, or
            (None, None) when no entry exists for ``dataset_hash``.
    """
    try:
        entry = cache.get("evaluations", {})[dataset_hash]
    except KeyError:
        return None, None

    results = entry.get("results_dict")
    report = entry.get("report_content")
    return results, report


def cache_evaluation_results(cache: dict, dataset_hash: str, results_dict: dict, report_content: str) -> None:
    """Store (or overwrite) the full-dataset evaluation for a dataset hash.

    Args:
        cache (dict): Cache data; modified in place.
        dataset_hash (str): Hash of dataset.
        results_dict (dict): Evaluation results.
        report_content (str): Generated report content.
    """
    entry = {
        "results_dict": results_dict,
        "report_content": report_content,
        "evaluated_at": time.time(),
    }
    # Create the full-evaluation section on first use.
    cache.setdefault("evaluations", {})[dataset_hash] = entry


def _extract_sample_results(evaluation_result, sample_mapping: dict) -> dict:
    """Turn an evaluation result into {video_id: {metric column: value}}.

    Rows are taken from ``evaluation_result.to_pandas()``; the input/output
    text columns are dropped so only metric columns remain. Returns an empty
    dict when the result object has no ``to_pandas`` method.
    """
    if not hasattr(evaluation_result, 'to_pandas'):
        return {}

    non_metric_columns = ('user_input', 'response', 'retrieved_contexts', 'reference')
    df = evaluation_result.to_pandas()
    metric_columns = [col for col in df.columns if col not in non_metric_columns]

    per_sample = {}
    for idx, row in df.iterrows():
        # sample_mapping translates the row index back to the video_id.
        per_sample[sample_mapping[idx]] = {col: row[col] for col in metric_columns}
    return per_sample


def _finite_scores(results_dict: dict) -> list:
    """Return the int/float values of ``results_dict`` that are not NaN."""
    import math  # local import keeps this block self-contained

    return [
        value for value in results_dict.values()
        if isinstance(value, (int, float))
        and not (isinstance(value, float) and math.isnan(value))
    ]


def run_evaluation():
    """Run the complete RAGAS evaluation for DriveGuard.

    Loads the evaluation dataset and the on-disk cache, then takes one of
    three paths:

    1. Every sample is cached and a full result (including its report) is
       cached for the current dataset hash: reuse it and rewrite the report.
    2. Every sample is cached but no usable full result exists: rebuild the
       aggregate metrics from the cached sample results.
    3. Some samples are new or changed: evaluate only those, merge them with
       the cached sample results, and cache everything.

    Finally logs the metrics, cache usage and an overall rating. Errors are
    logged, not raised; the function returns None in every case.
    """

    logger.info("="*60)
    logger.info("STARTING DRIVEGUARD RAGAS EVALUATION")
    logger.info("="*60)

    # Configuration
    ground_truth_dir = root / "data" / "evaluation" / "ground_truth"
    system_output_dir = root / "data" / "evaluation" / "system_outputs"
    report_path = root / "data" / "evaluation" / "report" / "evaluation_report.md"
    cache_dir = root / "data" / "evaluation" / "cache"

    # Setup caching
    cache = load_evaluation_cache(cache_dir)
    cached_samples = len(cache.get('samples', {}))
    cached_evaluations = len(cache.get('evaluations', {}))
    logger.info(f"Loaded cache: {cached_samples} sample results, {cached_evaluations} full evaluations")

    try:
        # Load evaluation dataset
        logger.info("Loading evaluation dataset...")
        dataset = load_evaluation_dataset(ground_truth_dir, system_output_dir)

        if len(dataset.samples) == 0:
            logger.error("No evaluation samples found!")
            logger.error("Please ensure:")
            logger.error("1. Ground truth templates are completed")
            logger.error("2. System outputs are generated")
            logger.error("3. Files are properly matched")
            return

        logger.info(f"Loaded {len(dataset.samples)} evaluation samples")

        # Create partial dataset with cache analysis
        new_dataset, cached_sample_results, sample_mapping = create_partial_dataset_from_cache_and_new_samples(
            dataset, cache, cache_dir
        )
        all_samples_cached = len(new_dataset.samples) == 0

        # Check if we can use full cached results
        dataset_hash = get_dataset_hash(dataset)
        cached_results, cached_report = get_cached_results(cache, dataset_hash)

        # A cached entry without report text cannot be written back to disk,
        # so it is treated like a missing entry and rebuilt below.
        if all_samples_cached and cached_results is not None and cached_report is not None:
            # All samples are cached and full dataset result is available
            logger.info("⚡ Using fully cached evaluation results (no changes)")
            results_dict = cached_results
            report_content = cached_report

            # Write cached report to disk
            report_path.parent.mkdir(parents=True, exist_ok=True)
            with open(report_path, 'w', encoding='utf-8') as f:
                f.write(report_content)
        elif all_samples_cached:
            # All samples cached but need to rebuild aggregate results
            logger.info("⚡ All samples cached - rebuilding aggregate results")
            results_dict = combine_cached_results(cached_sample_results)

            # Generate report from combined results
            evaluator = DriveGuardRAGASEvaluator(dataset)
            report_content = evaluator.generate_report(results_dict, report_path)

            # Cache the full dataset results
            cache_evaluation_results(cache, dataset_hash, results_dict, report_content)
            save_evaluation_cache(cache, cache_dir)
        else:
            # Some samples need evaluation
            logger.info(f"🔥 Evaluating {len(new_dataset.samples)} new/changed samples")
            logger.info(f"⚡ Using cached results for {len(cached_sample_results)} unchanged samples")

            # Initialize RAGAS evaluator for new samples only
            logger.info("Initializing RAGAS evaluator...")
            evaluator = DriveGuardRAGASEvaluator(new_dataset)

            # Run evaluation on new samples only
            logger.info("Running RAGAS evaluation...")
            new_results = evaluator.evaluate()

            # Convert EvaluationResult to dictionary for new samples
            new_results_dict = _extract_sample_results(new_results, sample_mapping)

            # Cache new sample results
            for sample in new_dataset.samples:
                sample_hash = get_sample_hash(sample)
                if sample.video_id in new_results_dict:
                    cache_sample_result(cache, sample.video_id, sample_hash, new_results_dict[sample.video_id])

            # Persist the fresh per-sample results right away so they are not
            # lost if report generation below fails.
            save_evaluation_cache(cache, cache_dir)

            # Combine cached and new results
            all_sample_results = {**cached_sample_results, **new_results_dict}
            results_dict = combine_cached_results(all_sample_results)

            # Generate report from combined results
            evaluator_full = DriveGuardRAGASEvaluator(dataset)
            report_content = evaluator_full.generate_report(results_dict, report_path)

            # Cache the full dataset results
            cache_evaluation_results(cache, dataset_hash, results_dict, report_content)
            save_evaluation_cache(cache, cache_dir)
            logger.info("💾 New evaluation results cached")

        logger.info("\n" + "="*60)
        logger.info("EVALUATION RESULTS")
        logger.info("="*60)

        # Display key metrics
        for metric, score in results_dict.items():
            if isinstance(score, (int, float)):
                logger.info(f"{metric.title().replace('_', ' ')}: {score:.3f}")

        logger.info(f"\nDetailed report saved to: {report_path}")

        # Display cache info
        total_samples = len(dataset.samples)
        cached_count = len(cached_sample_results)
        new_count = len(new_dataset.samples)

        if new_count == 0:
            logger.info("⚡ Used fully cached results - evaluation completed instantly")
        elif cached_count > 0:
            logger.info(f"🚀 Hybrid evaluation: {cached_count}/{total_samples} samples from cache, {new_count} newly evaluated")
            efficiency = (cached_count / total_samples) * 100
            logger.info(f"💾 Cache efficiency: {efficiency:.1f}% - saved {cached_count} evaluations")
        else:
            logger.info("🔥 Full evaluation completed - all results cached for future runs")

        # Display summary. Only non-NaN numeric scores enter the average, and
        # an empty list is handled explicitly to avoid dividing by zero.
        numeric_scores = _finite_scores(results_dict)
        if numeric_scores:
            avg_score = sum(numeric_scores) / len(numeric_scores)
            logger.info(f"\nOverall Average Score: {avg_score:.3f}")

            if avg_score >= 0.8:
                logger.info("\u2705 EXCELLENT: System performance is very good")
            elif avg_score >= 0.7:
                logger.info("\u2705 GOOD: System performance is acceptable")
            elif avg_score >= 0.6:
                logger.info("\u26a0\ufe0f FAIR: System needs improvement")
            else:
                logger.info("\u274c POOR: System requires significant improvement")
        else:
            logger.warning("No numeric results to display")

    except ImportError as e:
        logger.error("RAGAS not installed. Please install with:")
        logger.error("pip install ragas")
        logger.error(f"Error: {e}")
    except Exception as e:
        logger.error(f"Evaluation failed: {e}")
        logger.error("Please check your ground truth annotations and system outputs")


def analyze_results(report_path: Path):
    """Analyze and display key insights from evaluation results.

    Reads the report file and logs one warning for each known recommendation
    heading found in its text (plain substring match).

    Args:
        report_path (Path): Path of the generated evaluation report.

    A missing report file is logged as an error and not raised.
    """

    # (heading searched for in the report, warning logged when it is present)
    known_issues = (
        ("Improve Faithfulness",
         "⚠️  Issue: System may be hallucinating or not grounding assessments properly"),
        ("Improve Relevancy",
         "⚠️  Issue: Safety assessments may be too generic"),
        ("Improve Accuracy",
         "⚠️  Issue: System assessments don't align with expert evaluations"),
        ("Improve Context Quality",
         "⚠️  Issue: Scene extraction may include irrelevant information"),
        ("Improve Context Completeness",
         "⚠️  Issue: Important driving behaviors may be missed"),
    )

    try:
        with open(report_path, 'r', encoding='utf-8') as f:
            report = f.read()

        logger.info("\n" + "="*60)
        logger.info("EVALUATION ANALYSIS")
        logger.info("="*60)

        # Extract key insights (simplified parsing)
        for heading, message in known_issues:
            if heading in report:
                logger.warning(message)

        logger.info("\nFor detailed analysis, see the full report.")

    except FileNotFoundError:
        logger.error(f"Report file not found: {report_path}")


if __name__ == "__main__":
    # Script entry point: verify that usable ground truth exists, run the
    # evaluation, then summarize the generated report.
    print("DriveGuard RAGAS Evaluation Runner")
    print("=" * 60)

    # Check if ground truth data exists
    ground_truth_dir = root / "data" / "evaluation" / "ground_truth"
    if not ground_truth_dir.exists():
        print("❌ Ground truth directory not found!")
        print("Please run: python evaluation/1_prepare_evaluation_data.py")
        sys.exit(1)

    # Check for completed ground truth files
    gt_files = list(ground_truth_dir.glob("*.json"))
    if not gt_files:
        print("❌ No ground truth templates found!")
        print("Please run: python evaluation/1_prepare_evaluation_data.py")
        sys.exit(1)

    # Check if any ground truth is completed
    completed_files = 0
    for gt_file in gt_files:
        try:
            with open(gt_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            annotation = data['ground_truth']['annotation']
        except Exception as e:
            # Best-effort scan: a file that cannot be read or lacks the
            # expected keys is not counted, but is reported instead of being
            # silently ignored.
            print(f"⚠️  Skipping unreadable ground truth file {gt_file.name}: {e}")
            continue

        # The template placeholder marks a file that was not annotated yet.
        if annotation != "MANUAL_ANNOTATION_REQUIRED":
            completed_files += 1

    if completed_files == 0:
        print("❌ No completed ground truth annotations found!")
        print(f"Please complete the annotations in: {ground_truth_dir}")
        print("Replace 'MANUAL_ANNOTATION_REQUIRED' with expert annotations")
        sys.exit(1)

    print(f"✅ Found {completed_files} completed ground truth files")
    print("Starting RAGAS evaluation...")
    print()

    # Run evaluation
    run_evaluation()

    # Analyze results
    report_path = root / "data" / "evaluation" / "report" / "evaluation_report.md"
    if report_path.exists():
        analyze_results(report_path)