#!/usr/bin/env python3
"""Script to generate ground truth annotations from system outputs using LLMs."""

import json
import sys
from pathlib import Path
from typing import Dict, List, Any, Optional

# Add project root to path.
# `root` is the parent of the directory that contains this file; appending it
# to sys.path is presumably what lets the `src.*` imports below resolve when
# this file is executed directly as a script. `root` is also reused by main()
# to locate the data directories.
root = Path(__file__).parent.parent
sys.path.append(str(root))

from src.llm.llms import get_llm
from src.utils.settings import settings
from src.utils.log import logger


def generate_annotation_prompt(system_annotation: str, video_path: str) -> str:
    """Build the LLM prompt that turns a structured annotation into a narrative.

    Only the final path component of ``video_path`` is embedded in the prompt;
    the system annotation is inserted verbatim.

    Args:
        system_annotation (str): The system-generated annotation.
        video_path (str): Path to the video file.

    Returns:
        str: Prompt for generating ground truth annotation.
    """
    # Strip directories so the prompt only mentions the file name.
    video_name = Path(video_path).name

    return f"""You are an expert traffic safety analyst creating ground truth annotations for dashcam video evaluation.

Your task is to transform the structured system-generated annotation below into a coherent narrative ground truth annotation.

Video: {video_name}

System Annotation (Structured Format):
{system_annotation}

EXAMPLE TRANSFORMATION:

System Format: "**Traffic Signals and Traffic Signs:** Lane markings with white dashed lines... **Interactions and Intentions:** Ego-vehicle in second lane, black SUV ahead, silver SUV adjacent... **Unsafe Behaviors:** No speeding observed... **Road Features:** Multi-lane highway, dry conditions..."

Ground Truth Format: "The ego vehicle was traveling in the second lane from the left on a multi-lane highway with clear, dry conditions. A black SUV was directly ahead in the same lane, with a silver SUV visible in the adjacent right lane. The ego vehicle maintained a safe following distance and steady speed, with no aggressive driving behaviors observed. All vehicles maintained proper lane discipline throughout the sequence."

INSTRUCTIONS:
1. **Convert to narrative format**: Transform the structured categories into a flowing chronological story
2. **Maintain chronological order**: Describe events as they unfold in time sequence
3. **Focus on the ego vehicle**: Center the narrative on what the ego vehicle is doing and experiencing
4. **Include critical interactions**: Highlight significant interactions with other vehicles, infrastructure, or hazards
5. **Preserve technical accuracy**: Keep all factual information from the system output
6. **Use objective language**: Avoid subjective interpretations, stick to observable facts
7. **Be concise but complete**: Create a comprehensive but readable narrative
8. **Include safety-relevant details**: Emphasize behaviors, positions, and events that matter for safety evaluation

Transform the system annotation above into a narrative ground truth annotation:"""


def generate_ground_truth_annotation(system_annotation: str, video_path: str, model_id: Optional[str] = None) -> str:
    """Ask the LLM to rewrite a system annotation as a ground truth narrative.

    Args:
        system_annotation (str): System-generated annotation to improve.
        video_path (str): Path to the video file.
        model_id (str, optional): LLM model to use. When omitted (or empty),
            the model configured under ``settings.app.llm['main']`` is used.

    Returns:
        str: Generated ground truth annotation, with surrounding whitespace
            stripped.
    """
    # Resolve the model: an explicit id wins, otherwise use the configured default.
    llm = get_llm(model_id if model_id else settings.app.llm['main'])

    prompt = generate_annotation_prompt(system_annotation, video_path)

    logger.debug("Generating ground truth annotation using LLM...")
    return llm.invoke(prompt).content.strip()


def process_ground_truth_file(ground_truth_file: Path, system_output_dir: Path, model_id: Optional[str] = None) -> bool:
    """Fill in the annotation of one ground truth JSON file, if it is still a placeholder.

    The file is only touched when its ``ground_truth.annotation`` value equals
    ``"MANUAL_ANNOTATION_REQUIRED"``. The matching system output is looked up
    in ``system_output_dir`` under the same file stem, and its
    ``system_outputs.annotation`` text is sent to the LLM. On success the
    ground truth file is rewritten in place.

    Args:
        ground_truth_file (Path): Path to the ground truth JSON file.
        system_output_dir (Path): Directory containing system outputs.
        model_id (str, optional): LLM model to use.

    Returns:
        bool: True if file was updated, False if skipped or error occurred.
            Any exception raised while processing is logged and reported as
            False rather than propagated.
    """
    try:
        with open(ground_truth_file, 'r', encoding='utf-8') as gt_handle:
            truth = json.load(gt_handle)

        # Anything other than the placeholder means the file is already annotated.
        existing = truth.get('ground_truth', {}).get('annotation', '')
        if existing != "MANUAL_ANNOTATION_REQUIRED":
            logger.info(f"Skipping {ground_truth_file.name} - annotation already populated")
            return False

        # The system output shares the ground truth file's stem.
        system_output_file = system_output_dir / f"{ground_truth_file.stem}.json"
        if not system_output_file.exists():
            logger.warning(f"No system output found for {ground_truth_file.name}")
            return False

        with open(system_output_file, 'r', encoding='utf-8') as sys_handle:
            system_payload = json.load(sys_handle)

        system_annotation = system_payload.get('system_outputs', {}).get('annotation', '')
        if not system_annotation:
            logger.warning(f"No system annotation found in {system_output_file.name}")
            return False

        video_path = truth.get('video_path', '')

        logger.info(f"Generating ground truth annotation for {ground_truth_file.name}")
        logger.debug(f"System annotation length: {len(system_annotation)} characters")

        generated = generate_ground_truth_annotation(system_annotation, video_path, model_id)
        if not generated:
            logger.error(f"Failed to generate annotation for {ground_truth_file.name}")
            return False

        # Safe to index directly: the placeholder check above proved both keys exist.
        truth['ground_truth']['annotation'] = generated

        with open(ground_truth_file, 'w', encoding='utf-8') as out_handle:
            json.dump(truth, out_handle, indent=2, ensure_ascii=False)

        logger.info(f"✅ Generated annotation for {ground_truth_file.name}")
        logger.info(f"   Length: {len(generated)} characters")
        logger.info(f"   Preview: {generated[:100]}...")

        return True

    except Exception as exc:
        logger.error(f"❌ Failed to process {ground_truth_file}: {exc}")
        return False


def generate_all_annotations(ground_truth_dir: Path, system_output_dir: Path, model_id: Optional[str] = None) -> None:
    """Generate annotations for all ground truth files that need them.

    Every ``*.json`` file directly inside ``ground_truth_dir`` is passed to
    ``process_ground_truth_file``; a summary of the outcome is logged at the
    end. Nothing is returned and nothing is raised for missing directories or
    an empty directory: those cases are logged and the function returns early.

    Args:
        ground_truth_dir (Path): Directory containing ground truth files.
        system_output_dir (Path): Directory containing system outputs.
        model_id (str, optional): LLM model to use.
    """
    if not ground_truth_dir.exists():
        logger.error(f"Ground truth directory not found: {ground_truth_dir}")
        return

    if not system_output_dir.exists():
        logger.error(f"System output directory not found: {system_output_dir}")
        return

    # glob() yields entries in an unspecified order; sort so that runs (and
    # their logs) are reproducible.
    json_files = sorted(ground_truth_dir.glob("*.json"))

    if not json_files:
        logger.warning(f"No JSON files found in {ground_truth_dir}")
        return

    logger.info(f"Found {len(json_files)} ground truth files")
    logger.info(f"Using model: {model_id or settings.app.llm['main']}")
    logger.info("=" * 50)

    updated_count = 0
    skipped_count = 0
    failed_count = 0

    for json_file in json_files:
        logger.info(f"Processing: {json_file.name}")

        try:
            if process_ground_truth_file(json_file, system_output_dir, model_id):
                updated_count += 1
            else:
                # A False result covers both "nothing to do" and errors that
                # process_ground_truth_file handled and logged itself.
                skipped_count += 1
        except Exception as e:
            # Only exceptions that escape process_ground_truth_file land here.
            logger.error(f"Failed to process {json_file.name}: {e}")
            failed_count += 1

        logger.info("-" * 30)

    # Summary
    logger.info("=" * 50)
    logger.info("SUMMARY:")
    logger.info(f"✅ Generated: {updated_count} annotations")
    logger.info(f"⏭️ Skipped: {skipped_count} files")
    logger.info(f"❌ Failed: {failed_count} files")
    logger.info(f"📁 Total: {len(json_files)} files")

    if updated_count > 0:
        # A real newline separates the hints from the summary (the previous
        # "\\n" logged a literal backslash followed by "n").
        logger.info("\n📝 Next steps:")
        logger.info("1. Review the generated annotations in your IDE")
        logger.info("2. Edit annotations manually to improve accuracy")
        logger.info("3. Run extract_scenes_from_annotation.py to extract scenes")
        logger.info("4. Run populate_scenes.py to create violation/accident templates")


def process_single_file(ground_truth_file: str, system_output_dir: Path, model_id: Optional[str] = None) -> None:
    """Validate one ground truth file path and generate its annotation.

    The path must exist and carry a ``.json`` suffix; otherwise an error is
    logged and nothing else happens. The outcome of the underlying
    ``process_ground_truth_file`` call is logged, not returned.

    Args:
        ground_truth_file (str): Path to the ground truth file.
        system_output_dir (Path): Directory containing system outputs.
        model_id (str, optional): LLM model to use.
    """
    target = Path(ground_truth_file)

    if not target.exists():
        logger.error(f"File not found: {target}")
        return

    if target.suffix != '.json':
        logger.error(f"File must be a JSON file: {target}")
        return

    separator = "=" * 50

    logger.info(f"Processing single file: {target.name}")
    logger.info(f"Using model: {model_id or settings.app.llm['main']}")
    logger.info(separator)

    updated = process_ground_truth_file(target, system_output_dir, model_id)

    logger.info(separator)
    # False means either "already annotated / nothing to do" or a logged failure.
    logger.info(
        "✅ Annotation generation completed successfully"
        if updated
        else "⏭️ Annotation generation skipped or failed"
    )


def main():
    """Command-line entry point for generating ground truth annotations.

    Usage: ``python <script> [file.json] [--model=model_id]``

    Accepted arguments, in any order:

    * ``--model=<id>``: model to use instead of the configured default.
      Everything after the first ``=`` is taken as the id.
    * ``<path>.json``: process only this ground truth file. At most one file
      may be given; without it every file in the ground truth directory is
      processed.

    Any other argument (or a second ``.json`` path) prints the usage line and
    returns without doing any work, so a mistyped command line never falls
    through to a full batch run.
    """

    print("=" * 60)
    print("GENERATE GROUND TRUTH ANNOTATIONS FROM SYSTEM OUTPUTS")
    print("=" * 60)

    # Parse command line arguments first so a bad invocation exits before any
    # directory checks or LLM calls happen.
    model_id = None
    single_file = None

    for arg in sys.argv[1:]:
        if arg.startswith('--model='):
            # maxsplit=1 keeps any '=' that is part of the model id itself.
            model_id = arg.split('=', 1)[1]
        elif arg.endswith('.json') and single_file is None:
            single_file = arg
        else:
            print(f"Usage: python {sys.argv[0]} [file.json] [--model=model_id]")
            return

    # Configuration
    ground_truth_dir = root / "data" / "evaluation" / "ground_truth"
    system_output_dir = root / "data" / "evaluation" / "system_outputs"

    # Check directories
    if not ground_truth_dir.exists():
        print(f"❌ Ground truth directory not found: {ground_truth_dir}")
        return

    if not system_output_dir.exists():
        print(f"❌ System output directory not found: {system_output_dir}")
        return

    print(f"📁 Ground truth files: {ground_truth_dir}")
    print(f"📁 System output files: {system_output_dir}")
    if model_id:
        print(f"🤖 Using model: {model_id}")
    print("🔄 This will generate initial annotations that you can manually refine")
    print("💡 Requires API access configured in your settings")
    print()

    # Process files
    if single_file:
        print(f"🎯 Single file mode: {single_file}")
        process_single_file(single_file, system_output_dir, model_id)
    else:
        generate_all_annotations(ground_truth_dir, system_output_dir, model_id)

    print()
    print("=" * 60)
    print("ANNOTATION GENERATION COMPLETE")
    print("=" * 60)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()