"""Video Discovery Manager for finding and organizing video clips.

This manager implements intelligent video discovery with fallback mechanisms
to handle clips with or without rich metadata.
"""

import json
import logging
import sys
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set

# Import models from parent module
sys.path.append(str(Path(__file__).parent.parent))
from models import VideoClip

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class VideoDiscoveryManager:
    """Discover video clips from several sources, richest source first.

    Sources are consulted in this order, and a clip found by an earlier
    source is never overwritten by a later one:

    1. ``*_clips_metadata.json`` files in ``metadata_dir``
    2. ``*.json`` ground truth files in ``ground_truth_dir``
    3. video files found directly in ``dashcam_dir``
    4. ``*.json`` files in ``system_output_dir``

    Only the first source supplies per-clip details; the other three build
    clips from values inferred from the file name plus fixed defaults.
    The combined, sorted result is cached in memory for
    ``CACHE_TTL_SECONDS`` seconds.
    """

    # How long (in seconds) a discovery result is served from the cache.
    CACHE_TTL_SECONDS = 300

    # Placeholder values for clips whose source carries no such details.
    DEFAULT_DURATION = 10.0
    DEFAULT_SEVERITY = 3

    # (substrings, label) pairs checked in order; the first hit wins, so the
    # order of the entries is significant.
    _CATEGORY_PATTERNS = (
        (('cut_off', 'cutoff'), 'cut_off'),
        (('accident',), 'accident'),
        (('turn',), 'turn'),
        (('merge',), 'merge'),
        (('lane_change',), 'lane_change'),
        (('intersection',), 'intersection'),
        (('highway',), 'highway'),
        (('parking',), 'parking'),
        (('pedestrian',), 'pedestrian'),
    )
    _EVENT_PATTERNS = (
        (('accident',), 'accident'),
        (('near_miss',), 'near_miss'),
        (('violation',), 'violation'),
        (('safe',), 'safe_driving'),
        (('aggressive',), 'aggressive'),
    )

    def __init__(self, dashcam_dir: str = "data/dashcam",
                 metadata_dir: str = "data/dashcam/metadata",
                 ground_truth_dir: str = "data/evaluation/ground_truth",
                 system_output_dir: str = "data/evaluation/system_outputs"):
        """Initialize the video discovery manager.

        No filesystem access happens here; directories are only read when
        discovery runs.

        Args:
            dashcam_dir (str): Directory containing video clips.
            metadata_dir (str): Directory containing clip metadata.
            ground_truth_dir (str): Directory containing ground truth files.
            system_output_dir (str): Directory containing system outputs.
        """
        self.dashcam_dir = Path(dashcam_dir)
        self.metadata_dir = Path(metadata_dir)
        self.ground_truth_dir = Path(ground_truth_dir)
        self.system_output_dir = Path(system_output_dir)

        # Cache for discovered clips
        self._clip_cache: Optional[List[VideoClip]] = None
        self._cache_timestamp: Optional[datetime] = None

        # Supported video extensions (compared case-insensitively)
        self.video_extensions = {'.mp4', '.avi', '.mov', '.mkv'}

    # ------------------------------------------------------------------
    # Discovery
    # ------------------------------------------------------------------

    def discover_all_clips(self, force_refresh: bool = False) -> List[VideoClip]:
        """Main discovery method with intelligent fallbacks.

        Args:
            force_refresh (bool): Force refresh of cached clips.

        Returns:
            List[VideoClip]: Clips sorted by sequence number, then video ID.
                Clips without a sequence number come last. The list is a
                fresh copy, so callers may modify it freely.
        """
        if not force_refresh and self._cache_is_fresh():
            logger.debug("Returning cached clips: %d clips", len(self._clip_cache))
            # Hand out a copy so callers cannot corrupt the cache.
            return list(self._clip_cache)

        logger.info("Discovering video clips with intelligent fallbacks...")
        discovered: Dict[str, VideoClip] = {}

        # Priority 1: Rich metadata from s3_extract_clips
        self._load_from_clip_metadata(discovered)
        logger.info("After clip metadata: %d clips", len(discovered))

        # Priority 2: Existing ground truth files
        self._load_from_ground_truth(discovered)
        logger.info("After ground truth: %d clips", len(discovered))

        # Priority 3: Direct video file scanning
        self._load_from_video_files(discovered)
        logger.info("After video files: %d clips", len(discovered))

        # Priority 4: System outputs from evaluation pipeline
        self._load_from_system_outputs(discovered)
        logger.info("After system outputs: %d clips", len(discovered))

        clips = sorted(discovered.values(), key=self._sort_key)

        # Update cache
        self._clip_cache = clips
        self._cache_timestamp = datetime.now()

        logger.info("✅ Discovered %d total video clips", len(clips))
        return list(clips)

    def refresh_cache(self) -> List[VideoClip]:
        """Force refresh of the clip cache.

        Returns:
            List[VideoClip]: Newly discovered clips.
        """
        logger.info("Force refreshing video clip cache...")
        return self.discover_all_clips(force_refresh=True)

    def _cache_is_fresh(self) -> bool:
        """Return True when a cached result exists and has not expired."""
        if self._clip_cache is None or self._cache_timestamp is None:
            return False
        age = (datetime.now() - self._cache_timestamp).total_seconds()
        # A negative age means the wall clock moved backwards; treat the
        # cache as stale rather than trusting it indefinitely.
        return 0 <= age < self.CACHE_TTL_SECONDS

    @staticmethod
    def _sort_key(clip: VideoClip):
        """Sort key: numbered clips by number, unnumbered clips last.

        ``None`` is tested explicitly because 0 is a valid sequence number
        and must sort first, not be mistaken for "missing".
        """
        sequence = clip.sequence_number
        if sequence is None:
            return (True, 0, clip.video_id)
        return (False, sequence, clip.video_id)

    # ------------------------------------------------------------------
    # Individual sources
    # ------------------------------------------------------------------

    def _load_from_clip_metadata(self, discovered: Dict[str, VideoClip]) -> None:
        """Load clips from rich metadata generated by s3_extract_clips.

        Reads every ``*_clips_metadata.json`` file in ``metadata_dir`` and
        adds one clip per entry of its ``clips`` list. Unreadable files and
        malformed entries are logged and skipped individually, so one bad
        entry does not discard the rest of its file.

        Args:
            discovered (Dict[str, VideoClip]): Dictionary to update with discovered clips.
        """
        if not self.metadata_dir.exists():
            logger.debug("Metadata directory not found")
            return

        # Sorted so that duplicate video IDs across files resolve the same
        # way on every run (the last file in name order wins).
        for metadata_file in sorted(self.metadata_dir.glob("*_clips_metadata.json")):
            try:
                with open(metadata_file, 'r', encoding='utf-8') as f:
                    metadata = json.load(f)
                clips_data = metadata.get('clips', [])
                if not isinstance(clips_data, list):
                    raise TypeError("'clips' is not a list")
            except Exception as e:  # best effort: a bad file must not stop discovery
                logger.warning("Failed to load metadata from %s: %s", metadata_file, e)
                continue

            logger.debug("Processing %s: %d clips", metadata_file.name, len(clips_data))

            for clip_data in clips_data:
                try:
                    video_clip = self._clip_from_metadata(clip_data)
                except Exception as e:  # VideoClip's failure modes are not visible here
                    logger.warning("Failed to load metadata from %s: %s", metadata_file, e)
                    continue
                if video_clip is not None:
                    discovered[video_clip.video_id] = video_clip

        rich_count = sum(1 for c in discovered.values() if c.metadata_source == 'rich_metadata')
        logger.debug("Loaded %d clips from metadata", rich_count)

    def _clip_from_metadata(self, clip_data: Dict[str, Any]) -> Optional[VideoClip]:
        """Build a clip from one metadata entry, or None if it has no video ID."""
        video_id = clip_data.get('video_id', '')
        if not video_id:
            return None

        # ``or`` (rather than a .get default) also covers keys that are
        # present but null/empty in the JSON.
        return VideoClip(
            video_id=video_id,
            video_path=clip_data.get('output_path') or self._default_video_path(video_id),
            sequence_number=clip_data.get('sequence_number'),
            category=clip_data.get('category') or 'unknown',
            event_type=clip_data.get('event_type') or 'unmarked',
            duration=clip_data.get('duration', self.DEFAULT_DURATION),
            severity_level=clip_data.get('severity_level', self.DEFAULT_SEVERITY),
            metadata_source='rich_metadata',
            has_full_context=True,
            original_video_id=clip_data.get('original_video_id'),
            start_time=clip_data.get('start_time'),
            confidence=clip_data.get('confidence'),
            marked_at=self._parse_marked_at(clip_data.get('marked_at')),
            file_size=clip_data.get('file_size'),
        )

    @staticmethod
    def _parse_marked_at(value: Any) -> Optional[datetime]:
        """Parse an ISO 8601 timestamp; return None if absent or invalid."""
        if not value:
            return None
        try:
            # datetime.fromisoformat only accepts a trailing "Z" from
            # Python 3.11 on; rewrite it as an explicit UTC offset.
            if isinstance(value, str) and value.endswith('Z'):
                value = value[:-1] + '+00:00'
            return datetime.fromisoformat(value)
        except (TypeError, ValueError):
            logger.warning("Ignoring unparsable marked_at value: %r", value)
            return None

    def _load_from_ground_truth(self, discovered: Dict[str, VideoClip]) -> None:
        """Load clips from existing ground truth files.

        Each ``*.json`` file in ``ground_truth_dir`` yields one clip whose ID
        is the file name without extension. The file's ``video_path`` value
        is used when present; everything else is inferred from the ID.

        Args:
            discovered (Dict[str, VideoClip]): Dictionary to update with discovered clips.
        """
        if not self.ground_truth_dir.exists():
            logger.debug("Ground truth directory not found")
            return

        ground_truth_count = 0
        for gt_file in sorted(self.ground_truth_dir.glob("*.json")):
            video_id = gt_file.stem

            # Skip if already discovered with rich metadata
            existing = discovered.get(video_id)
            if existing is not None and existing.has_full_context:
                continue

            try:
                with open(gt_file, 'r', encoding='utf-8') as f:
                    gt_data = json.load(f)
                video_path = gt_data.get('video_path') or self._default_video_path(video_id)
                video_clip = self._build_basic_clip(video_id, video_path, 'ground_truth')
            except Exception as e:  # best effort: skip this file only
                logger.warning("Failed to load ground truth from %s: %s", gt_file, e)
                continue

            discovered[video_id] = video_clip
            ground_truth_count += 1

        logger.debug("Loaded %d clips from ground truth files", ground_truth_count)

    def _load_from_video_files(self, discovered: Dict[str, VideoClip]) -> None:
        """Load clips from direct video file scanning.

        Considers regular files directly inside ``dashcam_dir`` whose
        extension (case-insensitive) is in ``video_extensions``. Files are
        visited in sorted order, so when several files share a stem the
        first one in path order wins on every run.

        Args:
            discovered (Dict[str, VideoClip]): Dictionary to update with discovered clips.
        """
        if not self.dashcam_dir.exists():
            logger.debug("Dashcam directory not found")
            return

        extensions = {ext.lower() for ext in self.video_extensions}
        try:
            video_files = sorted(
                entry for entry in self.dashcam_dir.iterdir()
                if entry.suffix.lower() in extensions and entry.is_file()
            )
        except OSError as e:
            logger.warning("Failed to scan video directory %s: %s", self.dashcam_dir, e)
            return

        file_scan_count = 0
        for video_file in video_files:
            video_id = video_file.stem

            # Skip if already discovered
            if video_id in discovered:
                continue

            try:
                video_clip = self._build_basic_clip(
                    video_id,
                    str(video_file),
                    'file_scan',
                    file_size=video_file.stat().st_size,
                )
            except Exception as e:  # best effort: skip this file only
                logger.warning("Failed to load video file %s: %s", video_file, e)
                continue

            discovered[video_id] = video_clip
            file_scan_count += 1

        logger.debug("Loaded %d clips from direct file scan", file_scan_count)

    def _load_from_system_outputs(self, discovered: Dict[str, VideoClip]) -> None:
        """Load clips from system outputs directory.

        Only the file names are used: each ``*.json`` file in
        ``system_output_dir`` yields a clip whose ID is the file stem and
        whose video is assumed to be ``<dashcam_dir>/<id>.mp4``.

        Args:
            discovered (Dict[str, VideoClip]): Dictionary to update with discovered clips.
        """
        if not self.system_output_dir.exists():
            logger.debug("System outputs directory not found")
            return

        system_count = 0
        for system_file in sorted(self.system_output_dir.glob("*.json")):
            video_id = system_file.stem

            # Skip if already discovered
            if video_id in discovered:
                continue

            try:
                video_clip = self._build_basic_clip(
                    video_id,
                    self._default_video_path(video_id),  # Assume standard location
                    'system_output',
                )
            except Exception as e:  # best effort: skip this file only
                logger.warning("Failed to load from system output %s: %s", system_file, e)
                continue

            discovered[video_id] = video_clip
            system_count += 1

        logger.debug("Loaded %d clips from system outputs", system_count)

    def _default_video_path(self, video_id: str) -> str:
        """Return the assumed location ``<dashcam_dir>/<video_id>.mp4``.

        Forward slashes are used on every platform, which for the default
        ``dashcam_dir`` gives ``data/dashcam/<video_id>.mp4``.
        """
        return (self.dashcam_dir / f"{video_id}.mp4").as_posix()

    def _build_basic_clip(self, video_id: str, video_path: str,
                          metadata_source: str, **extra: Any) -> VideoClip:
        """Build a clip for a source that has no rich metadata.

        Sequence number, category and event type are inferred from the ID;
        duration and severity use the class defaults. ``extra`` is passed
        through to ``VideoClip`` unchanged (e.g. ``file_size``).
        """
        return VideoClip(
            video_id=video_id,
            video_path=video_path,
            sequence_number=self._extract_sequence_number(video_id),
            category=self._infer_category_from_id(video_id),
            event_type=self._infer_event_from_id(video_id),
            duration=self.DEFAULT_DURATION,
            severity_level=self.DEFAULT_SEVERITY,
            metadata_source=metadata_source,
            has_full_context=False,
            **extra,
        )

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------

    def get_clip_by_id(self, video_id: str) -> Optional[VideoClip]:
        """Get specific clip by video ID.

        Args:
            video_id (str): Video clip identifier.

        Returns:
            Optional[VideoClip]: Video clip if found, None otherwise.
        """
        clips = self.discover_all_clips()
        return next((clip for clip in clips if clip.video_id == video_id), None)

    def search_clips(self, query: str) -> List[VideoClip]:
        """Search clips by video ID, category, or event type.

        The match is a case-insensitive substring test against each of the
        three fields; a field that is None never matches.

        Args:
            query (str): Search query.

        Returns:
            List[VideoClip]: Matching video clips.
        """
        needle = query.lower()
        return [
            clip for clip in self.discover_all_clips()
            if any(
                needle in (field or '').lower()
                for field in (clip.video_id, clip.category, clip.event_type)
            )
        ]

    def get_clips_by_category(self, category: str) -> List[VideoClip]:
        """Get clips filtered by category.

        Args:
            category (str): Category to filter by (case-insensitive).

        Returns:
            List[VideoClip]: Clips in the specified category.
        """
        wanted = category.lower()
        return [
            clip for clip in self.discover_all_clips()
            if (clip.category or '').lower() == wanted
        ]

    def get_statistics(self) -> Dict[str, Any]:
        """Get discovery statistics.

        Returns:
            Dict[str, Any]: Totals, per-source and per-category counts, and
                the smallest/largest sequence number (None when no clip has
                one).
        """
        clips = self.discover_all_clips()

        full_context_count = sum(1 for clip in clips if clip.has_full_context)
        sequences = [c.sequence_number for c in clips if c.sequence_number is not None]

        return {
            'total_clips': len(clips),
            'full_context_clips': full_context_count,
            'basic_clips': len(clips) - full_context_count,
            # Converted back to plain dicts so the result stays JSON-friendly.
            'source_breakdown': dict(Counter(clip.metadata_source for clip in clips)),
            'category_breakdown': dict(Counter(clip.category for clip in clips)),
            'sequence_range': {
                'min': min(sequences, default=None),
                'max': max(sequences, default=None),
            },
        }

    # ------------------------------------------------------------------
    # Inference from the video ID
    # ------------------------------------------------------------------

    def _extract_sequence_number(self, video_id: str) -> Optional[int]:
        """Extract sequence number from video ID.

        Args:
            video_id (str): Video ID to parse.

        Returns:
            Optional[int]: The first four characters as an integer when they
                are all digits (pattern like "0000_..."), None otherwise.
        """
        prefix = video_id[:4] if video_id else ''
        if len(prefix) < 4 or not prefix.isdigit():
            return None
        try:
            return int(prefix)
        except ValueError:
            # str.isdigit() accepts some characters int() rejects (e.g. "²").
            return None

    def _infer_category_from_id(self, video_id: str) -> str:
        """Infer category from video ID.

        Args:
            video_id (str): Video ID to analyze.

        Returns:
            str: Label of the first category pattern contained in the
                lower-cased ID, or 'unknown' when none matches.
        """
        return self._first_match(video_id, self._CATEGORY_PATTERNS, 'unknown')

    def _infer_event_from_id(self, video_id: str) -> str:
        """Infer event type from video ID.

        Args:
            video_id (str): Video ID to analyze.

        Returns:
            str: Label of the first event pattern contained in the
                lower-cased ID, or 'unmarked' when none matches.
        """
        return self._first_match(video_id, self._EVENT_PATTERNS, 'unmarked')

    @staticmethod
    def _first_match(video_id: str, patterns, default: str) -> str:
        """Return the label of the first pattern found in ``video_id``."""
        lowered = video_id.lower()
        for needles, label in patterns:
            if any(needle in lowered for needle in needles):
                return label
        return default