"""YouTube dashcam video downloader for dataset creation."""

import sys
from pathlib import Path
from typing import List, Dict, Optional
import subprocess
import json
import re
import argparse
import time
from datetime import datetime
from tqdm import tqdm

# Put the directory three ``.parent`` hops above this file on ``sys.path``
# (presumably the project root — confirm against the repository layout) so
# that the absolute ``src.`` import below can be resolved.
root = Path(__file__).parent.parent.parent
sys.path.append(str(root))

from src.utils.log import logger


class YouTubeDownloader:
    """Downloads dashcam videos from YouTube for dataset creation."""
    
    def __init__(self, download_dir: Path):
        """Prepare the download directory, resume checkpoint and yt-dlp check.

        Args:
            download_dir: Directory where downloaded videos are saved. It is
                created (including missing parents) if it does not exist.

        Raises:
            RuntimeError: If the ``yt-dlp`` command cannot be run.
        """
        download_dir.mkdir(parents=True, exist_ok=True)
        self.download_dir = download_dir

        # Resume support: the checkpoint lives inside the download directory
        # and is loaded, then pruned of entries whose files are gone or empty.
        self.checkpoint_file = download_dir / ".download_checkpoint.json"
        self.checkpoint_data = self._load_checkpoint()
        self._validate_and_clean_checkpoint()

        # Fail early when the external downloader is missing.
        self._check_ytdlp()
    
    def _check_ytdlp(self):
        """Check if yt-dlp is installed."""
        try:
            subprocess.run(['yt-dlp', '--version'], capture_output=True, check=True)
            logger.info("yt-dlp is available")
        except (subprocess.CalledProcessError, FileNotFoundError):
            logger.error("yt-dlp not found. Install with: pip install yt-dlp")
            raise RuntimeError("yt-dlp is required for downloading videos")
    
    def _load_checkpoint(self) -> Dict:
        """Load checkpoint data from file."""
        if self.checkpoint_file.exists():
            try:
                with open(self.checkpoint_file, 'r') as f:
                    checkpoint_data = json.load(f)
                logger.info(f"Loaded checkpoint with {len(checkpoint_data.get('downloaded_videos', {}))} existing videos")
                return checkpoint_data
            except (json.JSONDecodeError, Exception) as e:
                logger.warning(f"Failed to load checkpoint: {e}. Starting fresh.")
                return {}
        return {}
    
    def _save_checkpoint(self):
        """Save checkpoint data to file."""
        try:
            with open(self.checkpoint_file, 'w') as f:
                json.dump(self.checkpoint_data, f, indent=2, default=str)
        except Exception as e:
            logger.warning(f"Failed to save checkpoint: {e}")
    
    def _validate_and_clean_checkpoint(self):
        """Validate checkpoint data and remove invalid entries."""
        video_metadata = self.checkpoint_data.get('video_metadata', {})
        invalid_entries = []
        
        for key, metadata in video_metadata.items():
            file_path = metadata.get('file_path')
            if file_path:
                path = Path(file_path)
                if not path.exists() or path.stat().st_size == 0:
                    invalid_entries.append(key)
                    logger.warning(f"Removing invalid checkpoint entry: {key} (file not found or empty)")
        
        # Remove invalid entries
        for key in invalid_entries:
            del video_metadata[key]
            # Also remove from downloaded_videos list
            if ':' in key:
                category, video_id = key.split(':', 1)
                downloaded_videos = self.checkpoint_data.get('downloaded_videos', {})
                if category in downloaded_videos and video_id in downloaded_videos[category]:
                    downloaded_videos[category].remove(video_id)
        
        if invalid_entries:
            logger.info(f"Cleaned up {len(invalid_entries)} invalid checkpoint entries")
            self._save_checkpoint()
        
        return len(invalid_entries)
    
    def _is_video_downloaded(self, video_id: str, category: str) -> bool:
        """Check if a video has already been downloaded."""
        video_metadata = self.checkpoint_data.get('video_metadata', {})
        return f"{category}:{video_id}" in video_metadata
    
    def _is_duplicate_video(self, video_info: Dict, category: str) -> Optional[Dict]:
        """Check if video is duplicate based on ID or URL with enhanced detection."""
        video_metadata = self.checkpoint_data.get('video_metadata', {})
        video_id = video_info['id']
        video_url = video_info['url']
        
        # Method 1: Check exact category:video_id key
        metadata_key = f"{category}:{video_id}"
        if metadata_key in video_metadata:
            existing_metadata = video_metadata[metadata_key]
            logger.debug(f"Found exact duplicate: {video_id} in same category {category}")
            return existing_metadata
        
        # Method 2: Check by video_id across all categories (in case video moved categories)
        for key, metadata in video_metadata.items():
            if metadata.get('video_id') == video_id:
                existing_category = key.split(':', 1)[0] if ':' in key else 'unknown'
                logger.debug(f"Found video_id duplicate: {video_id} in different category {existing_category} -> {category}")
                return metadata
        
        # Method 3: Check by URL across all categories (in case URL format changed)
        for key, metadata in video_metadata.items():
            if metadata.get('url') == video_url:
                existing_category = key.split(':', 1)[0] if ':' in key else 'unknown'
                logger.debug(f"Found URL duplicate: {video_url} in category {existing_category} -> {category}")
                return metadata
                
        return None
    
    def _mark_video_downloaded(self, video_info: Dict, category: str, file_path: Path):
        """Mark a video as downloaded with comprehensive metadata."""
        if 'video_metadata' not in self.checkpoint_data:
            self.checkpoint_data['video_metadata'] = {}
        
        # Store comprehensive metadata
        metadata_key = f"{category}:{video_info['id']}"
        
        # Extract safety-related keywords from title and description
        safety_keywords = self._extract_safety_keywords(video_info.get('title', ''), video_info.get('description', ''))
        
        video_metadata = {
            # Basic video information
            'video_id': video_info['id'],
            'title': video_info['title'],
            'url': video_info['url'],
            'category': category,
            'duration': video_info.get('duration'),
            'upload_date': video_info.get('upload_date'),
            
            # Engagement metrics
            'view_count': video_info.get('view_count', 0),
            'like_count': video_info.get('like_count', 0),
            'comment_count': video_info.get('comment_count', 0),
            
            # Channel information
            'channel_name': video_info.get('channel_name'),
            'channel_id': video_info.get('channel_id'),
            'channel_url': video_info.get('channel_url'),
            
            # Content information
            'description': video_info.get('description', ''),
            'tags': video_info.get('tags', []),
            'thumbnail': video_info.get('thumbnail'),
            'age_limit': video_info.get('age_limit', 0),
            'live_status': video_info.get('live_status'),
            'availability': video_info.get('availability'),
            
            # Technical information
            'resolution': video_info.get('resolution'),
            'fps': video_info.get('fps'),
            'video_codec': video_info.get('video_codec'),
            'audio_codec': video_info.get('audio_codec'),
            
            # DriveGuard-specific metadata
            'relevance_score': video_info.get('relevance_score', 0),
            'search_query_used': video_info.get('search_query_used', ''),
            'search_rank': video_info.get('search_rank', 0),
            'safety_keywords': safety_keywords,
            'weather_mentioned': self._extract_weather_keywords(video_info.get('title', ''), video_info.get('description', '')),
            'time_of_day': self._extract_time_keywords(video_info.get('title', ''), video_info.get('description', '')),
            
            # File information
            'file_path': str(file_path),
            'file_size': file_path.stat().st_size if file_path.exists() else 0,
            'download_time': datetime.now().isoformat()
        }
        
        self.checkpoint_data['video_metadata'][metadata_key] = video_metadata
        
        # Also maintain simple lists for backward compatibility
        if 'downloaded_videos' not in self.checkpoint_data:
            self.checkpoint_data['downloaded_videos'] = {}
        if category not in self.checkpoint_data['downloaded_videos']:
            self.checkpoint_data['downloaded_videos'][category] = []
        
        if video_info['id'] not in self.checkpoint_data['downloaded_videos'][category]:
            self.checkpoint_data['downloaded_videos'][category].append(video_info['id'])
        
        self._save_checkpoint()
    
    def get_resume_status(self) -> Dict:
        """Get current resume status information."""
        downloaded_videos = self.checkpoint_data.get('downloaded_videos', {})
        video_metadata = self.checkpoint_data.get('video_metadata', {})
        
        total_downloaded = sum(len(videos) for videos in downloaded_videos.values())
        total_size = sum(metadata.get('file_size', 0) for metadata in video_metadata.values())
        
        # Check for duplicate detection stats
        duplicate_stats = self._get_duplicate_detection_stats()
        
        return {
            'total_downloaded': total_downloaded,
            'categories_with_downloads': len(downloaded_videos),
            'category_counts': {cat: len(videos) for cat, videos in downloaded_videos.items()},
            'total_size_mb': total_size / 1024 / 1024,
            'checkpoint_exists': self.checkpoint_file.exists(),
            'last_checkpoint_time': self.checkpoint_data.get('last_update_time', 'Never'),
            'has_metadata': len(video_metadata) > 0,
            'duplicate_detection': duplicate_stats
        }
    
    def _get_duplicate_detection_stats(self) -> Dict:
        """Get statistics about duplicate detection capabilities."""
        video_metadata = self.checkpoint_data.get('video_metadata', {})
        
        # Count videos by video_id and URL to detect potential duplicates
        video_ids = {}
        urls = {}
        cross_category_duplicates = []
        
        for key, metadata in video_metadata.items():
            video_id = metadata.get('video_id', '')
            url = metadata.get('url', '')
            category = key.split(':', 1)[0] if ':' in key else 'unknown'
            
            if video_id:
                if video_id in video_ids:
                    existing_category = video_ids[video_id]
                    if existing_category != category:
                        cross_category_duplicates.append({
                            'video_id': video_id,
                            'categories': [existing_category, category]
                        })
                else:
                    video_ids[video_id] = category
            
            if url:
                urls[url] = urls.get(url, 0) + 1
        
        url_duplicates = {url: count for url, count in urls.items() if count > 1}
        
        return {
            'total_unique_video_ids': len(video_ids),
            'cross_category_duplicates': len(cross_category_duplicates),
            'url_duplicates': len(url_duplicates),
            'detection_ready': len(video_metadata) > 0
        }
    
    def export_metadata(self, output_file: Optional[Path] = None) -> Path:
        """Export video metadata to JSON file.
        
        Args:
            output_file: Optional output file path
            
        Returns:
            Path to exported metadata file
        """
        if output_file is None:
            output_file = self.download_dir / "video_metadata.json"
        
        metadata_export = {
            'export_time': datetime.now().isoformat(),
            'total_videos': len(self.checkpoint_data.get('video_metadata', {})),
            'download_directory': str(self.download_dir),
            'videos': self.checkpoint_data.get('video_metadata', {})
        }
        
        with open(output_file, 'w') as f:
            json.dump(metadata_export, f, indent=2, default=str)
        
        logger.info(f"Exported metadata for {metadata_export['total_videos']} videos to {output_file}")
        return output_file
    
    def _extract_safety_keywords(self, title: str, description: str) -> List[str]:
        """Extract safety-related keywords from title and description."""
        safety_terms = {
            'accident', 'crash', 'collision', 'near miss', 'close call',
            'dangerous', 'reckless', 'aggressive', 'road rage', 'tailgate',
            'speed', 'racing', 'weaving', 'cutting off', 'brake check',
            'red light', 'stop sign', 'intersection', 'merge', 'lane change',
            'pedestrian', 'cyclist', 'motorcycle', 'truck', 'bus',
            'weather', 'rain', 'snow', 'ice', 'fog', 'night', 'construction',
            'emergency', 'police', 'ambulance', 'fire truck'
        }
        
        text = (title + ' ' + description).lower()
        found_keywords = []
        
        for term in safety_terms:
            if term in text:
                found_keywords.append(term)
        
        return found_keywords
    
    def _extract_weather_keywords(self, title: str, description: str) -> List[str]:
        """Extract weather-related keywords."""
        weather_terms = {
            'rain', 'rainy', 'wet', 'storm', 'heavy rain', 'drizzle',
            'snow', 'snowy', 'blizzard', 'ice', 'icy', 'sleet', 'hail',
            'fog', 'foggy', 'mist', 'misty', 'visibility',
            'sunny', 'clear', 'cloudy', 'overcast',
            'wind', 'windy', 'tornado', 'hurricane'
        }
        
        text = (title + ' ' + description).lower()
        found_weather = []
        
        for term in weather_terms:
            if term in text:
                found_weather.append(term)
        
        return found_weather
    
    def _extract_time_keywords(self, title: str, description: str) -> List[str]:
        """Extract time of day keywords."""
        time_terms = {
            'night', 'nighttime', 'evening', 'dark', 'darkness',
            'day', 'daytime', 'morning', 'afternoon', 'dawn', 'dusk',
            'sunrise', 'sunset', 'twilight', 'midnight',
            'rush hour', 'traffic hour'
        }
        
        text = (title + ' ' + description).lower()
        found_time = []
        
        for term in time_terms:
            if term in text:
                found_time.append(term)
        
        return found_time
    
    def get_search_queries(self) -> List[Dict[str, str]]:
        """Get search queries for different traffic scenarios.
        
        Returns:
            List of search queries with categories
        """
        return [
            # CRITICAL SAFETY VIOLATIONS
            {"category": "red_light_violations", "query": "dashcam red light running violation traffic light"},
            {"category": "red_light_violations", "query": "dashcam running red light intersection accident"},
            {"category": "red_light_violations", "query": "dashcam traffic light violation close call"},
            
            {"category": "stop_sign_violations", "query": "dashcam stop sign violation rolling stop"},
            {"category": "stop_sign_violations", "query": "dashcam stop sign running intersection"},
            {"category": "stop_sign_violations", "query": "dashcam failure to stop sign"},
            
            {"category": "wrong_way_driving", "query": "dashcam wrong way driver highway"},
            {"category": "wrong_way_driving", "query": "dashcam wrong way driving head on"},
            {"category": "wrong_way_driving", "query": "dashcam driving wrong direction"},
            
            {"category": "school_bus_violations", "query": "dashcam school bus stop sign violation"},
            {"category": "school_bus_violations", "query": "dashcam passing school bus illegally"},
            {"category": "school_bus_violations", "query": "dashcam school bus safety violation"},
            
            # AGGRESSIVE DRIVING BEHAVIORS  
            {"category": "road_rage", "query": "dashcam road rage aggressive driver"},
            {"category": "road_rage", "query": "dashcam road rage incident confrontation"},
            {"category": "road_rage", "query": "dashcam aggressive driving behavior"},
            
            {"category": "tailgating", "query": "dashcam tailgating following too close"},
            {"category": "tailgating", "query": "dashcam aggressive tailgating brake check"},
            {"category": "tailgating", "query": "dashcam following distance too close"},
            
            {"category": "dangerous_overtaking", "query": "dashcam dangerous overtaking passing"},
            {"category": "dangerous_overtaking", "query": "dashcam unsafe passing maneuver"},
            {"category": "dangerous_overtaking", "query": "dashcam reckless overtaking oncoming traffic"},
            
            {"category": "cutting_off", "query": "dashcam cutting off lane change"},
            {"category": "cutting_off", "query": "dashcam cut off dangerous merge"},
            {"category": "cutting_off", "query": "dashcam unsafe lane change cutting"},
            
            {"category": "speeding", "query": "dashcam speeding excessive speed limit"},
            {"category": "speeding", "query": "dashcam street racing illegal racing"},
            {"category": "speeding", "query": "dashcam high speed dangerous driving"},
            
            # INTERSECTION SCENARIOS
            {"category": "intersections", "query": "dashcam intersection close call traffic light"},
            {"category": "intersections", "query": "dashcam intersection collision near miss"},
            {"category": "intersections", "query": "dashcam left turn yield failure intersection"},
            {"category": "intersections", "query": "dashcam right turn on red violation"},
            {"category": "intersections", "query": "dashcam intersection right of way violation"},
            
            {"category": "roundabouts", "query": "dashcam roundabout dangerous driving"},
            {"category": "roundabouts", "query": "dashcam roundabout yield failure"},
            {"category": "roundabouts", "query": "dashcam roundabout wrong way"},
            
            # HIGHWAY AND MERGING
            {"category": "highway_merging", "query": "dashcam highway merge dangerous"},
            {"category": "highway_merging", "query": "dashcam highway on ramp merge failure"},
            {"category": "highway_merging", "query": "dashcam highway merge cutoff"},
            {"category": "highway_merging", "query": "dashcam highway exit dangerous"},
            
            {"category": "highway_lane_changes", "query": "dashcam highway lane change cutting off"},
            {"category": "highway_lane_changes", "query": "dashcam highway weaving dangerous"},
            {"category": "highway_lane_changes", "query": "dashcam highway blind spot accident"},
            
            # VULNERABLE ROAD USERS
            {"category": "pedestrian_conflicts", "query": "dashcam pedestrian crossing close call"},
            {"category": "pedestrian_conflicts", "query": "dashcam crosswalk pedestrian violation"},
            {"category": "pedestrian_conflicts", "query": "dashcam pedestrian jaywalking accident"},
            {"category": "pedestrian_conflicts", "query": "dashcam school zone children crossing"},
            
            {"category": "cyclist_conflicts", "query": "dashcam cyclist bike lane dangerous"},
            {"category": "cyclist_conflicts", "query": "dashcam bicycle accident close call"},
            {"category": "cyclist_conflicts", "query": "dashcam bike lane violation car"},
            {"category": "cyclist_conflicts", "query": "dashcam dooring cyclist accident"},
            
            {"category": "motorcycle_interactions", "query": "dashcam motorcycle lane splitting"},
            {"category": "motorcycle_interactions", "query": "dashcam motorcycle blind spot"},
            {"category": "motorcycle_interactions", "query": "dashcam motorcycle filtering dangerous"},
            
            # ENVIRONMENTAL HAZARDS
            {"category": "weather_rain", "query": "dashcam rain driving dangerous weather"},
            {"category": "weather_rain", "query": "dashcam wet road hydroplaning"},
            {"category": "weather_rain", "query": "dashcam heavy rain visibility poor"},
            
            {"category": "weather_snow_ice", "query": "dashcam snow ice dangerous driving"},
            {"category": "weather_snow_ice", "query": "dashcam winter driving accident slippery"},
            {"category": "weather_snow_ice", "query": "dashcam ice road sliding accident"},
            
            {"category": "weather_fog", "query": "dashcam fog low visibility driving"},
            {"category": "weather_fog", "query": "dashcam foggy conditions accident"},
            {"category": "weather_fog", "query": "dashcam heavy fog dangerous driving"},
            
            {"category": "night_driving", "query": "dashcam night driving dangerous"},
            {"category": "night_driving", "query": "dashcam headlight failure night"},
            {"category": "night_driving", "query": "dashcam night visibility accident"},
            
            # CONSTRUCTION AND WORK ZONES
            {"category": "construction_zones", "query": "dashcam construction zone violation"},
            {"category": "construction_zones", "query": "dashcam work zone speeding dangerous"},
            {"category": "construction_zones", "query": "dashcam construction worker safety"},
            
            # EMERGENCY SITUATIONS
            {"category": "emergency_vehicles", "query": "dashcam emergency vehicle failure yield"},
            {"category": "emergency_vehicles", "query": "dashcam ambulance fire truck police"},
            {"category": "emergency_vehicles", "query": "dashcam emergency vehicle blocking"},
            
            {"category": "vehicle_breakdowns", "query": "dashcam vehicle breakdown highway"},
            {"category": "vehicle_breakdowns", "query": "dashcam disabled vehicle accident"},
            {"category": "vehicle_breakdowns", "query": "dashcam emergency hazard lights"},
            
            # PARKING AND BACKING
            {"category": "parking_accidents", "query": "dashcam parking lot accident backing"},
            {"category": "parking_accidents", "query": "dashcam reverse backing accident"},
            {"category": "parking_accidents", "query": "dashcam parking garage accident"},
            {"category": "parking_accidents", "query": "dashcam parallel parking accident"},
            
            # DISTRACTED AND IMPAIRED DRIVING
            {"category": "distracted_driving", "query": "dashcam distracted driver phone"},
            {"category": "distracted_driving", "query": "dashcam texting while driving"},
            {"category": "distracted_driving", "query": "dashcam driver distraction accident"},
            
            {"category": "drowsy_driving", "query": "dashcam drowsy driver falling asleep"},
            {"category": "drowsy_driving", "query": "dashcam tired driver weaving"},
            {"category": "drowsy_driving", "query": "dashcam fatigue driving accident"},
            
            # COMMERCIAL VEHICLE SCENARIOS
            {"category": "truck_interactions", "query": "dashcam truck blind spot accident"},
            {"category": "truck_interactions", "query": "dashcam truck following too close"},
            {"category": "truck_interactions", "query": "dashcam semi truck dangerous"},
            {"category": "truck_interactions", "query": "dashcam truck brake failure"},
            
            {"category": "bus_interactions", "query": "dashcam bus stop dangerous"},
            {"category": "bus_interactions", "query": "dashcam city bus accident"},
            {"category": "bus_interactions", "query": "dashcam bus lane violation"},
            
            # INFRASTRUCTURE HAZARDS
            {"category": "railroad_crossings", "query": "dashcam railroad crossing violation"},
            {"category": "railroad_crossings", "query": "dashcam train crossing gate running"},
            {"category": "railroad_crossings", "query": "dashcam railway crossing accident"},
            
            {"category": "bridge_tunnel", "query": "dashcam bridge tunnel accident"},
            {"category": "bridge_tunnel", "query": "dashcam tunnel driving dangerous"},
            {"category": "bridge_tunnel", "query": "dashcam bridge height clearance"},
            
            {"category": "road_debris", "query": "dashcam road debris accident"},
            {"category": "road_debris", "query": "dashcam object road hazard"},
            {"category": "road_debris", "query": "dashcam debris avoidance swerve"},
            
            # ANIMAL ENCOUNTERS
            {"category": "animal_crossings", "query": "dashcam deer crossing accident"},
            {"category": "animal_crossings", "query": "dashcam animal road wildlife"},
            {"category": "animal_crossings", "query": "dashcam pet dog cat road"},
            
            # URBAN DRIVING SCENARIOS
            {"category": "urban_complex", "query": "dashcam city driving narrow streets"},
            {"category": "urban_complex", "query": "dashcam downtown traffic congestion"},
            {"category": "urban_complex", "query": "dashcam urban intersection busy"},
            {"category": "urban_complex", "query": "dashcam delivery truck blocking traffic"},
            
            # FOLLOWING DISTANCE AND SPACE MANAGEMENT
            {"category": "following_distance", "query": "dashcam rear end collision following"},
            {"category": "following_distance", "query": "dashcam brake checking tailgating"},
            {"category": "following_distance", "query": "dashcam sudden stop rear end"},
            
            # LANE DISCIPLINE VIOLATIONS
            {"category": "lane_violations", "query": "dashcam improper lane change"},
            {"category": "lane_violations", "query": "dashcam lane departure accident"},
            {"category": "lane_violations", "query": "dashcam crossing solid line"},
            {"category": "lane_violations", "query": "dashcam shoulder driving illegal"},
            
            # YIELD AND RIGHT OF WAY
            {"category": "yield_violations", "query": "dashcam failure to yield accident"},
            {"category": "yield_violations", "query": "dashcam yield sign violation"},
            {"category": "yield_violations", "query": "dashcam right of way violation"},
            
            # GENERAL ACCIDENT SCENARIOS (for comprehensive coverage)
            {"category": "accidents_general", "query": "dashcam car accident compilation close call"},
            {"category": "accidents_general", "query": "dashcam near miss compilation dangerous"},
            {"category": "accidents_general", "query": "dashcam traffic accident prevention"},
        ]
    
    def search_videos(self, query: str, max_results: int = 10) -> List[Dict]:
        """Search YouTube through ``yt-dlp`` and collect basic video details.

        Args:
            query: Search query text.
            max_results: Number of results requested from the search.

        Returns:
            One dict per search result that could be parsed from the command
            output, or an empty list when the ``yt-dlp`` call fails.
        """
        search_cmd = [
            'yt-dlp',
            f'ytsearch{max_results}:{query}',
            '--dump-json',
            '--no-download',
            '--flat-playlist'
        ]

        try:
            completed = subprocess.run(search_cmd, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to search videos: {e}")
            return []

        found = []
        # Each non-empty output line is parsed as a separate JSON document.
        for raw_line in completed.stdout.strip().split('\n'):
            if not raw_line:
                continue
            try:
                entry = json.loads(raw_line)
            except json.JSONDecodeError:
                # Lines that are not valid JSON are ignored.
                continue

            entry_id = entry.get('id')
            description = entry.get('description')
            if entry.get('width'):
                resolution = f"{entry.get('width', 0)}x{entry.get('height', 0)}"
            else:
                resolution = None

            found.append({
                'id': entry_id,
                'title': entry.get('title'),
                'url': f"https://www.youtube.com/watch?v={entry_id}",
                'duration': entry.get('duration'),
                'view_count': entry.get('view_count', 0),
                'upload_date': entry.get('upload_date'),
                # Enhanced metadata
                'like_count': entry.get('like_count', 0),
                'comment_count': entry.get('comment_count', 0),
                'channel_name': entry.get('uploader', entry.get('channel')),
                'channel_id': entry.get('channel_id'),
                'channel_url': entry.get('channel_url'),
                # Descriptions are cut to their first 500 characters.
                'description': description[:500] if description else '',
                'tags': entry.get('tags', []),
                'thumbnail': entry.get('thumbnail'),
                'resolution': resolution,
                'fps': entry.get('fps'),
                'video_codec': entry.get('vcodec'),
                'audio_codec': entry.get('acodec'),
                'age_limit': entry.get('age_limit', 0),
                'live_status': entry.get('live_status'),
                'availability': entry.get('availability')
            })

        logger.info(f"Found {len(found)} videos for query: {query}")
        return found
    
    def filter_suitable_videos(self, videos: List[Dict]) -> List[Dict]:
        """Filter videos suitable for dashcam dataset.

        A video is kept when all of the following hold:

        * its duration is between 2 and 30 minutes (inclusive),
        * its lower-cased title contains none of the skip keywords,
        * its lower-cased title contains at least one dashcam or safety keyword.

        Matching is plain substring matching on the title. Every kept video
        dict is annotated in place with a ``relevance_score`` key (2 points
        per dashcam keyword, 1 point per safety keyword).

        Args:
            videos: List of video information

        Returns:
            Filtered list of suitable videos, sorted by relevance score and
            then view count, both descending
        """
        # Keyword tables are loop-invariant, so build them once per call.
        # Skip music, memes, gaming content, and non-traffic content.
        skip_keywords = (
            'music', 'song', 'gta', 'game', 'funny', 'meme', 'compilation music',
            'review', 'unboxing', 'installation', 'how to install', 'product test',
            'funny compilation', 'epic fail', 'try not to laugh', 'reaction',
        )
        dashcam_keywords = ('dashcam', 'dash cam', 'car camera', 'driving camera', 'road camera')
        safety_keywords = (
            'accident', 'crash', 'collision', 'close call', 'near miss', 'dangerous',
            'violation', 'traffic light', 'intersection', 'merge', 'lane change',
            'pedestrian', 'cyclist', 'road rage', 'aggressive', 'speeding',
            'weather', 'rain', 'snow', 'fog', 'construction', 'emergency',
        )

        suitable = []

        for video in videos:
            # Filter by duration (prefer 2-30 minutes for good content density)
            duration = video.get('duration', 0)
            if not duration or duration < 120 or duration > 1800:  # 2 min to 30 min
                continue

            # The key may be present with a None value, so `.get(key, '')`
            # alone is not enough to guarantee a string here.
            title = (video.get('title') or '').lower()

            if any(keyword in title for keyword in skip_keywords):
                continue

            # Score based on relevance: dashcam terms weigh double.
            relevance_score = (
                2 * sum(1 for kw in dashcam_keywords if kw in title)
                + sum(1 for kw in safety_keywords if kw in title)
            )

            if relevance_score > 0:
                video['relevance_score'] = relevance_score
                suitable.append(video)

        # Sort by relevance and view count. `or 0` guards against a None
        # view_count, which would otherwise make the tuple comparison raise.
        suitable.sort(
            key=lambda v: (v.get('relevance_score') or 0, v.get('view_count') or 0),
            reverse=True,
        )

        logger.info(f"Filtered to {len(suitable)} suitable videos")
        return suitable
    
    def download_video(self, video_info: Dict, category: str, check_existing: bool = True, progress_bar: Optional[tqdm] = None) -> Optional[Path]:
        """Download a single video.

        When ``check_existing`` is true and the duplicate check reports a
        previous download whose file still exists and is non-empty, that file
        is returned without downloading again. If the recorded file is missing
        or empty, the stale metadata entry is dropped from the checkpoint and
        the video is downloaded again.

        The progress bar, when given, is advanced by exactly one step for
        every outcome (skipped, completed, failed, error).

        Args:
            video_info: Video information dictionary (must contain 'id' and 'url')
            category: Category for organizing downloads
            check_existing: Whether to check for existing downloads (default: True)
            progress_bar: Optional progress bar for updating download progress

        Returns:
            Path to downloaded video file, or None if failed
        """
        video_id = video_info['id']
        # The title may be missing or None; fall back to the ID so that
        # display strings and the output filename can still be built.
        title = video_info.get('title') or video_id

        def _progress(status: str, advance: bool = True) -> None:
            # Compare against None explicitly: a tqdm bar's truthiness depends
            # on its total, so `if progress_bar:` can silently skip updates.
            if progress_bar is None:
                return
            progress_bar.set_description(f"{status}: {title[:40]}...")
            if advance:
                progress_bar.update(1)

        # Check for duplicates using enhanced detection
        if check_existing:
            existing_metadata = self._is_duplicate_video(video_info, category)
            if existing_metadata:
                # Try to find existing file; a record without a path is
                # treated the same as a record whose file has vanished.
                recorded_path = existing_metadata.get('file_path')
                existing_path = Path(recorded_path) if recorded_path else None
                if existing_path is not None and existing_path.exists() and existing_path.stat().st_size > 0:
                    existing_category = existing_metadata.get('category', 'unknown')
                    _progress("Skipped")

                    # More informative logging based on duplicate type
                    if existing_category == category:
                        logger.info(f"Skipping {video_id}: Already downloaded in {category} ({existing_path.name})")
                    else:
                        logger.info(f"Skipping {video_id}: Already downloaded in {existing_category}, requested for {category} ({existing_path.name})")

                    return existing_path

                logger.warning(f"Video {video_id} marked as downloaded but file not found or empty. Re-downloading...")
                # Remove invalid metadata entry to avoid confusion
                old_key = f"{existing_metadata.get('category', category)}:{video_id}"
                if old_key in self.checkpoint_data.get('video_metadata', {}):
                    del self.checkpoint_data['video_metadata'][old_key]
                    self._save_checkpoint()

        try:
            # Create category subdirectory
            category_dir = self.download_dir / category
            category_dir.mkdir(parents=True, exist_ok=True)

            # Clean filename: drop punctuation, collapse whitespace/dashes to '_'
            safe_title = re.sub(r'[^\w\s-]', '', title)
            safe_title = re.sub(r'[-\s]+', '_', safe_title)
            safe_title = safe_title[:50]  # Limit length

            output_template = str(category_dir / f"{video_id}_{safe_title}.%(ext)s")

            cmd = [
                'yt-dlp',
                video_info['url'],
                '-o', output_template,
                '--no-playlist',
                '--write-info-json',
                '--no-warnings',  # Reduce console noise for progress bar
                '--retries', '5',  # Increased retries
                '--fragment-retries', '5',  # Increased fragment retries
                '--retry-sleep', 'exp=1:10',  # Exponential backoff from 1 to 10 seconds
                '--user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                '--extractor-args', 'youtube:player-client=android',  # Use android client (less restricted)
                '--add-header', 'Accept-Language:en-US,en;q=0.9',  # Add language header
                '--sleep-interval', '1',  # Sleep 1 second between downloads
                '--max-sleep-interval', '5',  # Maximum sleep interval
            ]

            # Update progress bar description (no step yet; the outcome advances it)
            _progress("Downloading", advance=False)

            logger.info(f"Downloading: {title[:60]}...")

            # check=True turns a non-zero yt-dlp exit into CalledProcessError
            subprocess.run(cmd, capture_output=True, text=True, check=True)

            # Find the downloaded file (any video format); the info-json
            # sidecar shares the prefix and is excluded by the suffix filter.
            video_suffixes = {'.mp4', '.mkv', '.webm', '.avi', '.mov'}
            downloaded_files = [
                candidate for candidate in category_dir.glob(f"{video_id}_*")
                if candidate.suffix in video_suffixes
            ]
            if not downloaded_files:
                _progress("Failed")
                logger.warning(f"Could not find downloaded file for {video_id}")
                return None

            video_path = downloaded_files[0]

            # Mark as downloaded with full metadata
            self._mark_video_downloaded(video_info, category, video_path)

            _progress("Completed")

            logger.info(f"Downloaded: {video_path.name} ({video_path.stat().st_size / 1024 / 1024:.1f} MB)")

            # Add delay to avoid rate limiting
            time.sleep(2)  # 2 second delay between downloads

            return video_path

        except subprocess.CalledProcessError as e:
            _progress("Error")
            logger.error(f"Failed to download {video_id}: {e}")
            if e.stderr:
                logger.error(f"Error details: {e.stderr}")
            return None
    
    def calculate_videos_per_category(self, total_videos: int) -> Dict[str, int]:
        """Calculate how many videos to download per category.
        
        Args:
            total_videos: Total number of videos desired
            
        Returns:
            Dictionary mapping categories to number of videos to download
        """
        search_queries = self.get_search_queries()
        
        # Get unique categories
        categories = list(set(query['category'] for query in search_queries))
        num_categories = len(categories)
        
        # Calculate base videos per category
        base_videos_per_category = total_videos // num_categories
        remainder = total_videos % num_categories
        
        # Distribute videos across categories
        category_counts = {}
        for i, category in enumerate(categories):
            # Give extra video to first 'remainder' categories
            extra = 1 if i < remainder else 0
            category_counts[category] = base_videos_per_category + extra
        
        logger.info(f"Distributing {total_videos} videos across {num_categories} categories:")
        for category, count in category_counts.items():
            logger.info(f"  {category}: {count} videos")
        
        return category_counts

    def download_dataset(self, total_videos: Optional[int] = None, max_videos_per_category: Optional[int] = None, resume: bool = False) -> Dict[str, List[Path]]:
        """Download videos for the complete dataset.

        For every category the method searches all of that category's
        queries, de-duplicates and filters the results, then walks the ranked
        candidates until the category target is met or the candidates run
        out. A candidate that fails or is already counted does not use up a
        slot; the next candidate is tried instead.

        The run parameters are written to the checkpoint before downloading,
        and the checkpoint is marked completed afterwards.

        Args:
            total_videos: Total number of videos to download (takes precedence over max_videos_per_category)
            max_videos_per_category: Maximum videos to download per category (used if total_videos not specified;
                falls back to 5 when not given)
            resume: Whether to resume from previous download (default: False)

        Returns:
            Dictionary mapping categories to downloaded video paths
        """
        search_queries = self.get_search_queries()

        # Store parameters in checkpoint for resume validation
        current_params = {
            'total_videos': total_videos,
            'max_videos_per_category': max_videos_per_category,
            'start_time': datetime.now().isoformat()
        }

        # Validate resume parameters match if resuming
        if resume and 'parameters' in self.checkpoint_data:
            prev_params = self.checkpoint_data['parameters']
            if (prev_params.get('total_videos') != total_videos or
                    prev_params.get('max_videos_per_category') != max_videos_per_category):
                logger.warning("Resume parameters don't match previous run. Continuing with current parameters.")

        self.checkpoint_data['parameters'] = current_params
        self._save_checkpoint()

        # Group queries by category, keeping first-appearance order so the
        # processing order matches the order of the query list.
        queries_by_category: Dict[str, List[Dict]] = {}
        for query_info in search_queries:
            queries_by_category.setdefault(query_info['category'], []).append(query_info)

        # Determine video distribution
        if total_videos is not None:
            category_counts = self.calculate_videos_per_category(total_videos)
        else:
            # Use the old method - same videos per category
            max_per_cat = max_videos_per_category or 5
            category_counts = {cat: max_per_cat for cat in queries_by_category}

        downloaded: Dict[str, List[Path]] = {}

        # If resuming, get existing downloads and include them in results
        if resume:
            # Same container formats that download_video accepts; matching
            # only *.mp4 here would miss earlier webm/mkv downloads.
            video_suffixes = {'.mp4', '.mkv', '.webm', '.avi', '.mov'}
            existing_downloads = self.checkpoint_data.get('downloaded_videos', {})
            for category, video_ids in existing_downloads.items():
                category_dir = self.download_dir / category
                category_files = []
                for video_id in video_ids:
                    matches = sorted(
                        candidate for candidate in category_dir.glob(f"{video_id}_*")
                        if candidate.suffix in video_suffixes
                    )
                    if matches:
                        category_files.append(matches[0])
                downloaded[category] = category_files

            logger.info(f"Resuming: Found {sum(len(files) for files in downloaded.values())} existing videos")

        # Calculate total videos needed for progress bar
        total_needed = sum(max(0, target_count - len(downloaded.get(category, [])))
                           for category, target_count in category_counts.items())

        # Create overall progress bar
        with tqdm(total=total_needed, desc="Overall Progress", unit="video", position=0, leave=True) as overall_pbar:

            for category, category_queries in queries_by_category.items():
                target_count = category_counts[category]
                category_downloads = downloaded.get(category, [])
                existing_count = len(category_downloads)
                needed_count = max(0, target_count - existing_count)

                if needed_count == 0:
                    logger.info(f"Category {category}: Target reached ({existing_count}/{target_count} videos)")
                    continue

                logger.info(f"Processing category: {category} (need {needed_count} more, {existing_count}/{target_count} complete)")

                # Search progress for category
                overall_pbar.set_description(f"Searching: {category}")

                # Search for videos across all queries for this category
                all_videos = []
                for cat_query in category_queries:
                    videos = self.search_videos(cat_query['query'], max_results=15)  # Get more to account for already downloaded
                    # Add search query information to each video
                    for rank, video in enumerate(videos, start=1):
                        video['search_query_used'] = cat_query['query']
                        video['search_rank'] = rank
                    all_videos.extend(videos)

                # Remove duplicates based on video ID and URL
                seen_ids = set()
                seen_urls = set()
                unique_videos = []
                for video in all_videos:
                    video_id = video['id']
                    video_url = video['url']

                    # Skip if we've seen this ID or URL
                    if video_id in seen_ids or video_url in seen_urls:
                        continue
                    unique_videos.append(video)
                    seen_ids.add(video_id)
                    seen_urls.add(video_url)

                # Filter suitable videos
                suitable_videos = self.filter_suitable_videos(unique_videos)
                logger.info(f"Found {len(suitable_videos)} suitable videos for {category}")

                if suitable_videos:
                    # Category-specific progress bar. download_video advances
                    # it once per attempt, so it may run past this total when
                    # candidates are skipped or fail; the bar is transient
                    # (leave=False).
                    with tqdm(total=min(needed_count, len(suitable_videos)), desc=f"{category}", unit="video",
                              position=1, leave=False) as category_pbar:

                        # Walk every ranked candidate rather than only the
                        # first `needed_count`: candidates that are already
                        # counted or that fail must not consume a slot.
                        for video in suitable_videos:
                            if len(category_downloads) >= target_count:
                                break

                            downloaded_path = self.download_video(video, category, check_existing=True, progress_bar=category_pbar)
                            if downloaded_path and downloaded_path not in category_downloads:
                                category_downloads.append(downloaded_path)
                                overall_pbar.update(1)

                downloaded[category] = category_downloads

                actual_downloaded = len(category_downloads) - existing_count
                logger.info(f"Downloaded {actual_downloaded} new videos for {category} ({len(category_downloads)}/{target_count} total)")

        print()  # Add spacing after progress bars

        # Update checkpoint with final status
        self.checkpoint_data['last_update_time'] = datetime.now().isoformat()
        self.checkpoint_data['completed'] = True
        self._save_checkpoint()

        # Log summary
        total_downloads = sum(len(paths) for paths in downloaded.values())
        target_total = sum(category_counts.values())

        logger.info(f"Dataset download complete! Downloaded {total_downloads}/{target_total} videos")

        for category, paths in downloaded.items():
            target = category_counts.get(category, 0)
            logger.info(f"  {category}: {len(paths)}/{target} videos")

        return downloaded


def _cli_positive_int(value: str) -> int:
    """argparse ``type`` callable accepting only integers greater than zero."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number <= 0:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def _cli_build_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for the downloader script."""
    parser = argparse.ArgumentParser(
        description='Download dashcam videos from YouTube for dataset creation',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python youtube_downloader.py --total-videos 50
  python youtube_downloader.py --videos-per-category 5
  python youtube_downloader.py --total-videos 100 --output-dir ./my_videos
  python youtube_downloader.py --resume --total-videos 50
  python youtube_downloader.py --status
  python youtube_downloader.py --list-categories
  
The script will automatically distribute videos evenly across all available categories
when using --total-videos, or download the same number per category when using
--videos-per-category. Use --resume to continue interrupted downloads without
re-downloading existing videos.
        """
    )

    parser.add_argument(
        '--total-videos',
        type=_cli_positive_int,
        help='Total number of videos to download (will be distributed evenly across categories)'
    )
    parser.add_argument(
        '--videos-per-category',
        type=_cli_positive_int,
        default=3,
        help='Number of videos to download per category (default: 3, ignored if --total-videos is specified)'
    )
    parser.add_argument(
        '--output-dir',
        type=Path,
        help='Output directory for downloaded videos (default: data/data_prepare/raw_videos)'
    )
    parser.add_argument(
        '--list-categories',
        action='store_true',
        help='List all available video categories and exit'
    )
    parser.add_argument(
        '--resume',
        action='store_true',
        help='Resume from previous interrupted download'
    )
    parser.add_argument(
        '--status',
        action='store_true',
        help='Show current download status and exit'
    )
    # With nargs='?', a bare flag stores the raw const string 'auto' (argparse
    # does not pass const through `type`); an explicit value becomes a Path.
    parser.add_argument(
        '--export-metadata',
        type=Path,
        nargs='?',
        const='auto',
        help='Export video metadata to JSON file (default: video_metadata.json)'
    )
    return parser


def _cli_print_categories(downloader: "YouTubeDownloader") -> None:
    """Print every search category with the number of queries behind it."""
    search_queries = downloader.get_search_queries()
    categories = sorted(set(query['category'] for query in search_queries))

    print(f"\nAvailable categories ({len(categories)} total):")
    print("-" * 40)
    for i, category in enumerate(categories, 1):
        queries_for_cat = [q for q in search_queries if q['category'] == category]
        print(f"{i:2d}. {category} ({len(queries_for_cat)} search queries)")


def _cli_print_status(downloader: "YouTubeDownloader") -> None:
    """Print the checkpoint-based download status report."""
    status = downloader.get_resume_status()

    print("\nDownload Status:")
    print("-" * 50)
    print(f"Checkpoint exists: {status['checkpoint_exists']}")
    print(f"Total downloaded: {status['total_downloaded']} videos")
    print(f"Total size: {status['total_size_mb']:.1f} MB")
    print(f"Categories with downloads: {status['categories_with_downloads']}")
    print(f"Metadata available: {status['has_metadata']}")
    print(f"Last checkpoint: {status['last_checkpoint_time']}")

    # Show duplicate detection status
    if status['duplicate_detection']['detection_ready']:
        dup_stats = status['duplicate_detection']
        print(f"Duplicate detection: {dup_stats['total_unique_video_ids']} unique video IDs tracked")
        if dup_stats['cross_category_duplicates'] > 0:
            print(f"Cross-category duplicates: {dup_stats['cross_category_duplicates']} detected")
        if dup_stats['url_duplicates'] > 0:
            print(f"URL duplicates: {dup_stats['url_duplicates']} detected")

    if status['category_counts']:
        print("\nDownloads per category:")
        print("-" * 30)
        for category, count in sorted(status['category_counts'].items()):
            print(f"  {category:<25}: {count:>3} videos")

        # Show metadata sample if available
        if status['has_metadata']:
            video_metadata = downloader.checkpoint_data.get('video_metadata', {})
            if video_metadata:
                print("\nSample video metadata:")
                print("-" * 30)
                sample = next(iter(video_metadata.values()))
                # Fields may be absent or None; the ',' format spec is only
                # valid for numbers, so format the view count conditionally.
                view_count = sample.get('view_count')
                view_text = f"{view_count:,}" if isinstance(view_count, (int, float)) else 'N/A'
                print(f"  Title: {(sample.get('title') or 'N/A')[:60]}...")
                print(f"  Duration: {sample.get('duration', 'N/A')} seconds")
                print(f"  View Count: {view_text}")
                print(f"  Upload Date: {sample.get('upload_date', 'N/A')}")
                print(f"  File Size: {(sample.get('file_size') or 0) / 1024 / 1024:.1f} MB")

            # Offer to export metadata
            print(f"\nTo export all metadata: python {Path(__file__).name} --export-metadata")
    else:
        print("\nNo downloads found.")

    if status['total_downloaded'] > 0:
        print(f"\nTo resume: python {Path(__file__).name} --resume --total-videos <N>")


def _cli_export_metadata(downloader: "YouTubeDownloader", target) -> int:
    """Export checkpoint metadata; return the process exit code (0 ok, 1 nothing to export)."""
    video_metadata = downloader.checkpoint_data.get('video_metadata', {})
    if not video_metadata:
        print("\nNo video metadata found to export.")
        print("Download some videos first, then try again.")
        return 1

    # The bare flag yields the sentinel 'auto': let the downloader pick its default file
    output_file = None if target == 'auto' else target

    exported_file = downloader.export_metadata(output_file)

    print("\nMetadata Export Complete:")
    print("-" * 40)
    print(f"Videos: {len(video_metadata)}")
    print(f"File: {exported_file}")
    print(f"Size: {exported_file.stat().st_size / 1024:.1f} KB")

    return 0


def main():
    """Main function for downloading YouTube dashcam videos.

    Parses the command line, then either runs one of the informational
    commands (--list-categories, --status, --export-metadata) or downloads
    the dataset.

    Returns:
        Process exit code: 0 on success, 1 on error, 130 when interrupted
        with Ctrl-C. Invalid arguments exit through argparse (code 2).
    """
    args = _cli_build_parser().parse_args()

    # Default configuration
    project_root = Path(__file__).parent.parent.parent
    default_download_dir = project_root / "data" / "data_prepare" / "raw_videos"
    download_dir = args.output_dir or default_download_dir

    print("=" * 60)
    print("YOUTUBE DASHCAM VIDEO DOWNLOADER")
    print("=" * 60)

    # Create downloader early for status/categories commands. Construction
    # raises RuntimeError when yt-dlp is unavailable; report it as a normal
    # CLI error instead of a traceback.
    try:
        downloader = YouTubeDownloader(download_dir)
    except (RuntimeError, OSError) as e:
        print(f"Error: {e}")
        return 1

    # Handle list categories command
    if args.list_categories:
        _cli_print_categories(downloader)
        return 0

    # Handle status command
    if args.status:
        _cli_print_status(downloader)
        return 0

    # Handle export metadata command
    if args.export_metadata is not None:
        return _cli_export_metadata(downloader, args.export_metadata)

    print("\nThis tool downloads dashcam videos from YouTube for dataset creation.")
    print("Make sure you have yt-dlp installed: uv add yt-dlp")
    print(f"Videos will be saved to: {download_dir}")

    # Handle resume information
    if args.resume:
        status = downloader.get_resume_status()
        if status['total_downloaded'] > 0:
            print(f"\nResuming download: {status['total_downloaded']} videos already downloaded")
        else:
            print("\nNo previous downloads found, starting fresh")

    # Determine download parameters. `is not None` (not truthiness) decides
    # whether the total was given; the parser already rejects values <= 0.
    use_total = args.total_videos is not None
    if use_total:
        print(f"\nTarget: {args.total_videos} videos total (distributed across categories)")

        # Show category distribution preview
        category_counts = downloader.calculate_videos_per_category(args.total_videos)
        print("\nVideo distribution preview:")
        print("-" * 40)
        for category, count in sorted(category_counts.items()):
            print(f"{category}: {count} videos")
    else:
        print(f"\nTarget: {args.videos_per_category} videos per category")

    # Download dataset
    try:
        print("\nStarting download process...")
        downloaded = downloader.download_dataset(
            total_videos=args.total_videos,
            max_videos_per_category=None if use_total else args.videos_per_category,
            resume=args.resume
        )

        print("\nDownload Summary:")
        print("-" * 40)
        for category, paths in sorted(downloaded.items()):
            print(f"{category}: {len(paths)} videos")

        total = sum(len(paths) for paths in downloaded.values())
        print(f"\nTotal downloaded: {total} videos")
        print(f"Saved to: {download_dir}")

        print("\nNext steps:")
        print("1. Review downloaded videos for quality and relevance")
        print("2. Manually analyze videos to identify safety scenarios")
        print("3. Create evaluation dataset by selecting best clips")
        print("4. Use selected clips with DriveGuard evaluation pipeline")

    except KeyboardInterrupt:
        # Ctrl-C is an expected way to stop a long download run.
        print("\nInterrupted by user. Re-run with --resume to continue.")
        return 130
    except Exception as e:
        # Top-level boundary: report any failure as a one-line error.
        print(f"Error: {e}")
        return 1

    return 0


if __name__ == "__main__":
    # Use sys.exit rather than the bare `exit` helper: `exit` is injected by
    # the `site` module and is not available under `python -S` or in frozen apps.
    sys.exit(main())