"""DriveGuard Clip Extraction Tool - Extract marked clips from reviewed videos.

This script extracts video clips marked in step 2 (video_reviewer) and saves them
to the final dataset directory with meaningful filenames and comprehensive metadata.
"""

import sys
import json
import subprocess
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple
from datetime import datetime
import argparse
import re
from tqdm import tqdm

# Add project root to path
root = Path(__file__).parent.parent.parent
sys.path.append(str(root))

from src.utils.log import logger


class ClipExtractor:
    """Extracts video clips from marked timestamps and saves with metadata."""
    
    def __init__(self, output_dir: Optional[Path] = None):
        """Set up every path the extractor works with and verify ffmpeg.

        Args:
            output_dir: Directory that receives the extracted clips. When it
                is omitted (or falsy) clips go to ``data/dashcam`` under the
                project root.

        Raises:
            RuntimeError: Propagated from ``_check_ffmpeg`` when ffmpeg cannot
                be run.
        """
        # The project root is three ``.parent`` steps above this file.
        project_root = Path(__file__).parent.parent.parent
        prepare_dir = project_root / "data" / "data_prepare"
        clips_dir = output_dir if output_dir else project_root / "data" / "dashcam"
        meta_dir = clips_dir / "metadata"

        self.root = project_root
        self.data_prepare_dir = prepare_dir
        self.output_dir = clips_dir

        # Inputs produced by the earlier pipeline steps
        self.review_progress_dir = prepare_dir / "review_progress"
        self.video_metadata_file = prepare_dir / "raw_videos" / ".download_checkpoint.json"

        # Outputs: all bookkeeping files live in a dedicated metadata folder
        self.metadata_dir = meta_dir
        self.extraction_metadata_file = meta_dir / "extraction_metadata.json"
        self.extraction_log_file = meta_dir / "extraction_log.json"

        # Make sure both output folders exist before anything is written
        for directory in (clips_dir, meta_dir):
            directory.mkdir(parents=True, exist_ok=True)

        # Fail early if the external tool is missing
        self._check_ffmpeg()
    
    def _check_ffmpeg(self):
        """Verify that the ``ffmpeg`` executable can be run.

        Runs ``ffmpeg -version`` and logs the outcome.

        Raises:
            RuntimeError: If the executable cannot be found or exits with a
                non-zero status. The underlying error is attached as the
                explicit cause so the traceback shows why the check failed.
        """
        try:
            subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError) as exc:
            logger.error("ffmpeg not found. Install with: brew install ffmpeg (macOS) or apt install ffmpeg (Linux)")
            raise RuntimeError("ffmpeg is required for video clip extraction") from exc
        logger.info("ffmpeg is available")
    
    def get_next_sequence_number(self) -> int:
        """Return the sequence number to give to the next extracted clip.

        Scans ``self.output_dir`` for files named ``<digits>_*.mp4`` and
        returns the highest leading number plus one. Numbers freed by deleted
        clips are not reused, so gaps below the maximum stay as they are.

        Returns:
            Next sequence number (1 if the directory is missing or contains
            no numbered clips).
        """
        if not self.output_dir.exists():
            return 1

        # Filenames are written with ``{seq:04d}``, which is a *minimum*
        # width: sequence 10000 and above use five or more digits. Accept any
        # run of at least four digits so those files still count.
        pattern = re.compile(r'^(\d{4,})_.*\.mp4$')
        matches = (pattern.match(path.name) for path in self.output_dir.glob("*.mp4"))
        existing_sequences = {int(match.group(1)) for match in matches if match}

        if not existing_sequences:
            return 1

        next_seq = max(existing_sequences) + 1
        logger.info(f"Next sequence number: {next_seq} (found {len(existing_sequences)} existing clips)")
        return next_seq
    
    def load_individual_video_markings(self) -> Dict:
        """Collect clip markings from the per-video review files.

        Reads every ``*_review_markings.json`` file found in
        ``<data_prepare_dir>/review_progress``. A file that cannot be loaded
        or has no ``video_info.video_id`` is skipped with a warning; a file
        whose ``clip_markings`` is empty contributes nothing.

        Returns:
            ``{"clips_by_video": {}}`` when the directory or the marking
            files are missing. Otherwise a dict with ``export_time``,
            ``total_clips``, ``total_videos_with_clips``, ``clips_by_video``
            (video id mapped to its list of markings) and ``source``.
        """
        markings_dir = self.data_prepare_dir / "review_progress"

        if not markings_dir.exists():
            logger.error(f"Review progress directory not found: {markings_dir}")
            return {"clips_by_video": {}}

        candidates = list(markings_dir.glob("*_review_markings.json"))
        if not candidates:
            logger.warning(f"No individual marking files found in {markings_dir}")
            return {"clips_by_video": {}}

        collected = {}
        clip_count = 0

        for path in candidates:
            try:
                with open(path, 'r', encoding='utf-8') as handle:
                    payload = json.load(handle)

                video_id = payload.get('video_info', {}).get('video_id')
                if not video_id:
                    logger.warning(f"No video_id found in {path.name}, skipping")
                    continue

                markings = payload.get('clip_markings', [])
                if not markings:
                    # Reviewed video without any marked clip: nothing to add
                    continue

                collected[video_id] = markings
                clip_count += len(markings)
                logger.debug(f"Loaded {len(markings)} clips for video {video_id}")

            except Exception as e:
                # One unreadable file must not stop the remaining ones
                logger.warning(f"Failed to load markings from {path.name}: {e}")
                continue

        logger.info(f"Loaded {clip_count} clips from {len(collected)} individual video marking files")

        summary = {'export_time': datetime.now().isoformat()}
        summary['total_clips'] = clip_count
        summary['total_videos_with_clips'] = len(collected)
        summary['clips_by_video'] = collected
        summary['source'] = 'individual_video_files'
        return summary
    
    def load_clip_markings(self) -> Dict:
        """Return the step 2 clip markings, read from the per-video files.

        Returns:
            The dict produced by ``load_individual_video_markings`` when it
            contains at least one video with clips; otherwise
            ``{"clips_by_video": {}}`` after logging how to produce markings.
        """
        markings = self.load_individual_video_markings()

        if not markings.get('clips_by_video'):
            # Nothing usable was found: tell the user which step is missing
            logger.error(f"No clip markings found in {self.review_progress_dir}")
            logger.error("Run step 2 (video reviewer) first to mark clips")
            return {"clips_by_video": {}}

        return markings
    
    def load_video_metadata(self) -> Dict:
        """Load the video metadata checkpoint written by step 1.

        The file is read as UTF-8, matching how the review marking files are
        read, so non-ASCII text does not depend on the platform's default
        encoding.

        Returns:
            The parsed JSON document, or ``{"video_metadata": {}}`` when the
            file is missing or cannot be loaded (the problem is logged).
        """
        if not self.video_metadata_file.exists():
            logger.error(f"Video metadata file not found: {self.video_metadata_file}")
            logger.error("Run step 1 (youtube downloader) first to download videos")
            return {"video_metadata": {}}

        try:
            with open(self.video_metadata_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Also validates the top-level shape: a non-dict document raises
            # here and is reported through the handler below.
            metadata_count = len(data.get('video_metadata', {}))
            logger.info(f"Loaded metadata for {metadata_count} videos")
            return data
        except Exception as e:
            # Best effort: extraction can proceed without step 1 metadata
            logger.error(f"Failed to load video metadata: {e}")
            return {"video_metadata": {}}
    
    def merge_clip_with_metadata(self, clip_data: Dict, video_metadata: Dict) -> Dict:
        """Pair a clip marking with the step 1 metadata of its source video.

        Args:
            clip_data: Clip marking from step 2; must contain ``video_id``.
            video_metadata: Step 1 document whose ``video_metadata`` mapping
                holds one metadata dict per video.

        Returns:
            Dict with ``clip_data`` (the marking as given), ``video_metadata``
            (the first entry whose ``video_id`` equals the clip's, or ``{}``)
            and ``merged_at`` (ISO timestamp).
        """
        wanted_id = clip_data['video_id']

        # The mapping keys may carry a category prefix, so entries are matched
        # on their stored ``video_id`` field instead of on the key.
        entries = video_metadata.get('video_metadata', {}).values()
        found = next((entry for entry in entries if entry.get('video_id') == wanted_id), None)

        merged = {'clip_data': clip_data}
        merged['video_metadata'] = found if found else {}
        merged['merged_at'] = datetime.now().isoformat()
        return merged
    
    def generate_clip_filename(self, sequence_num: int, merged_data: Dict) -> str:
        """Build the base filename (without extension) for an extracted clip.

        The name has the form
        ``<seq>_<category>_<event_type>_<video_id>_<start>`` where ``seq`` and
        ``start`` are zero-padded to at least four digits and ``start`` is the
        clip start time truncated to whole seconds.

        The category comes from the video metadata when it provides one and
        from the clip's ``video_category`` otherwise. Missing, ``None`` or
        empty components fall back to ``unknown`` (``unmarked`` for the event
        type), so the name never contains an empty segment.

        Args:
            sequence_num: Sequential number for the clip
            merged_data: Combined clip and video metadata, as returned by
                ``merge_clip_with_metadata``

        Returns:
            Generated filename (without extension)
        """
        clip = merged_data['clip_data']
        video_meta = merged_data['video_metadata']

        def clean_for_filename(value: Any, fallback: str) -> str:
            # Keep only characters that are safe in filenames; if nothing is
            # left after cleaning, use the fallback instead of an empty part.
            cleaned = re.sub(r'[^a-zA-Z0-9_-]', '_', str(value)).strip('_')
            return cleaned or fallback

        # Category: video metadata wins when it has a usable value
        category = clip.get('video_category') or 'unknown'
        if video_meta:
            category = video_meta.get('category') or category

        # Event type from user marking
        event_type = clip.get('event_type') or 'unmarked'

        video_id = clip.get('video_id') or 'unknown'

        # Whole seconds only, to avoid a decimal point in the filename
        start_time = int(clip.get('start_time') or 0)

        parts = [
            f"{sequence_num:04d}",
            clean_for_filename(category, 'unknown'),
            clean_for_filename(event_type, 'unmarked'),
            clean_for_filename(video_id, 'unknown'),
            f"{start_time:04d}",
        ]
        return "_".join(parts)
    
    def extract_video_clip(self, source_path: str, start_time: float, duration: float, 
                          output_path: Path) -> Tuple[bool, str]:
        """Cut a clip out of a source video with ffmpeg, copying the streams.

        Args:
            source_path: Path to source video file
            start_time: Start time in seconds
            duration: Duration in seconds
            output_path: Output file path (overwritten if it exists)

        Returns:
            Tuple of (success: bool, error_message: str). The message is empty
            on success. Failures are reported through the tuple; the method
            does not raise for ffmpeg or filesystem errors.
        """
        try:
            # Verify source file exists
            if not Path(source_path).exists():
                return False, f"Source video not found: {source_path}"

            # -ss is placed before -i, so ffmpeg seeks in the input. Because
            # the streams are copied rather than re-encoded, the cut cannot
            # start at an arbitrary frame: it starts at the seek point ffmpeg
            # lands on, which can be slightly before start_time. This is
            # fast but not frame-accurate.
            cmd = [
                'ffmpeg',
                '-ss', str(start_time),  # Input seeking
                '-i', str(source_path),
                '-t', str(duration),
                '-c', 'copy',  # Copy streams without re-encoding for speed
                '-y',  # Overwrite output file
                str(output_path)
            ]

            # check=True turns a non-zero exit status into CalledProcessError
            subprocess.run(cmd, capture_output=True, text=True, check=True)

            # ffmpeg can exit cleanly without producing usable output
            if output_path.exists() and output_path.stat().st_size > 0:
                return True, ""
            return False, "Output file was not created or is empty"

        except subprocess.CalledProcessError as e:
            error_msg = f"ffmpeg error: {e.stderr}" if e.stderr else str(e)
            return False, error_msg
        except Exception as e:
            return False, f"Unexpected error: {str(e)}"
    
    def is_clip_already_extracted(self, clip_id: str, clip_data: Optional[Dict] = None) -> Optional[Path]:
        """Look for a previously extracted file that corresponds to a clip.

        Lookup order:

        1. The extraction log. Entries with status ``success`` or
           ``metadata_updated`` whose output file still exists are matched by
           ``clip_id`` or, when ``clip_data`` is given, by content (same
           ``video_id`` and ``duration``, start times less than 0.5 s apart).
        2. Files in the output directory whose name contains ``clip_id``.
        3. Files whose name ends in ``<video_id>_<start>.mp4`` with the start
           time truncated to whole seconds.

        ``temp_*.mp4`` files are never reported: they are the intermediate
        files written during extraction, so one left behind by an interrupted
        run is not a finished clip. An empty ``clip_id`` or ``video_id`` is
        not used for matching because it would match unrelated files.

        Args:
            clip_id: Unique clip identifier
            clip_data: Optional clip data for content-based deduplication

        Returns:
            Path to existing file if found, None otherwise
        """
        reusable_statuses = ('success', 'metadata_updated')
        video_id = (clip_data.get('video_id') or '') if clip_data else ''

        # Check extraction log first for both clip_id and content matches
        if self.extraction_log_file.exists():
            try:
                with open(self.extraction_log_file, 'r') as f:
                    log_data = json.load(f)

                for entry in log_data.get('extracted_clips', []):
                    if entry.get('status') not in reusable_statuses:
                        continue

                    # Failed entries store '' here; Path('') would resolve to
                    # the current directory, which always exists.
                    recorded_path = entry.get('output_path')
                    if not recorded_path:
                        continue
                    output_path = Path(recorded_path)
                    if not output_path.is_file():
                        continue

                    # Check by clip_id first
                    if clip_id and entry.get('clip_id') == clip_id:
                        return output_path

                    # Check for content-based duplicates
                    if video_id:
                        logged_clip = entry.get('clip_data') or {}
                        start_gap = abs((logged_clip.get('start_time') or 0) - (clip_data.get('start_time') or 0))
                        if (logged_clip.get('video_id') == video_id
                                and start_gap < 0.5
                                and logged_clip.get('duration') == clip_data.get('duration')):
                            logger.info(f"Found content duplicate: {clip_id} matches existing {entry.get('clip_id')}")
                            return output_path
            except Exception as exc:
                # Best effort: an unreadable log must not block extraction,
                # but say so before falling back to the directory scan.
                logger.warning(f"Could not read extraction log {self.extraction_log_file}: {exc}")

        def is_temp_file(path: Path) -> bool:
            return path.name.startswith("temp_")

        # Fallback: scan output directory for files containing clip_id
        if clip_id:
            for video_file in self.output_dir.glob("*.mp4"):
                if not is_temp_file(video_file) and clip_id in video_file.name:
                    return video_file

        # Additional fallback: check for files with similar content pattern
        if video_id:
            start_time = int(clip_data.get('start_time') or 0)
            pattern = f"*{video_id}_{start_time:04d}.mp4"
            matching_files = [p for p in self.output_dir.glob(pattern) if not is_temp_file(p)]
            if matching_files:
                logger.info(f"Found potential content duplicate by pattern: {pattern}")
                return matching_files[0]

        return None
    
    def save_video_clips_metadata(self, extraction_results: List[Dict]) -> bool:
        """Write one ``<video_id>_clips_metadata.json`` file per source video.

        Results are grouped by the ``video_id`` of their clip data
        (``unknown`` when absent); each file lists the clips of that video
        together with the video information taken from the first result of
        the group.

        Args:
            extraction_results: List of all extraction results

        Returns:
            True if every per-video file was written, False otherwise
            (failures are logged, not raised).
        """
        try:
            grouped = {}
            for result in extraction_results:
                clip_data = result.get('clip_data', {})
                video_id = clip_data.get('video_id', 'unknown')

                group = grouped.get(video_id)
                if group is None:
                    # First clip seen for this video: create its container
                    group = {
                        'video_info': {
                            'video_id': video_id,
                            'video_path': clip_data.get('video_path', ''),
                            'video_metadata': result.get('video_metadata', {}),
                            'total_clips': 0,
                            'last_updated': datetime.now().isoformat()
                        },
                        'clips': []
                    }
                    grouped[video_id] = group

                group['clips'].append({
                    'sequence_number': result.get('sequence_number'),
                    'clip_id': result.get('clip_id'),
                    'filename': result.get('filename'),
                    'output_path': result.get('output_path'),
                    'file_size': result.get('file_size', 0),
                    'status': result.get('status'),
                    'processed_at': result.get('processed_at'),
                    'extraction_message': result.get('message', ''),
                    'clip_data': clip_data
                })
                group['video_info']['total_clips'] += 1

            # Write the files; a failure for one video does not stop the rest
            saved = 0
            for video_id, group in grouped.items():
                try:
                    # Make the video id safe to use inside a filename
                    safe_video_id = re.sub(r'[^a-zA-Z0-9_-]', '_', video_id)
                    metadata_filename = f"{safe_video_id}_clips_metadata.json"

                    document = {
                        'metadata_info': {
                            'created_with': 'DriveGuard s3_extract_clips.py',
                            'metadata_version': '1.0',
                            'created_at': datetime.now().isoformat()
                        },
                        'video_info': group['video_info'],
                        'clips': group['clips']
                    }

                    with open(self.metadata_dir / metadata_filename, 'w', encoding='utf-8') as f:
                        json.dump(document, f, indent=2, default=str)

                    logger.info(f"Saved video metadata: {metadata_filename} ({len(group['clips'])} clips)")
                    saved += 1

                except Exception as e:
                    logger.error(f"Failed to save metadata for video {video_id}: {e}")

            logger.info(f"Saved metadata for {saved}/{len(grouped)} videos")
            return saved == len(grouped)

        except Exception as e:
            logger.error(f"Failed to save video clips metadata: {e}")
            return False
    
    def save_extraction_log(self, extraction_results: List[Dict]):
        """Write the extraction log for this run to ``extraction_log_file``.

        The log holds a run summary (timestamp plus counts of ``success``,
        ``failed`` and ``skipped`` results) followed by every result.
        A write failure is logged instead of raised.

        Args:
            extraction_results: List of extraction result dictionaries; each
                must have a ``status`` key.
        """
        def count_with_status(wanted: str) -> int:
            return sum(1 for item in extraction_results if item['status'] == wanted)

        run_summary = {
            'timestamp': datetime.now().isoformat(),
            'total_clips_processed': len(extraction_results),
            'successful_extractions': count_with_status('success'),
            'failed_extractions': count_with_status('failed'),
            'skipped_extractions': count_with_status('skipped')
        }
        log_data = {'extraction_run': run_summary, 'extracted_clips': extraction_results}

        try:
            with open(self.extraction_log_file, 'w') as f:
                json.dump(log_data, f, indent=2, default=str)
        except Exception as e:
            logger.error(f"Failed to save extraction log: {e}")
        else:
            logger.info(f"Extraction log saved: {self.extraction_log_file}")
    
    def save_extraction_metadata(self, extraction_results: List[Dict], clip_markings: Dict, video_metadata: Dict):
        """Write the aggregate metadata document for the extracted clips.

        Only results with status ``success`` or ``metadata_updated`` are
        listed and used for the statistics. A write failure is logged instead
        of raised.

        Args:
            extraction_results: List of extraction result dictionaries
            clip_markings: Original clip markings data; supplies the marked
                clip and video totals.
            video_metadata: Original video metadata (accepted for interface
                symmetry; not read here).
        """
        kept_statuses = ('success', 'metadata_updated')
        usable_clips = [item for item in extraction_results if item['status'] in kept_statuses]

        if extraction_results:
            success_rate = len(usable_clips) / len(extraction_results)
        else:
            success_rate = 0

        extraction_info = {
            'extracted_at': datetime.now().isoformat(),
            'total_clips_marked': clip_markings.get('total_clips', 0),
            'total_clips_extracted': len(usable_clips),
            'total_videos_with_clips': clip_markings.get('total_videos_with_clips', 0),
            'extraction_success_rate': success_rate,
            'output_directory': str(self.output_dir),
            'source_clips_directory': str(self.review_progress_dir),
            'source_metadata_file': str(self.video_metadata_file)
        }
        document = {
            'extraction_info': extraction_info,
            'clips': usable_clips,
            'statistics': self._calculate_extraction_statistics(usable_clips)
        }

        try:
            with open(self.extraction_metadata_file, 'w') as f:
                json.dump(document, f, indent=2, default=str)
        except Exception as e:
            logger.error(f"Failed to save extraction metadata: {e}")
        else:
            logger.info(f"Extraction metadata saved: {self.extraction_metadata_file}")
    
    def _calculate_extraction_statistics(self, successful_clips: List[Dict]) -> Dict:
        """Summarise a list of successfully extracted clips.

        Args:
            successful_clips: Extraction results; each may carry a
                ``clip_data`` dict and a ``file_size``.

        Returns:
            ``{}`` for an empty list. Otherwise counts per category, event
            type and severity level (defaults ``unknown``, ``unmarked`` and
            level 3 when a clip lacks the field), plus total and average
            duration and file size, and the total size in MiB.
        """
        if not successful_clips:
            return {}

        def bump(counter: Dict, key) -> None:
            counter[key] = counter.get(key, 0) + 1

        by_category = {}
        by_event_type = {}
        by_severity = {}
        duration_sum = 0
        size_sum = 0

        for entry in successful_clips:
            details = entry.get('clip_data', {})

            bump(by_category, details.get('video_category', 'unknown'))
            bump(by_event_type, details.get('event_type', 'unmarked'))
            bump(by_severity, f"level_{details.get('severity_level', 3)}")

            duration_sum += details.get('duration', 0)
            size_sum += entry.get('file_size', 0)

        # The list is known to be non-empty here, so the averages are safe
        clip_count = len(successful_clips)

        return {
            'clips_by_category': by_category,
            'clips_by_event_type': by_event_type,
            'clips_by_severity': by_severity,
            'total_duration_seconds': duration_sum,
            'average_duration_seconds': duration_sum / clip_count,
            'total_file_size_bytes': size_sum,
            'average_file_size_bytes': size_sum / clip_count,
            'total_file_size_mb': size_sum / (1024 * 1024)
        }
    
    def extract_all_clips(self, force: bool = False, categories: Optional[List[str]] = None) -> Dict[str, Any]:
        """Extract all marked clips to the output directory.

        Each clip is written to a temporary ``temp_<clip_id>.mp4`` file first
        and only receives its final name once ffmpeg has succeeded, so failed
        extractions never consume a sequence number.

        Without ``force``, a clip that is already present is not extracted
        again and is reported with status ``metadata_updated``. With
        ``force``, a clip whose previous numbered output is found is
        re-extracted under the same sequence number and replaces the old file
        instead of being added as a duplicate under a new number.

        After the loop the extraction log, the aggregate metadata and the
        per-video metadata files are written.

        Args:
            force: If True, re-extract even if files already exist
            categories: If provided, only extract clips whose
                ``video_category`` is in this list

        Returns:
            ``{'status': 'no_clips', ...}`` or
            ``{'status': 'no_matching_clips', ...}`` when there is nothing to
            do; otherwise a dict with ``status='complete'``, the per-status
            counts, ``output_directory`` and the full ``results`` list.
        """
        logger.info("Starting clip extraction process")

        # Load data
        clip_markings = self.load_clip_markings()
        video_metadata = self.load_video_metadata()

        if not clip_markings.get('clips_by_video'):
            logger.warning("No clips found to extract")
            return {'status': 'no_clips', 'message': 'No clips found in clip markings file'}

        # Collect all clips, honouring the optional category filter
        all_clips = []
        for clips in clip_markings['clips_by_video'].values():
            for clip in clips:
                if categories and clip.get('video_category', '') not in categories:
                    continue
                all_clips.append(self.merge_clip_with_metadata(clip, video_metadata))

        logger.info(f"Found {len(all_clips)} clips to process")

        if not all_clips:
            logger.warning("No clips match the specified criteria")
            return {'status': 'no_matching_clips', 'message': 'No clips match the specified criteria'}

        # Leading sequence number of a finished clip. ``{seq:04d}`` is a
        # minimum width, so five or more digits must be accepted as well.
        seq_pattern = re.compile(r'^(\d{4,})_.*\.mp4$')

        def build_result(merged: Dict, seq_num: int, filename: str, output_path: str,
                         status: str, message: str, file_size: int) -> Dict:
            # Single place that defines the shape of a result record
            return {
                'sequence_number': seq_num,
                'clip_id': merged['clip_data']['clip_id'],
                'filename': filename,
                'output_path': output_path,
                'clip_data': merged['clip_data'],
                'video_metadata': merged['video_metadata'],
                'processed_at': datetime.now().isoformat(),
                'status': status,
                'message': message,
                'file_size': file_size
            }

        # Process clips with progress bar - extract to temp, assign sequence on success
        extraction_results = []

        with tqdm(total=len(all_clips), desc="Extracting clips", unit="clip") as pbar:
            for i, merged_data in enumerate(all_clips):
                clip_data = merged_data['clip_data']
                clip_id = clip_data['clip_id']

                pbar.set_description(f"Processing clip {i+1}/{len(all_clips)}")

                # Looked up even with force, so a re-extraction can replace
                # the previous output rather than duplicate it.
                existing_file = self.is_clip_already_extracted(clip_id, clip_data)
                existing_match = seq_pattern.match(existing_file.name) if existing_file else None

                if existing_file and not force:
                    seq_num = int(existing_match.group(1)) if existing_match else 0
                    extraction_results.append(build_result(
                        merged_data,
                        seq_num=seq_num,
                        filename=existing_file.name,
                        output_path=str(existing_file),
                        status='metadata_updated',
                        message=f'Clip exists, metadata updated: {existing_file.name}',
                        file_size=existing_file.stat().st_size
                    ))
                    logger.info(f"Updated metadata for existing clip: {existing_file.name}")
                    pbar.update(1)
                    continue

                # Extract to temporary file first
                temp_path = self.output_dir / f"temp_{clip_id}.mp4"

                success, error_message = self.extract_video_clip(
                    source_path=clip_data['video_path'],
                    start_time=clip_data['start_time'],
                    duration=clip_data['duration'],
                    output_path=temp_path
                )

                if success:
                    # Reaching this point with a numbered existing file means
                    # force is set: keep that clip's number. Otherwise take
                    # the next free one.
                    if existing_match:
                        seq_num = int(existing_match.group(1))
                    else:
                        seq_num = self.get_next_sequence_number()
                    filename = self.generate_clip_filename(seq_num, merged_data)
                    final_path = self.output_dir / f"{filename}.mp4"

                    # replace() overwrites an existing target on every
                    # platform, which a forced re-extraction relies on.
                    temp_path.replace(final_path)

                    # If the regenerated name differs from the old one (for
                    # example the event type was edited), drop the stale file.
                    # samefile() guards against deleting the new clip when
                    # both paths point at the same file.
                    if existing_match and existing_file.exists() and not existing_file.samefile(final_path):
                        existing_file.unlink()

                    result = build_result(
                        merged_data,
                        seq_num=seq_num,
                        filename=f"{filename}.mp4",
                        output_path=str(final_path),
                        status='success',
                        message='Successfully extracted',
                        file_size=final_path.stat().st_size
                    )
                    logger.info(f"Extracted: {filename}.mp4 (sequence {seq_num:04d})")
                else:
                    # Clean up temp file on failure
                    if temp_path.exists():
                        temp_path.unlink()

                    # No sequence number is assigned to a failed extraction
                    result = build_result(
                        merged_data,
                        seq_num=0,
                        filename='',
                        output_path='',
                        status='failed',
                        message=error_message,
                        file_size=0
                    )
                    logger.error(f"Failed to extract clip {clip_id}: {error_message}")

                extraction_results.append(result)
                pbar.update(1)

        # Save results
        self.save_extraction_log(extraction_results)
        self.save_extraction_metadata(extraction_results, clip_markings, video_metadata)

        # Save individual video metadata files
        self.save_video_clips_metadata(extraction_results)

        def count_with_status(wanted: str) -> int:
            return sum(1 for r in extraction_results if r['status'] == wanted)

        # Generate summary ('skipped' is kept for consumers of the summary,
        # although this method does not currently produce that status)
        successful = count_with_status('success')
        failed = count_with_status('failed')
        skipped = count_with_status('skipped')
        metadata_updated = count_with_status('metadata_updated')

        logger.info(f"Extraction complete: {successful} successful, {failed} failed, {skipped} skipped, {metadata_updated} metadata updated")

        return {
            'status': 'complete',
            'total_processed': len(extraction_results),
            'successful': successful,
            'failed': failed,
            'skipped': skipped,
            'metadata_updated': metadata_updated,
            'output_directory': str(self.output_dir),
            'results': extraction_results
        }
    
    def get_extraction_status(self) -> Dict[str, Any]:
        """Report what exists on disk for the extraction step.

        Returns:
            Dict with existence flags for the input and output locations, the
            number and total size (MiB) of ``*.mp4`` files in the output
            directory, the marked clip totals when the review directory
            exists, and the stored ``extraction_info`` and ``statistics``
            when the extraction metadata file exists. If that file cannot be
            loaded, ``metadata_load_error`` holds the error text instead.
        """
        metadata_present = self.extraction_metadata_file.exists()
        review_dir_present = self.review_progress_dir.exists()

        status = {
            'review_progress_dir_exists': review_dir_present,
            'video_metadata_file_exists': self.video_metadata_file.exists(),
            'output_directory_exists': self.output_dir.exists(),
            'extraction_log_exists': self.extraction_log_file.exists(),
            'extraction_metadata_exists': metadata_present
        }

        # Clips currently in the output directory
        clip_files = list(self.output_dir.glob("*.mp4"))
        total_bytes = 0
        for clip_file in clip_files:
            total_bytes += clip_file.stat().st_size
        status['existing_clips_count'] = len(clip_files)
        status['existing_clips_size_mb'] = total_bytes / (1024 * 1024)

        # Totals from the step 2 markings
        if review_dir_present:
            markings = self.load_clip_markings()
            status['total_clips_marked'] = markings.get('total_clips', 0)
            status['total_videos_with_clips'] = markings.get('total_videos_with_clips', 0)

        # Summary stored by the last extraction run
        if metadata_present:
            try:
                with open(self.extraction_metadata_file, 'r') as f:
                    stored = json.load(f)
                status['last_extraction'] = stored.get('extraction_info', {})
                status['extraction_statistics'] = stored.get('statistics', {})
            except Exception as e:
                status['metadata_load_error'] = str(e)

        return status


def main() -> int:
    """Run the clip extraction command-line interface.

    Parses the command-line arguments, then either prints the current
    extraction status (``--status``) or runs the extraction and prints a
    summary of the results.

    Returns:
        Process exit code. 0 when the status report was shown, when at least
        one clip was extracted, when a completed run had no failures (for
        example every clip was skipped because it already exists), or when
        there were no (matching) clips to extract; 1 otherwise, including
        when the extraction raised an exception.
    """
    parser = argparse.ArgumentParser(
        description='Extract video clips marked in step 2 to create final dataset',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python s3_extract_clips.py                    # Extract all marked clips
  python s3_extract_clips.py --force            # Re-extract all clips (overwrite existing)
  python s3_extract_clips.py --status           # Show extraction status
  python s3_extract_clips.py --categories accidents_general,road_rage  # Extract specific categories
  python s3_extract_clips.py --output-dir ./my_clips  # Custom output directory

The script will:
1. Load clip markings from step 2 (video reviewer)
2. Load video metadata from step 1 (youtube downloader)
3. Extract clips with meaningful filenames
4. Save comprehensive metadata for integration with evaluation pipeline
        """
    )

    parser.add_argument(
        '--output-dir',
        type=Path,
        help='Output directory for extracted clips (default: data/dashcam/)'
    )
    parser.add_argument(
        '--force',
        action='store_true',
        help='Force re-extraction of existing clips'
    )
    parser.add_argument(
        '--status',
        action='store_true',
        help='Show current extraction status and exit'
    )
    parser.add_argument(
        '--categories',
        type=str,
        help='Comma-separated list of categories to extract (e.g., accidents_general,road_rage)'
    )

    args = parser.parse_args()

    print("=" * 60)
    print("DRIVEGUARD CLIP EXTRACTION TOOL")
    print("=" * 60)

    # Initialize extractor
    extractor = ClipExtractor(output_dir=args.output_dir)

    # Handle status command: report and exit without extracting anything.
    if args.status:
        status = extractor.get_extraction_status()

        print("\nExtraction Status:")
        print("-" * 40)
        print(f"Review progress directory exists: {status['review_progress_dir_exists']}")
        print(f"Video metadata file exists: {status['video_metadata_file_exists']}")
        print(f"Output directory exists: {status['output_directory_exists']}")
        print(f"Existing clips in output: {status['existing_clips_count']}")
        print(f"Existing clips size: {status['existing_clips_size_mb']:.1f} MB")

        # These keys are only present when review progress was found.
        if status.get('total_clips_marked'):
            print(f"Total clips marked: {status['total_clips_marked']}")
            print(f"Videos with clips: {status['total_videos_with_clips']}")

        if status.get('last_extraction'):
            last_extraction = status['last_extraction']
            print("\nLast extraction:")
            print(f"  Time: {last_extraction.get('extracted_at', 'Unknown')}")
            print(f"  Clips extracted: {last_extraction.get('total_clips_extracted', 0)}")
            print(f"  Success rate: {last_extraction.get('extraction_success_rate', 0):.1%}")

        # Point the user at the first missing prerequisite, if any.
        if not status['review_progress_dir_exists']:
            print("\nPlease run step 2 (video reviewer) first to mark clips")
        elif not status['video_metadata_file_exists']:
            print("\nPlease run step 1 (youtube downloader) first to download videos")
        elif status['existing_clips_count'] == 0:
            print("\nNo clips have been extracted yet")
            print("Run without --status to start extraction")

        return 0

    # Parse categories
    categories = None
    if args.categories:
        # Drop empty entries left by stray commas ("a,,b" or a trailing ",")
        # so they are not passed on as category names.
        categories = [
            name for name in (part.strip() for part in args.categories.split(','))
            if name
        ]
        if not categories:
            # An input such as "," names no category at all; reject it
            # instead of handing an empty filter to the extractor.
            parser.error('--categories requires at least one non-empty category name')
        print(f"\nFiltering by categories: {', '.join(categories)}")

    print(f"\nExtracting clips to: {extractor.output_dir}")
    if args.force:
        print("Force mode: Re-extracting existing clips")

    # Run extraction
    try:
        results = extractor.extract_all_clips(force=args.force, categories=categories)

        print("\nExtraction Summary:")
        print("-" * 40)
        print(f"Status: {results['status']}")

        if results['status'] == 'complete':
            print(f"Total processed: {results['total_processed']}")
            print(f"Successfully extracted: {results['successful']}")
            print(f"Metadata updated: {results['metadata_updated']}")
            print(f"Failed: {results['failed']}")
            print(f"Skipped (already exist): {results['skipped']}")
            print(f"Output directory: {results['output_directory']}")

            if results['successful'] > 0:
                print("\nNext steps:")
                # Show the directory actually used, which differs from the
                # default when --output-dir is given.
                print(f"1. Review extracted clips in {results['output_directory']}")
                print("2. Use clips with DriveGuard evaluation pipeline")
                print("3. Run evaluation scripts on the curated dataset")
        else:
            print(f"Message: {results.get('message', 'Unknown error')}")

        print("\n" + "=" * 60)
        print("EXTRACTION COMPLETE")
        print("=" * 60)

        # A completed run without failures is a success even when nothing new
        # was extracted (e.g. every clip was skipped because it already
        # exists), so re-running the tool does not report an error.
        run_succeeded = (
            results.get('successful', 0) > 0
            or results['status'] in ('no_clips', 'no_matching_clips')
            or (results['status'] == 'complete' and results.get('failed', 0) == 0)
        )
        return 0 if run_succeeded else 1

    except Exception as e:
        # Top-level boundary: log the failure and report it through the exit code.
        logger.error(f"Extraction failed: {e}")
        print(f"\nError: {e}")
        return 1


if __name__ == "__main__":
    # sys.exit() is used instead of the site-provided exit(), which is meant
    # for the interactive shell and is unavailable when the site module is
    # not loaded (e.g. ``python -S``).
    sys.exit(main())