#!/usr/bin/env python3
"""
Comprehensive ASR Evaluation Pipeline - Performance Optimized (VOICE PROFILE PROMPT)
Integrates optimized metrics, auto-resume, and robust error handling for 3x speed improvement.
This version uses voice profile information for structured prompt conditioning.
"""

import torch
import numpy as np
import json
import time
import argparse
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Tuple, Any, Optional

# Core imports
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from datasets import load_from_disk
import librosa

# Local imports - use improved versions
from compute_metrics_approx import ASRMetricsApprox, MetricsResultApprox
from asr_experiment_logger_improved import ASRExperimentLoggerImproved
from llm_text_normalizer import LLMTextNormalizer



class ComprehensiveASREvaluatorImproved:
    """
    Performance-optimized ASR evaluation pipeline with auto-resume capability.
    
    Key optimizations:
    - Slot-WER computation disabled (saves ~8-10s per sample)
    - Minimal normalization (only best prediction)
    - Auto-resume from checkpoints
    - Robust error handling with graceful shutdown
    - 3x performance improvement over original
    - VOICE PROFILE PROMPT CONDITIONING
    """
    
    def __init__(self, 
                 model_name: str = "openai/whisper-small",
                 dataset_path: str = "../final_hf_dataset/agentic_asr_normalized",
                 experiment_name: str = "whisper-small-prompt-with-profile",
                 num_beams: int = 1,
                 checkpoint_interval: int = 500,
                 enable_slot_wer: bool = False,
                 auto_resume: bool = True):
        
        self.model_name = model_name
        self.dataset_path = dataset_path
        self.experiment_name = experiment_name
        self.num_beams = num_beams
        self.checkpoint_interval = checkpoint_interval
        self.enable_slot_wer = enable_slot_wer
        self.auto_resume = auto_resume
        
        # Initialize components
        self.model = None
        self.processor = None
        self.dataset = None
        self.logger = None
        self.metrics_calculator = None
        
        # Tracking variables
        self.processed_count = 0
        self.start_time = None
        self.results = []
        self.resume_index = 0
        
        # Performance tracking
        self.performance_stats = {
            "total_samples": 0,
            "successful_samples": 0,
            "failed_samples": 0,
            "avg_processing_time": 0.0,
            "optimization_enabled": True,
            "slot_wer_disabled": not enable_slot_wer
        }
        
    def parse_voice_profile(self, voice_code: str) -> str:
        """Parse voice code like 'bf_emma' into 'british female'."""
        if not voice_code or '_' not in voice_code:
            return 'unknown speaker'
        
        prefix = voice_code.split('_')[0]
        
        # Map voice codes to readable profiles
        voice_mapping = {
            'af': 'american female',
            'am': 'american male', 
            'bf': 'british female',
            'bm': 'british male'
        }
        
        return voice_mapping.get(prefix, 'unknown speaker')
        
    def initialize_components(self):
        """Initialize all pipeline components with performance optimizations."""
        print("🚀 Initializing Performance-Optimized ASR Evaluation Pipeline (VOICE PROFILE PROMPT)...")
        
        # Load model and processor
        print(f"Loading Whisper model: {self.model_name}")
        self.processor = WhisperProcessor.from_pretrained(self.model_name)
        self.model = WhisperForConditionalGeneration.from_pretrained(
            self.model_name,
            torch_dtype=torch.float32,
            device_map="auto"
        )
        print("✅ Model loaded successfully")
        
        # Load dataset
        print(f"Loading dataset from: {self.dataset_path}")
        self.dataset = load_from_disk(self.dataset_path)
        print(f"✅ Dataset loaded: {len(self.dataset)} samples")
        
        # Initialize shared Bedrock client for consistent session management
        print("Initializing shared Bedrock client...")
        from bedrock_claude import BedrockClaudeClient
        shared_bedrock_client = BedrockClaudeClient()
        print("✅ Shared Bedrock client initialized")
        
        # Initialize optimized metrics calculator
        print("Initializing OPTIMIZED ASR metrics calculator...")
        normalizer = LLMTextNormalizer(shared_bedrock_client)
        self.metrics_calculator = ASRMetricsApprox(
            normalizer, 
            shared_bedrock_client, 
            enable_slot_wer=self.enable_slot_wer
        )
        if self.enable_slot_wer:
            print("⚠️  Slot-WER enabled (slower performance)")
        else:
            print("🚀 Slot-WER disabled for 3x performance improvement")
        print("✅ Optimized metrics calculator initialized")
        
        # Initialize enhanced experiment logger
        print(f"Initializing enhanced experiment logger: {self.experiment_name}")
        self.logger = ASRExperimentLoggerImproved(
            experiment_id=self.experiment_name,
            output_format="jsonl",
            output_dir="asr_experiments",
            checkpoint_interval=self.checkpoint_interval
        )
        print("✅ Enhanced logger initialized")
        
        # Auto-resume detection
        if self.auto_resume:
            print("🔍 Checking for existing experiments to resume...")
            self.resume_index = self.logger.detect_resume_point(self.experiment_name)
            if self.resume_index > 0:
                print(f"🔄 Auto-resume enabled: starting from sample {self.resume_index}")
            else:
                print("🆕 No existing experiments found - starting fresh")
        
    def process_audio_with_prompt(self, 
                                audio_array: np.ndarray, 
                                sample_rate: int, 
                                prompt: str) -> Tuple[List[str], List[float]]:
        """Process audio with Whisper using optimized prompt conditioning (no beam search)."""
        
        # Resample if needed (Whisper expects 16kHz)
        target_sample_rate = 16000
        if sample_rate != target_sample_rate:
            audio_array = librosa.resample(
                audio_array, 
                orig_sr=sample_rate, 
                target_sr=target_sample_rate
            )
            sample_rate = target_sample_rate
        
        # Process audio inputs
        inputs = self.processor(
            audio_array,
            sampling_rate=sample_rate,
            return_tensors="pt"
        )
        
        # Move to device
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
        
        # Prepare prompt for Whisper's native conditioning
        generation_kwargs = {
            "input_features": inputs["input_features"],
            "max_new_tokens": 256,
            "do_sample": False,
            "num_beams": 1,  # Greedy decoding for speed
            "pad_token_id": self.processor.tokenizer.pad_token_id,
            "eos_token_id": self.processor.tokenizer.eos_token_id,
        }
        
        # Add prompt conditioning if available
        if prompt and prompt.strip():
            # Use Whisper's native prompt conditioning
            try:
                prompt_ids = self.processor.tokenizer(
                    prompt,
                    return_tensors="pt",
                    add_special_tokens=False
                ).input_ids.to(self.model.device)
                
                # Only add if prompt_ids is valid
                if prompt_ids is not None and prompt_ids.numel() > 0:
                    generation_kwargs["prompt_ids"] = prompt_ids
            except Exception as e:
                print(f"  ⚠️ Prompt processing error: {e}, continuing without prompt")
        
        # Generate transcription (fast greedy decoding)
        with torch.no_grad():
            try:
                generated_outputs = self.model.generate(**generation_kwargs)
            except Exception as e:
                print(f"  ⚠️ Generation error: {e}")
                # Simple fallback without any special parameters
                try:
                    generated_outputs = self.model.generate(
                        inputs["input_features"],
                        max_new_tokens=256
                    )
                except Exception as fallback_e:
                    print(f"  ❌ Fallback generation failed: {fallback_e}")
                    return [""], [0.0]
        
        # Decode the generated sequence
        if generated_outputs is not None and generated_outputs.numel() > 0:
            # Remove input tokens from output if needed
            input_length = inputs["input_features"].shape[-1] // 2  # Rough estimate
            if len(generated_outputs.shape) > 1:
                # Take only the first sequence (greedy decoding)
                sequence = generated_outputs[0]
            else:
                sequence = generated_outputs
            
            # Decode to text
            transcription = self.processor.decode(sequence, skip_special_tokens=True)
            transcription = transcription.strip()
            
            # Return single transcription with confidence 1.0 (greedy decoding)
            return [transcription], [1.0]
        else:
            return [""], [0.0]
    
    def process_single_sample(self, sample: Dict[str, Any], sample_idx: int) -> Dict[str, Any]:
        """Process a single sample through the optimized pipeline."""
        
        # Extract sample data
        utterance_id = sample.get('utterance_id', f'sample_{sample_idx}')
        domain = sample.get('domain', 'unknown')
        voice = sample.get('voice', 'unknown')
        asr_difficulty = sample.get('asr_difficulty', 0.0)
        
        # Ground truth data
        truth = sample.get('truth', sample.get('text', ''))
        normalized_truth = sample.get('normalized_truth', truth)
        
        # Extract profile information from sample (for logging)
        profile = sample.get('profile', 'unknown')
        
        # Parse voice profile from voice code (NEW LOGIC)
        voice_profile = self.parse_voice_profile(voice)
        
        # Create voice profile prompt
        prompt = f"This is {voice_profile} speaking"
        
        # Audio data
        audio_data = sample['audio']
        audio_array = np.array(audio_data['array'])
        sample_rate = audio_data['sampling_rate']
        
        print(f"  📝 Processing: {utterance_id} ({domain}/{voice}) [VOICE PROFILE PROMPT]")
        print(f"    🎯 Prompt: {prompt}")
        
        # Process audio with voice profile prompt
        start_time = time.time()
        try:
            transcriptions, probabilities = self.process_audio_with_prompt(
                audio_array, sample_rate, prompt
            )
            processing_time = time.time() - start_time
            
            # Get best transcription
            best_transcription = transcriptions[0] if transcriptions else ""
            
            print(f"    🎯 Best: {best_transcription[:100]}...")
            
        except Exception as e:
            print(f"    ❌ ASR Error: {e}")
            transcriptions = [""]
            probabilities = [0.0]
            best_transcription = ""
            processing_time = 0.0
            
            # Check for critical errors
            if "token" in str(e).lower() or "connection" in str(e).lower():
                raise  # Re-raise critical errors
        
        # Compute optimized metrics
        try:
            # Check if normalized_truth is actually normalized
            if normalized_truth != truth:
                # Use pre-normalized truth
                metrics_result = self.metrics_calculator.compute_metrics(
                    ground_truth=normalized_truth,
                    predictions=transcriptions,
                    skip_norm_for_truth=True
                )
            else:
                # normalized_truth is same as truth, so normalize both
                metrics_result = self.metrics_calculator.compute_metrics(
                    ground_truth=truth,
                    predictions=transcriptions,
                    skip_norm_for_truth=False  # Normalize both
                )
            
            slot_status = "disabled" if not self.enable_slot_wer else f"{metrics_result.slot_wer:.3f}"
            print(f"    📊 WER: {metrics_result.wer:.3f}, SER: {metrics_result.ser:.3f}, SlotWER: {slot_status}")
            
        except Exception as e:
            print(f"    ❌ Metrics Error: {e}")
            
            # Check for critical errors
            if "token" in str(e).lower() or "connection" in str(e).lower():
                raise  # Re-raise critical errors
            
            # Create a mock MetricsResultApprox-like object
            class MockMetrics:
                def __init__(self):
                    self.wer = 1.0
                    self.ser = 1.0
                    self.slot_wer = 1.0
                    self.oracle_wer = 1.0
                    self.oracle_slot_wer = 1.0
                    self.normalized_prediction = best_transcription
                    self.normalized_truth = normalized_truth
                    self.normalized_predictions = [best_transcription]
                    self.nbest_match = False
                    self.truth_slots = []
                    self.prediction_slots = []
                    self.slot_matches = []
                    self.slot_mismatches = []
                    self.word_errors = {}
                    self.slot_errors = {}
                    self.position_metrics = []
            
            metrics_result = MockMetrics()
        
        # Compile complete result
        result = {
            'sample_index': sample_idx,
            'utterance_id': utterance_id,
            'domain': domain,
            'voice': voice,
            'voice_profile': voice_profile,  # Include parsed voice profile
            'profile': profile,  # Include original profile for reference
            'asr_difficulty': asr_difficulty,
            'prompt': prompt,  # This is now the voice profile prompt
            'ground_truth': truth,
            'normalized_truth': normalized_truth,
            'transcriptions': transcriptions,
            'probabilities': probabilities,
            'best_transcription': best_transcription,
            'processing_time': processing_time,
            'metrics': metrics_result,
            'timestamp': datetime.now().isoformat(),
            'performance_optimized': True,
            'slot_wer_enabled': self.enable_slot_wer,
            'prompt_type': 'voice_profile'  # Flag to indicate prompt type
        }
        
        return result
    
    def save_checkpoint(self, results: List[Dict[str, Any]], checkpoint_num: int):
        """Save checkpoint results using enhanced logger."""
        self.logger.save_checkpoint(results, checkpoint_num)
    
    def consolidate_final_results(self):
        """Consolidate all results and generate final analytics."""
        print("\n🔄 Consolidating final results...")
        
        # Create final directory
        final_dir = Path(f"asr_experiments/{self.logger.experiment_id}")
        final_dir.mkdir(parents=True, exist_ok=True)
        
        # Save final JSONL
        final_jsonl = final_dir / f"{self.logger.experiment_id}_FINAL.jsonl"
        with open(final_jsonl, 'w') as f:
            for result in self.results:
                f.write(json.dumps(result, default=str) + '\n')
        
        # Generate final CSV
        final_csv = final_dir / f"{self.logger.experiment_id}_FINAL.csv"
        self.generate_csv_summary(final_csv)
        
        # Generate analytics
        self.generate_final_analytics()
        
        print(f"✅ Final results saved to: {final_dir}")
    
    def generate_csv_summary(self, csv_path: Path):
        """Generate CSV summary of results."""
        import csv
        
        with open(csv_path, 'w', newline='') as f:
            writer = csv.writer(f)
            
            # Header (updated to include voice_profile)
            writer.writerow([
                'utterance_id', 'domain', 'voice', 'voice_profile', 'profile', 'asr_difficulty',
                'ground_truth', 'best_transcription', 'wer', 'ser', 'slot_wer',
                'processing_time', 'confidence_score', 'performance_optimized', 'prompt_type'
            ])
            
            # Data rows
            for result in self.results:
                metrics = result['metrics']
                writer.writerow([
                    result['utterance_id'],
                    result['domain'],
                    result['voice'],
                    result.get('voice_profile', 'unknown speaker'),
                    result.get('profile', 'unknown'),
                    result['asr_difficulty'],
                    result['ground_truth'],
                    result['best_transcription'],
                    metrics.wer if hasattr(metrics, 'wer') else getattr(metrics, 'wer', 1.0),
                    metrics.ser if hasattr(metrics, 'ser') else getattr(metrics, 'ser', 1.0),
                    metrics.slot_wer if hasattr(metrics, 'slot_wer') else getattr(metrics, 'slot_wer', 1.0),
                    result['processing_time'],
                    result['probabilities'][0] if result['probabilities'] else 0.0,
                    result.get('performance_optimized', True),
                    result.get('prompt_type', 'voice_profile')
                ])
    
    def generate_final_analytics(self):
        """Generate comprehensive final analytics with performance metrics."""
        analytics_dir = Path(f"asr_experiments/{self.logger.experiment_id}/final_analytics")
        analytics_dir.mkdir(parents=True, exist_ok=True)
        
        # Overall performance
        wers = [getattr(r['metrics'], 'wer', 1.0) for r in self.results]
        sers = [getattr(r['metrics'], 'ser', 1.0) for r in self.results]
        slot_wers = [getattr(r['metrics'], 'slot_wer', 1.0) for r in self.results]
        processing_times = [r['processing_time'] for r in self.results]
        
        overall_stats = {
            'total_samples': len(self.results),
            'avg_wer': np.mean(wers),
            'avg_ser': np.mean(sers),
            'avg_slot_wer': np.mean(slot_wers),
            'std_wer': np.std(wers),
            'std_ser': np.std(sers),
            'std_slot_wer': np.std(slot_wers),
            'total_runtime': time.time() - self.start_time if self.start_time else 0,
            'avg_processing_time': np.mean(processing_times),
            'min_processing_time': np.min(processing_times),
            'max_processing_time': np.max(processing_times),
            'performance_optimized': True,
            'slot_wer_enabled': self.enable_slot_wer,
            'checkpoint_interval': self.checkpoint_interval,
            'auto_resume_enabled': self.auto_resume,
            'resume_index': self.resume_index,
            'prompt_type': 'voice_profile'
        }
        
        with open(analytics_dir / 'overall_performance.json', 'w') as f:
            json.dump(overall_stats, f, indent=2, default=str)
        
        # Domain breakdown
        domain_stats = {}
        for domain in set(r['domain'] for r in self.results):
            domain_results = [r for r in self.results if r['domain'] == domain]
            domain_wers = [getattr(r['metrics'], 'wer', 1.0) for r in domain_results]
            domain_times = [r['processing_time'] for r in domain_results]
            domain_stats[domain] = {
                'count': len(domain_results),
                'avg_wer': np.mean(domain_wers),
                'avg_ser': np.mean([getattr(r['metrics'], 'ser', 1.0) for r in domain_results]),
                'avg_slot_wer': np.mean([getattr(r['metrics'], 'slot_wer', 1.0) for r in domain_results]),
                'avg_processing_time': np.mean(domain_times)
            }
        
        with open(analytics_dir / 'domain_breakdown.json', 'w') as f:
            json.dump(domain_stats, f, indent=2, default=str)
        
        # Voice breakdown
        voice_stats = {}
        for voice in set(r['voice'] for r in self.results):
            voice_results = [r for r in self.results if r['voice'] == voice]
            voice_wers = [getattr(r['metrics'], 'wer', 1.0) for r in voice_results]
            voice_times = [r['processing_time'] for r in voice_results]
            voice_stats[voice] = {
                'count': len(voice_results),
                'avg_wer': np.mean(voice_wers),
                'avg_ser': np.mean([getattr(r['metrics'], 'ser', 1.0) for r in voice_results]),
                'avg_slot_wer': np.mean([getattr(r['metrics'], 'slot_wer', 1.0) for r in voice_results]),
                'avg_processing_time': np.mean(voice_times)
            }
        
        with open(analytics_dir / 'voice_breakdown.json', 'w') as f:
            json.dump(voice_stats, f, indent=2, default=str)
        
        # Voice Profile breakdown (NEW)
        voice_profile_stats = {}
        for voice_profile in set(r.get('voice_profile', 'unknown speaker') for r in self.results):
            profile_results = [r for r in self.results if r.get('voice_profile', 'unknown speaker') == voice_profile]
            profile_wers = [getattr(r['metrics'], 'wer', 1.0) for r in profile_results]
            profile_times = [r['processing_time'] for r in profile_results]
            voice_profile_stats[voice_profile] = {
                'count': len(profile_results),
                'avg_wer': np.mean(profile_wers),
                'avg_ser': np.mean([getattr(r['metrics'], 'ser', 1.0) for r in profile_results]),
                'avg_slot_wer': np.mean([getattr(r['metrics'], 'slot_wer', 1.0) for r in profile_results]),
                'avg_processing_time': np.mean(profile_times)
            }
        
        with open(analytics_dir / 'voice_profile_breakdown.json', 'w') as f:
            json.dump(voice_profile_stats, f, indent=2, default=str)
        
        # Profile breakdown (original profile field)
        profile_stats = {}
        for profile in set(r.get('profile', 'unknown') for r in self.results):
            profile_results = [r for r in self.results if r.get('profile', 'unknown') == profile]
            profile_wers = [getattr(r['metrics'], 'wer', 1.0) for r in profile_results]
            profile_times = [r['processing_time'] for r in profile_results]
            profile_stats[profile] = {
                'count': len(profile_results),
                'avg_wer': np.mean(profile_wers),
                'avg_ser': np.mean([getattr(r['metrics'], 'ser', 1.0) for r in profile_results]),
                'avg_slot_wer': np.mean([getattr(r['metrics'], 'slot_wer', 1.0) for r in profile_results]),
                'avg_processing_time': np.mean(profile_times)
            }
        
        with open(analytics_dir / 'profile_breakdown.json', 'w') as f:
            json.dump(profile_stats, f, indent=2, default=str)
        
        print("📊 Final analytics generated with performance metrics (including voice profile breakdown)")
    
    def run_evaluation(self, start_idx: int = 0, max_samples: Optional[int] = None):
        """Run the complete optimized evaluation pipeline with auto-resume."""
        self.start_time = time.time()
        
        # Initialize components
        self.initialize_components()
        
        # Use resume index if auto-resume is enabled
        if self.auto_resume and self.resume_index > start_idx:
            start_idx = self.resume_index
            print(f"🔄 Resuming from index {start_idx} due to auto-resume")
        
        # Determine sample range
        total_samples = len(self.dataset)
        end_idx = min(start_idx + max_samples, total_samples) if max_samples else total_samples
        
        print(f"\n🎯 Starting optimized evaluation (VOICE PROFILE PROMPT): samples {start_idx} to {end_idx-1} ({end_idx-start_idx} total)")
        print(f"🚀 Performance optimizations enabled:")
        print(f"   - Slot-WER: {'ENABLED (slower)' if self.enable_slot_wer else 'DISABLED (3x faster)'}")
        print(f"   - Checkpoint interval: {self.checkpoint_interval} samples")
        print(f"   - Auto-resume: {'ENABLED' if self.auto_resume else 'DISABLED'}")
        print(f"   - Prompt conditioning: VOICE PROFILE")
        
        # Process samples
        checkpoint_results = []
        
        for i in range(start_idx, end_idx):
            sample = self.dataset[i]
            
            try:
                # Check for critical errors before processing
                if self.logger.should_abort():
                    print(f"\n🛑 Critical error threshold reached - aborting at sample {i}")
                    print("💾 Current state saved for resume")
                    break
                
                # Log individual experiment with proper context
                with self.logger.start_experiment(sample, i) as exp:
                    # Process the sample
                    result = self.process_single_sample(sample, i)
                    self.results.append(result)
                    checkpoint_results.append(result)
                    self.processed_count += 1
                    
                    # Update performance stats
                    self.performance_stats["total_samples"] += 1
                    if not result.get('errors', []):
                        self.performance_stats["successful_samples"] += 1
                    else:
                        self.performance_stats["failed_samples"] += 1
                    
                    # Update average processing time
                    if result['processing_time'] > 0:
                        current_avg = self.performance_stats["avg_processing_time"]
                        count = self.performance_stats["total_samples"]
                        new_avg = (current_avg * (count - 1) + result['processing_time']) / count
                        self.performance_stats["avg_processing_time"] = new_avg
                    
                    # Log ASR step
                    exp.log_asr_step(
                        model_name=self.model_name,
                        model_params={
                            'num_beams': self.num_beams,
                            'max_new_tokens': 256,
                            'temperature': 1.0,
                            'performance_optimized': True,
                            'prompt_type': 'voice_profile'
                        },
                        predictions=result['transcriptions'],
                        confidence_scores=result['probabilities'],
                        processing_time=result['processing_time']
                    )
                    
                    # Log normalization step
                    exp.log_normalization_step(
                        original_truth=result['ground_truth'],
                        normalized_truth=result['normalized_truth']
                    )
                    
                    # Log metrics step with the actual MetricsResult object
                    exp.log_metrics_step(result['metrics'])
                
                # Checkpoint save
                if (i + 1) % self.checkpoint_interval == 0:
                    checkpoint_num = (i + 1) // self.checkpoint_interval
                    self.save_checkpoint(checkpoint_results, checkpoint_num)
                    checkpoint_results = []  # Reset for next checkpoint
                    
                    # Progress update with performance info
                    elapsed = time.time() - self.start_time
                    rate = self.processed_count / elapsed
                    remaining = (end_idx - i - 1) / rate if rate > 0 else 0
                    avg_time = self.performance_stats["avg_processing_time"]
                    print(f"⏱️  Progress: {i+1}/{end_idx} ({(i+1)/end_idx*100:.1f}%) - {rate:.1f} samples/sec - Avg: {avg_time:.1f}s/sample - ETA: {remaining/60:.1f}min")
            
            except Exception as e:
                error_msg = str(e)
                print(f"❌ Error processing sample {i}: {error_msg}")
                
                # Check for critical errors
                if "token" in error_msg.lower() or "connection" in error_msg.lower():
                    print(f"🛑 Critical error detected: {error_msg}")
                    print("💾 Saving current state and aborting for safe resume")
                    
                    # Save final checkpoint if needed
                    if checkpoint_results:
                        final_checkpoint = ((i - 1) // self.checkpoint_interval) + 1
                        self.save_checkpoint(checkpoint_results, final_checkpoint)
                    
                    # Exit gracefully
                    print(f"🔄 To resume, run the script again - it will automatically continue from sample {i}")
                    sys.exit(1)
                
                # Non-critical errors - continue processing
                continue
        
        # Save final checkpoint if needed
        if checkpoint_results:
            final_checkpoint = ((end_idx - 1) // self.checkpoint_interval) + 1
            self.save_checkpoint(checkpoint_results, final_checkpoint)
        
        # Consolidate final results
        self.consolidate_final_results()
        
        # Final summary with performance metrics
        total_time = time.time() - self.start_time
        print(f"\n🎉 Optimized Evaluation Complete (VOICE PROFILE PROMPT)!")
        print(f"   Processed: {self.processed_count}/{end_idx-start_idx} samples")
        print(f"   Total time: {total_time/60:.1f} minutes")
        print(f"   Average rate: {self.processed_count/total_time:.1f} samples/sec")
        print(f"   Average time per sample: {self.performance_stats['avg_processing_time']:.1f}s")
        
        if self.results:
            avg_wer = np.mean([getattr(r['metrics'], 'wer', 1.0) for r in self.results])
            avg_ser = np.mean([getattr(r['metrics'], 'ser', 1.0) for r in self.results])
            avg_slot_wer = np.mean([getattr(r['metrics'], 'slot_wer', 1.0) for r in self.results])
            print(f"   Average WER: {avg_wer:.3f}")
            print(f"   Average SER: {avg_ser:.3f}")
            print(f"   Average SlotWER: {avg_slot_wer:.3f} {'(disabled)' if not self.enable_slot_wer else ''}")
        
        print(f"\n🚀 Performance Summary:")
        print(f"   Slot-WER: {'ENABLED' if self.enable_slot_wer else 'DISABLED (3x speed boost)'}")
        print(f"   Auto-resume: {'ENABLED' if self.auto_resume else 'DISABLED'}")
        print(f"   Checkpoint interval: {self.checkpoint_interval} samples")
        print(f"   Prompt conditioning: VOICE PROFILE")
        print(f"   Logger stats: {self.logger.get_stats()}")


def main():
    """Main function with enhanced command line interface."""
    parser = argparse.ArgumentParser(description='Performance-Optimized ASR Evaluation Pipeline (VOICE PROFILE PROMPT)')
    parser.add_argument('--dataset-path', default='../final_hf_dataset/agentic_asr_normalized',
                       help='Path to HuggingFace dataset')
    parser.add_argument('--model-name', default='openai/whisper-small',
                       help='Whisper model name')
    parser.add_argument('--experiment-name', default='whisper-small-prompt-with-profile',
                       help='Experiment name for logging')
    parser.add_argument('--num-beams', type=int, default=5,
                       help='Number of beams for beam search')
    parser.add_argument('--checkpoint-interval', type=int, default=500,
                       help='Save checkpoint every N samples (default: 500)')
    parser.add_argument('--start-idx', type=int, default=0,
                       help='Starting sample index (overridden by auto-resume)')
    parser.add_argument('--max-samples', type=int,
                       help='Maximum number of samples to process')
    parser.add_argument('--enable-slot-wer', action='store_true',
                       help='Enable slot-WER computation (slower, disabled by default)')
    parser.add_argument('--disable-auto-resume', action='store_true',
                       help='Disable automatic resume from checkpoints')
    parser.add_argument('--test-run', action='store_true',
                       help='Run on first 10 samples for testing')
    
    args = parser.parse_args()
    
    # Test run override
    if args.test_run:
        args.max_samples = 10
        args.checkpoint_interval = 5
        print("🧪 Test run mode: processing first 10 samples (VOICE PROFILE PROMPT)")
    
    # Create evaluator
    evaluator = ComprehensiveASREvaluatorImproved(
        model_name=args.model_name,
        dataset_path=args.dataset_path,
        experiment_name=args.experiment_name,
        num_beams=args.num_beams,
        checkpoint_interval=args.checkpoint_interval,
        enable_slot_wer=args.enable_slot_wer,
        auto_resume=not args.disable_auto_resume
    )
    
    # Run evaluation
    evaluator.run_evaluation(
        start_idx=args.start_idx,
        max_samples=args.max_samples
    )


if __name__ == "__main__":
    main()
