"""Unified orchestrator with timestamp-based result organization."""

import json
import logging
import os
from pathlib import Path
from typing import List, Dict, Any, Optional, Callable, Tuple
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd

from .parallel_evaluator import ParallelEvaluator
from .retry_handler import RetryHandler, RetryPolicy
from .chunked_checkpoint_system import ChunkedCheckpointSystem, ContinualEvaluationManager
from ..config import UnifiedBatchConfig, EvaluationParams, ModelConfig, EvaluationTask, EvaluationResult
from ..search import SearchEngine
from ..logging import MetricsCollector, ResultsAggregator


class UnifiedOrchestrator:
    """Unified orchestrator with timestamp-based result organization."""
    
    def __init__(
        self,
        config: UnifiedBatchConfig,
        logger: Optional[logging.Logger] = None
    ):
        """Initialize unified orchestrator.
        
        Args:
            config: Unified batch configuration
            logger: Logger instance
        """
        self.config = config
        self.output_dir = config.get_output_dir()
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.logger = logger or logging.getLogger(__name__)
        
        # Initialize components
        # Use first evaluation setting's retry parameters as default
        first_params = config.evaluation_params[0] if config.evaluation_params else None
        max_attempts = first_params.retry_attempts if first_params else 3
        base_delay = first_params.retry_delay if first_params else 1.0
        
        self.retry_handler = RetryHandler(
            policy=RetryPolicy(
                max_attempts=max_attempts,
                base_delay=base_delay
            )
        )
        
        self.metrics_collector = MetricsCollector()
        self.results_aggregator = ResultsAggregator(str(self.output_dir))
        
        # Track evaluation state
        self.setting_results: Dict[str, Tuple[str, List[EvaluationResult]]] = {}  # setting_id -> (timestamp, results)
        self.search_engines: Dict[str, SearchEngine] = {}
        
        
        # Current run timestamp (generated once per orchestrator run)
        self.current_run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        self.logger.info(f"Initialized orchestrator for batch run: {config.batch_run_name}")
        self.logger.info(f"Output directory: {self.output_dir}")
        self.logger.info(f"Current run timestamp: {self.current_run_timestamp}")
    
    
    def _check_existing_results(
        self, 
        setting: EvaluationParams,
        continual_manager: ContinualEvaluationManager
    ) -> Optional[Tuple[str, List[EvaluationResult]]]:
        """Check if results already exist for a setting.
        
        Args:
            setting: Evaluation setting
            
        Returns:
        """
        # Get existing timestamps for this setting
        existing_timestamps = self.config.get_existing_timestamps(setting.setting_id)
        existing_timestamps.append("most_updated")
        completed_problem_ids, most_updated_results = continual_manager.get_completed_problem_ids()
        expected_problems = self._get_expected_problem_count(setting)

        try:
            if expected_problems and len(completed_problem_ids) < expected_problems:
                self.logger.warning(
                    f"Existing results for {setting.setting_id}'s previous runs are incomplete: "
                    f"{len(completed_problem_ids)}/{expected_problems} problems"
                )
                return most_updated_results, False
            else:
                self.logger.info(
                    f"✓ Found existing complete results for {setting.setting_id} {len(most_updated_results)} problems"
                )
                return most_updated_results, True
            
        except Exception as e:
            self.logger.warning(f"Failed to load existing results for {setting.setting_id}: {e}")
            raise
    
    def _get_expected_problem_count(self, setting: EvaluationParams) -> Optional[int]:
        """Get expected number of problems for a setting.
        
        Args:
            setting: Evaluation setting
            
        Returns:
            Expected number of problems, or None if unknown
        """
        if self.config.test_mode:
            return self.config.test_samples
        
        if setting.num_samples:
            return setting.num_samples
        
        raise ValueError("No number of samples provided")
    
    def _create_search_engine(self, setting: EvaluationParams) -> SearchEngine:
        """Create a search engine for a specific setting.
        
        Args:
            setting: Evaluation setting (with shared parameters applied)
            
        Returns:
            Configured SearchEngine instance
        """
        cache_key = f"{setting.chromadb_path}_{setting.collection_name}_{setting.embedding_model}"
        
        if cache_key in self.search_engines:
            return self.search_engines[cache_key]
        
        self.logger.info(f"Creating search engine for {setting.get_display_name()}")
        self.logger.debug(f"  - Database: {setting.chromadb_path}")
        self.logger.debug(f"  - Collection: {setting.collection_name}")
        self.logger.debug(f"  - Embedding: {setting.embedding_model}")
        
        try:
            search_engine = SearchEngine(
                chromadb_path=setting.chromadb_path,
                collection_name=setting.collection_name,
                embedding_model=setting.embedding_model,
                cache_size=100,
                engine_id="default",
                logger=self.logger
            )
            
            self.search_engines[cache_key] = search_engine
            return search_engine
            
        except Exception as e:
            self.logger.error(f"Failed to create search engine for {setting.get_display_name()}: {e}")
            raise
    
    def _get_search_engine(self, setting: EvaluationParams) -> SearchEngine:
        """Get or create a search engine for a specific setting.
        
        Args:
            setting: Evaluation setting
            
        Returns:
            SearchEngine instance
        """
        return self._create_search_engine(setting)
    
    def _create_evaluator(self, setting: EvaluationParams) -> ParallelEvaluator:
        """Create a parallel evaluator for a specific setting.
        
        Args:
            setting: Evaluation setting (with shared parameters applied)
            
        Returns:
            Configured ParallelEvaluator instance
        """
        # Set rate limits based on provider
        rate_limits = {
            'anthropic': 50,
            'openai': 100,
            'google': 60,
        }
        
        rate_limit = {setting.model_provider: rate_limits.get(setting.model_provider, 30)}
        
        evaluator = ParallelEvaluator(
            max_workers=setting.num_workers,
            use_async=True,
            rate_limit=rate_limit,
            logger=self.logger
        )
        
        # Set progress callback
        def progress_callback(completed: int, failed: int):
            self.logger.info(
                f"[{setting.setting_id}] Progress: {completed} completed, {failed} failed"
            )
        
        evaluator.set_progress_callback(progress_callback)
        
        return evaluator
    
    def _load_dataset(self, setting: EvaluationParams) -> List[Dict[str, Any]]:
        """Load the evaluation dataset for a specific setting.
        
        Args:
            setting: Evaluation setting (with shared parameters applied)
            
        Returns:
            List of problems from the dataset
        """
        dataset_path = Path(setting.dataset_path).expanduser()
        
        if not dataset_path.exists():
            raise FileNotFoundError(f"Dataset file not found: {dataset_path}")
        
        self.logger.info(f"Loading dataset from: {dataset_path}")
        
        with open(dataset_path, 'r') as f:
            data = json.load(f)
        
        # Handle different dataset formats
        if isinstance(data, list):
            problems = data
        elif 'documents' in data:
            problems = data['documents']
        else:
            raise ValueError(f"Unknown dataset format in {dataset_path}")
        
        # Apply test mode or index limits
        if self.config.test_mode:
            problems = problems[:self.config.test_samples]
        
        return problems
    
    def _create_tasks(
        self,
        setting: EvaluationParams,
        problems: List[Dict[str, Any]]
    ) -> List[EvaluationTask]:
        """Create evaluation tasks for a setting.
        
        Args:
            setting: Evaluation setting (with shared parameters applied)
            problems: List of problems
            
        Returns:
            List of evaluation tasks
        """
        tasks = []
        
        # Create a model config from the setting
        
        model_config = ModelConfig(
            name=setting.model_name,
            provider=setting.model_provider,
            temperature=setting.temperature,
            max_tokens=setting.max_tokens,
            random_seed=setting.random_seed
        )
        

        
        for idx, problem in enumerate(problems):
            question_id = problem['question_id']
            task = EvaluationTask(
                task_id=question_id,
                problem_index=idx,
                model_config=model_config,
                problem_data=problem,
                max_attempts=setting.retry_attempts,
                metadata={
                    'setting': setting,
                    'search_engine': self._create_search_engine(setting),
                    'setting_id': setting.setting_id
                }
            )
            tasks.append(task)
        
        return tasks
    
    def evaluate_setting(
        self,
        setting: EvaluationParams,
        evaluator_func: Callable
    ) -> Tuple[str, List[EvaluationResult]]:
        """Evaluate a single setting with chunked checkpoints and continual evaluation.
        
        Args:
            setting: Evaluation setting
            evaluator_func: Function to evaluate a single task
            
        Returns:
            Tuple of (timestamp, evaluation results)
        """
        self.logger.info(f"Starting evaluation for: {setting.setting_id}")
        setting_base_dir = self.config.get_output_dir() / setting.setting_id
        os.makedirs(setting_base_dir, exist_ok=True)
        continual_manager = ContinualEvaluationManager(setting_base_dir, self.logger)
        
        # If a fully complete results set already exists, reuse it and skip work
        results, is_complete = self._check_existing_results(setting, continual_manager)
        # import pdb; pdb.set_trace()
        timestamp = self.current_run_timestamp
        if is_complete:
            self.logger.info(f"Reusing existing complete results for {setting.setting_id}; skipping evaluation")
            # Update most_updated in case it needs consolidation
            if getattr(setting, 'enable_continual_evaluation', True):
                continual_manager.unified_config = self.config
                continual_manager.update_most_updated_folder()
            return timestamp, results
        
        output_dir = self.config.get_setting_output_dir(setting.setting_id, timestamp)
        output_dir.mkdir(parents=True, exist_ok=True)
        checkpoints_dir = output_dir / "checkpoints"
        
        self.logger.info(f"📁 Output directory: {output_dir}")
        
        # Initialize chunked checkpoint system
        chunk_system = ChunkedCheckpointSystem(
            checkpoint_dir=checkpoints_dir,
            chunk_size=getattr(setting, 'chunk_size', 20),
            logger=self.logger
        )
        
        # Load dataset
        try:
            problems = self._load_dataset(setting)
        except Exception as e:
            self.logger.error(f"Failed to load dataset for {setting.setting_id}: {e}")
            return timestamp, []
        
        # Apply continual evaluation if enabled
        tasks_to_evaluate = []
        skipped_count = 0
        
        if getattr(setting, 'enable_continual_evaluation', True):
            # Pass the config for system prompt access
            continual_manager.unified_config = self.config
            
            # Get completed problem IDs from previous runs and current chunks
            completed_ids, completed_results_by_id = continual_manager.get_completed_problem_ids()
            all_completed_ids = completed_ids
            
            self.logger.info(f"Found {len(all_completed_ids)} previously completed problems")
            
            # import pdb; pdb.set_trace()

            # Filter tasks to only evaluate uncompleted problems
            for i, problem in enumerate(problems):
                question_id = problem.get('question_id', f"{setting.setting_id}_{i}")
                if question_id in all_completed_ids:
                    skipped_count += 1
                else:
                    task = EvaluationTask(
                        task_id=question_id,
                        problem_index=i,
                        model_config=ModelConfig(name=setting.model_name, provider=setting.model_provider, temperature=setting.temperature, max_tokens=setting.max_tokens, random_seed=setting.random_seed),
                        problem_data=problem,
                        metadata={'setting': setting, 'search_engine': self._get_search_engine(setting)}
                    )
                    tasks_to_evaluate.append(task)
            
            self.logger.info(f"Continual evaluation: skipping {skipped_count} completed problems, evaluating {len(tasks_to_evaluate)} new problems")
        else:
            # Create all tasks (no continual evaluation)
            tasks_to_evaluate = self._create_tasks(setting, problems)
            self.logger.info(f"Created {len(tasks_to_evaluate)} tasks for {setting.setting_id}")
        
        if not tasks_to_evaluate:
            self.logger.info("No tasks to evaluate - all problems already completed!")
            # Return existing results from chunks
            existing_results = chunk_system.get_recovery_results()
            if existing_results:
                self._save_results(setting, existing_results, timestamp)
                return timestamp, existing_results
            else:
                return timestamp, []
        
        # Note: We don't need to create a parallel evaluator since we're doing individual task evaluation
        
        # Run evaluation with chunked checkpoints
        all_results = []
        remaining_tasks = tasks_to_evaluate
        
        for attempt in range(setting.retry_attempts):
            if not remaining_tasks:
                break
            
            self.logger.info(
                f"Attempt {attempt + 1} for {setting.setting_id}: {len(remaining_tasks)} tasks"
            )
            
            # Evaluate tasks one by one to enable chunked saving
            batch_results = []
            for task in remaining_tasks:
                try:
                    result = evaluator_func(task)
                    batch_results.append(result)
                    
                    # Add result to chunk system
                    chunk_saved = chunk_system.add_result(result)
                    if chunk_saved:
                        self.logger.info(f"Saved chunk with {setting.chunk_size} results")
                    
                except Exception as e:
                    self.logger.error(f"Error evaluating task {task.task_id}: {e}")
                    batch_results.append(EvaluationResult(
                        task_id=task.task_id,
                        problem_index=task.problem_index,
                        model_name=setting.model_name,
                        success=False,
                        error=str(e),
                        attempt=attempt
                    ))
            
            # Separate successful and failed results
            successful_results = [r for r in batch_results if r.success]
            failed_results = [r for r in batch_results if not r.success]
            
            all_results.extend(successful_results)
            
            if failed_results and attempt < setting.retry_attempts - 1:
                # Create retry tasks for failed results
                remaining_tasks = []
                for failed in failed_results:
                    original_task = next(t for t in tasks_to_evaluate if t.task_id == failed.task_id)
                    retry_task = EvaluationTask(
                        task_id=original_task.task_id,
                        problem_index=original_task.problem_index,
                        model_config=original_task.model_config,
                        problem_data=original_task.problem_data,
                        attempt=attempt + 1,
                        max_attempts=original_task.max_attempts,
                        metadata=original_task.metadata
                    )
                    remaining_tasks.append(retry_task)
                
                # Apply retry delay
                self.retry_handler.wait_before_retry(attempt)
            else:
                # No more retries, add failed results to chunks
                for failed in failed_results:
                    chunk_system.add_result(failed)
                all_results.extend(failed_results)
                remaining_tasks = []
        
        # import pdb; pdb.set_trace()

        # Finalize evaluation and get combined results
        final_results = chunk_system.finalize_evaluation(setting)
        
        # Save traditional results format as well
        self._save_results(setting, final_results, timestamp)
        
        # Update most_updated folder if continual evaluation is enabled
        if getattr(setting, 'enable_continual_evaluation', True):
            setting_base_dir = self.config.get_output_dir() / setting.setting_id
            continual_manager = ContinualEvaluationManager(setting_base_dir, self.logger)
            # Pass the config for system prompt access
            continual_manager.unified_config = self.config
            continual_manager.update_most_updated_folder()
        
        return timestamp, final_results
    
    def _save_results(
        self, 
        setting: EvaluationParams, 
        results: List[EvaluationResult], 
        timestamp: str
    ):
        """Save results for a specific setting.
        
        Args:
            setting: Evaluation setting
            results: List of results to save
            timestamp: Timestamp for this run
        """
        output_dir = self.config.get_setting_output_dir(setting.setting_id, timestamp)
        output_dir.mkdir(parents=True, exist_ok=True)
        
        # Save detailed results
        results_file = output_dir / "results.json"
        results_data = []
        
        for r in results:
            result_dict = {
                'task_id': r.task_id,
                'problem_index': r.problem_index,
                'success': r.success,
                'answer': r.answer,
                'ground_truth': r.ground_truth,
                'is_correct': r.is_correct,
                'search_complete': r.metrics.get('search_complete', False),
                'search_rounds': r.metrics.get('search_rounds', 0),
                'error': str(r.error) if r.error else None,
                'metrics': r.metrics,
                'timestamp': r.timestamp,
                'duration': r.duration,
                'attempt': r.attempt
            }
            
            # Include conversation if available
            if 'full_conversation' in r.metrics:
                result_dict['conversation'] = r.metrics['full_conversation']
            # Include deep research intermediates if present in metrics
            if 'deep_research' in r.metrics:
                dr = r.metrics['deep_research'] or {}
                # Flatten into top-level keys to make analysis easier
                if 'final_report_prompt' in dr and dr['final_report_prompt'] is not None:
                    result_dict['final_report_prompt'] = dr['final_report_prompt']
                if 'research_topics_by_round' in dr and dr['research_topics_by_round']:
                    result_dict['research_topics_by_round'] = dr['research_topics_by_round']
            
            results_data.append(result_dict)
        
        with open(results_file, 'w') as f:
            json.dump(results_data, f, indent=2)
        
        # Save metrics
        metrics_file = output_dir / "metrics.json"
        metrics = self._calculate_metrics(results)
        metrics['evaluation_info'] = {
            'setting_id': setting.setting_id,
            'model': setting.model_name,
            'dataset': setting.dataset_type,
            'embedding': setting.embedding_model,
            'num_samples': setting.num_samples,
            'timestamp': timestamp,
            'run_date': datetime.strptime(timestamp, "%Y%m%d_%H%M%S").isoformat()
        }
        
        with open(metrics_file, 'w') as f:
            json.dump(metrics, f, indent=2)
        
        # Save setting configuration
        config_file = output_dir / "config.json"
        config_data = {
            'setting_id': setting.setting_id,
            'model_name': setting.model_name,
            'model_provider': setting.model_provider,
            'temperature': setting.temperature,
            'max_tokens': setting.max_tokens,
            'dataset_type': setting.dataset_type,
            'dataset_path': setting.dataset_path,
            'num_samples': setting.num_samples,
            'search_engine_type': setting.search_engine_type,
            'chromadb_path': setting.chromadb_path,
            'collection_name': setting.collection_name,
            'embedding_model': setting.embedding_model,
            'results_per_page': setting.results_per_page,
            'max_documents': setting.max_documents,
            'metadata': setting.metadata,
            'system_prompt': self.config.system_prompt
        }
        
        with open(config_file, 'w') as f:
            json.dump(config_data, f, indent=2)
        
        self.logger.info(f"✅ Saved results for {setting.setting_id} to {output_dir}")
    
    def _calculate_metrics(self, results: List[EvaluationResult]) -> Dict[str, Any]:
        """Calculate metrics for a set of results.
        
        Args:
            results: List of evaluation results
            
        Returns:
            Dictionary of metrics
        """
        total = len(results)
        if total == 0:
            return {'total_tasks': 0}
        
        successful = sum(1 for r in results if r.success)
        correct = sum(1 for r in results if r.is_correct)
        follow_format = sum(1 for r in results if r.metrics.get('follow_format', False))
        
        # Subgroup analysis based on search completion
        all_docs_found = [r for r in results if r.metrics.get('search_complete', False)]
        not_all_docs_found = [r for r in results if not r.metrics.get('search_complete', False)]
        
        metrics = {
            'total_tasks': total,
            'successful_tasks': successful,
            'failed_tasks': total - successful,
            'success_rate': successful / total,
            'correct_answers': correct,
            'accuracy': correct / successful if successful > 0 else 0,
            'follow_format_count': follow_format,
            'follow_format_rate': follow_format / successful if successful > 0 else 0,
            'average_duration': sum(r.duration for r in results) / total,
            'total_duration': sum(r.duration for r in results),
            
            # Subgroup analysis
            'all_docs_found_tasks': len(all_docs_found),
            'all_docs_found_accuracy': (
                sum(1 for r in all_docs_found if r.is_correct) / len(all_docs_found)
                if all_docs_found else 0
            ),
            'not_all_docs_found_tasks': len(not_all_docs_found),
            'not_all_docs_found_accuracy': (
                sum(1 for r in not_all_docs_found if r.is_correct) / len(not_all_docs_found)
                if not_all_docs_found else 0
            ),
        }
        
        # Add error analysis
        errors = [r.error for r in results if r.error]
        if errors:
            error_types = {}
            for error in errors:
                error_type = str(type(error).__name__) if hasattr(error, '__class__') else 'Unknown'
                error_types[error_type] = error_types.get(error_type, 0) + 1
            metrics['error_types'] = error_types
        
        return metrics
    
    def evaluate_all(self, evaluator_func: Callable) -> Dict[str, Tuple[str, List[EvaluationResult]]]:
        """Evaluate all settings, grouping by model to avoid rate limits.
        
        Args:
            evaluator_func: Function to evaluate a single task
            
        Returns:
            Dictionary mapping setting IDs to (timestamp, results) tuples
        """
        self.logger.info(
            f"Starting batch evaluation for {len(self.config.evaluation_params)} settings"
        )
        
        # Group settings by model
        model_groups = self.config.group_params_by_model()
        self.logger.info(f"Settings grouped into {len(model_groups)} model groups")
        
        all_results = {}
        
        # Process each model group in parallel
        with ThreadPoolExecutor(max_workers=self.config.num_workers) as executor:
            # Submit all model groups for parallel processing
            future_to_model = {}
            for model_name, settings in model_groups.items():
                self.logger.info(f"Submitting {len(settings)} settings for model: {model_name}")
                
                # Create a future for this model's settings
                future = executor.submit(
                    self._evaluate_model_settings,
                    settings,
                    evaluator_func
                )
                future_to_model[future] = model_name
            
            # Collect results as they complete
            for future in as_completed(future_to_model):
                model_name = future_to_model[future]
                try:
                    model_results = future.result()
                    all_results.update(model_results)
                    self.logger.info(
                        f"✓ Completed evaluation for model {model_name}: "
                        f"{len(model_results)} settings"
                    )
                except Exception as e:
                    self.logger.error(f"Failed to evaluate model {model_name}: {e}")
        
        # Store all results
        self.setting_results = all_results
        
        # Generate reports
        self._generate_batch_report()
        
        return all_results
    
    def _evaluate_model_settings(
        self,
        settings: List[EvaluationParams],
        evaluator_func: Callable
    ) -> Dict[str, Tuple[str, List[EvaluationResult]]]:
        """Evaluate all settings for a single model (sequentially to avoid rate limits).
        
        Args:
            settings: List of settings for this model
            evaluator_func: Function to evaluate a single task
            
        Returns:
            Dictionary mapping setting IDs to (timestamp, results) tuples
        """
        results = {}
        
        for setting in settings:
            try:
                # import pdb; pdb.set_trace()
                timestamp, setting_results = self.evaluate_setting(setting, evaluator_func)
                results[setting.setting_id] = (timestamp, setting_results)
            except Exception as e:
                self.logger.error(f"Failed to evaluate {setting.setting_id}: {e}")
                results[setting.setting_id] = (self.current_run_timestamp, [])
        
        return results
    
    def _generate_batch_report(self):
        """Generate batch-level report across all settings."""
        report_file = self.output_dir / "batch_report.json"
        summary_file = self.output_dir / "batch_summary.csv"
        
        # Collect all metrics
        all_metrics = {}
        for setting in self.config.evaluation_params:
            # Find the most recent timestamp for this setting
            if setting.setting_id in self.setting_results:
                timestamp, _ = self.setting_results[setting.setting_id]
            else:
                timestamps = self.config.get_existing_timestamps(setting.setting_id)
                if timestamps:
                    timestamp = timestamps[0]
                else:
                    continue
            
            metrics_file = self.config.get_setting_output_dir(setting.setting_id, timestamp) / "metrics.json"
            if metrics_file.exists():
                with open(metrics_file, 'r') as f:
                    all_metrics[setting.setting_id] = json.load(f)
        
        # Create report
        report_data = {
            'batch_run_name': self.config.batch_run_name,
            'run_timestamp': self.current_run_timestamp,
            'num_settings': len(self.config.evaluation_params),
            'settings_evaluated': len(all_metrics),
            'configuration': self.config.to_dict(),
            'metrics_by_setting': all_metrics,
            'best_settings': self._find_best_settings(all_metrics)
        }
        
        with open(report_file, 'w') as f:
            json.dump(report_data, f, indent=2)
        
        # Create summary CSV
        if all_metrics:
            df_data = []
            for setting_id, metrics in all_metrics.items():
                eval_info = metrics.get('evaluation_info', {})
                df_data.append({
                    'Setting ID': setting_id,
                    'Model': eval_info.get('model', 'unknown'),
                    'Dataset': eval_info.get('dataset', 'unknown'),
                    'Embedding': eval_info.get('embedding', 'unknown'),
                    'Timestamp': eval_info.get('timestamp', 'unknown'),
                    'Success Rate': f"{metrics.get('success_rate', 0):.2%}",
                    'Accuracy': f"{metrics.get('accuracy', 0):.2%}",
                    'Follow Format Rate': f"{metrics.get('follow_format_rate', 0):.2%}",
                    'Total Tasks': metrics.get('total_tasks', 0),
                    'Avg Duration (s)': f"{metrics.get('average_duration', 0):.2f}"
                })
            
            df = pd.DataFrame(df_data)
            df = df.sort_values(['Model', 'Dataset', 'Embedding'])
            df.to_csv(summary_file, index=False)
            
            # Print summary
            print("\n" + "="*80)
            print(f"BATCH RUN: {self.config.batch_run_name}")
            print("="*80)
            print(df.to_string(index=False))
        
        self.logger.info(f"Generated batch report: {report_file}")
    
    def _find_best_settings(self, all_metrics: Dict[str, Dict]) -> Dict[str, Any]:
        """Find the best performing settings.
        
        Args:
            all_metrics: Dictionary of all metrics
            
        Returns:
            Dictionary with best settings information
        """
        if not all_metrics:
            return {}
        
        # Find best by accuracy
        best_accuracy = max(all_metrics.items(), key=lambda x: x[1].get('accuracy', 0))
        
        # Find best by success rate
        best_success = max(all_metrics.items(), key=lambda x: x[1].get('success_rate', 0))
        
        # Find fastest
        fastest = min(
            all_metrics.items(),
            key=lambda x: x[1].get('average_duration', float('inf'))
        )
        
        
        return {
            'best_accuracy': {
                'setting_id': best_accuracy[0],
                'value': best_accuracy[1].get('accuracy', 0),
                'details': best_accuracy[1].get('evaluation_info', {})
            },
            'best_success_rate': {
                'setting_id': best_success[0],
                'value': best_success[1].get('success_rate', 0),
                'details': best_success[1].get('evaluation_info', {})
            },
            'fastest': {
                'setting_id': fastest[0],
                'value': fastest[1].get('average_duration', 0),
                'details': fastest[1].get('evaluation_info', {})
            }
        }
