"""Main entry point for the unified batch evaluation framework."""

import argparse
import getpass
import json
import logging
import os
import sys
from pathlib import Path

import yaml

# Put the parent of this file's directory at the front of the module search
# path (presumably so the absolute `src.*` imports below resolve when this
# file is run directly rather than as an installed package).
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.config.unified_config import UnifiedBatchConfig
from src.evaluation.unified_orchestrator import UnifiedOrchestrator
from src.agents import ProblemEvaluator
from src.search import SearchEngine


def setup_logging(verbose: bool = False) -> logging.Logger:
    """Configure root logging and return this module's logger.

    Args:
        verbose: When true, log at DEBUG level; otherwise log at INFO level
            and raise the 'chromadb' and 'httpx' loggers to WARNING.

    Returns:
        The logger named after this module.
    """
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )

    module_logger = logging.getLogger(__name__)
    if verbose:
        # Verbose runs keep third-party loggers at their current levels.
        return module_logger

    # Quiet the chattier third-party loggers for normal runs.
    for noisy_name in ('chromadb', 'httpx'):
        logging.getLogger(noisy_name).setLevel(logging.WARNING)

    return module_logger


def setup_api_keys(config: UnifiedBatchConfig, logger: logging.Logger):
    """Make sure every API key the configured settings need is in the environment.

    The required keys are derived from each entry of
    ``config.evaluation_params``: one key per recognised ``model_provider``
    ('anthropic', 'openai', 'google'), plus ``OPENAI_API_KEY`` whenever
    ``SearchEngine.requires_openai_api_key`` reports that the setting's
    embedding model needs it. Any key that is missing from ``os.environ``,
    or present but empty/whitespace-only, is requested interactively via
    ``getpass`` and stored in ``os.environ`` for the rest of the process.

    Args:
        config: Unified batch configuration
        logger: Logger instance
    """
    provider_env_vars = {
        'anthropic': 'ANTHROPIC_API_KEY',
        'openai': 'OPENAI_API_KEY',
        'google': 'GOOGLE_API_KEY',
    }
    required_keys = set()

    # Determine required API keys based on settings
    for params in config.evaluation_params:
        provider_key = provider_env_vars.get(params.model_provider)
        if provider_key is not None:
            required_keys.add(provider_key)

        # Embedding models may need an OpenAI key regardless of the chat provider
        embedding_model = params.embedding_model or 'default'
        if SearchEngine.requires_openai_api_key(embedding_model):
            required_keys.add('OPENAI_API_KEY')

    # Prompt in sorted order so the sequence of prompts is the same on every run
    for key in sorted(required_keys):
        # A variable that is set but blank is as unusable as a missing one
        if os.environ.get(key, '').strip():
            continue

        logger.info(f"{key} not found in environment")
        # Strip stray whitespace/newlines that often come along with a pasted key
        api_key = getpass.getpass(f"Enter {key}: ").strip()
        if not api_key:
            logger.warning(f"No value entered for {key}; it remains empty")
        os.environ[key] = api_key


def _apply_cli_overrides(config: "UnifiedBatchConfig", args: argparse.Namespace,
                         logger: logging.Logger) -> None:
    """Overlay the command-line options onto the loaded configuration in place."""
    if args.test_mode:
        config.test_mode = True
        logger.info(f"TEST MODE: Evaluating only {config.test_samples} samples per setting")

    if args.num_workers is not None:
        # The worker count lives on every evaluation setting, so update them all
        for params in config.evaluation_params:
            params.num_workers = args.num_workers

    if args.output_dir:
        config.base_output_dir = args.output_dir

    if args.batch_run_name:
        config.batch_run_name = args.batch_run_name


def _log_settings_summary(config: "UnifiedBatchConfig", logger: logging.Logger) -> None:
    """Log the batch name and the evaluation settings grouped by model."""
    logger.info(f"Starting batch run: {config.batch_run_name}")
    logger.info(f"Evaluating {len(config.evaluation_params)} settings")

    for model_name, param_list in config.group_params_by_model().items():
        logger.info(f"  Model {model_name}: {len(param_list)} configurations")
        for params in param_list:
            logger.info(f"    - {params.setting_id}: {params.dataset_type} dataset")


def _print_best_settings(report_path: Path) -> None:
    """Print the 'best_settings' entries of the JSON batch report, if the file exists."""
    if not report_path.exists():
        return

    # The report is JSON, so parse it as JSON: PyYAML leaves exponent-form
    # numbers without a decimal point (e.g. 1e-05) as strings, which would
    # break the numeric formatting below.
    with open(report_path, 'r', encoding='utf-8') as f:
        report = json.load(f)

    best = report.get('best_settings') if isinstance(report, dict) else None
    if not isinstance(best, dict):
        return

    # (report key, heading, format for the metric value)
    sections = (
        ('best_accuracy', "\n🏆 Best Accuracy:", "   Accuracy: {:.2%}"),
        ('fastest', "\n⚡ Fastest:", "   Avg Duration: {:.2f}s"),
        ('best_follow_format', "\n📋 Best Format Compliance:", "   Follow Format Rate: {:.2%}"),
    )
    for key, heading, value_format in sections:
        entry = best.get(key)
        if not entry:
            continue
        print(heading)
        print(f"   Setting: {entry['setting_id']}")
        print(value_format.format(entry['value']))


def evaluate_batch(args: argparse.Namespace, logger: logging.Logger):
    """Run batch evaluation with unified configuration.

    Loads the YAML file named by ``args.config``, applies the command-line
    overrides (``test_mode``, ``num_workers``, ``output_dir``,
    ``batch_run_name``), makes sure the required API keys are available,
    runs every configured setting through ``UnifiedOrchestrator`` and prints
    a summary, including the best settings from ``batch_report.json`` when
    that file exists in the orchestrator's output directory.

    Args:
        args: Command line arguments
        logger: Logger instance

    Raises:
        OSError: If the configuration file or the batch report cannot be read.
        yaml.YAMLError: If the configuration file is not valid YAML.
        ValueError: If the batch report is not valid JSON. The per-task
            callback also raises ``ValueError`` when a task carries neither
            a search engine nor a setting; whether that reaches the caller
            depends on the orchestrator.
    """
    # Load configuration
    config_path = Path(args.config).expanduser()
    with open(config_path, 'r', encoding='utf-8') as f:
        config_data = yaml.safe_load(f)

    config = UnifiedBatchConfig.from_yaml(config_data)

    # Command-line options take precedence over the file
    _apply_cli_overrides(config, args, logger)

    # Set up API keys
    setup_api_keys(config, logger)

    # Create orchestrator (after the overrides, so it sees the final config)
    orchestrator = UnifiedOrchestrator(config, logger)

    def evaluate_task(task):
        """Evaluate a single task."""
        # Prefer the search engine attached to the task; otherwise build one
        # from the task's setting.
        search_engine = task.metadata.get('search_engine')
        if not search_engine:
            params = task.metadata.get('setting')
            if not params:
                raise ValueError("No search engine or setting in task metadata")
            search_engine = SearchEngine(
                chromadb_path=params.chromadb_path,
                collection_name=params.collection_name,
                embedding_model=params.embedding_model,
                engine_id="default",
                logger=logger
            )

        # Create evaluator for this task using the unified config
        evaluator = ProblemEvaluator(config, search_engine, logger)
        return evaluator.evaluate_task(task)

    _log_settings_summary(config, logger)

    # Run batch evaluation
    results = orchestrator.evaluate_all(evaluate_task)

    # Print final summary
    print("\n" + "="*80)
    print("🎉 BATCH EVALUATION COMPLETE")
    print("="*80)
    print(f"📊 Batch run: {config.batch_run_name}")
    print(f"📊 Total settings evaluated: {len(results)}")
    print(f"📁 Results saved to: {orchestrator.output_dir}")

    # Show best performing settings
    _print_best_settings(orchestrator.output_dir / "batch_report.json")


def generate_config_template(args: argparse.Namespace, logger: logging.Logger):
    """Write a starter YAML configuration file to ``args.output``.

    The parent directory of the output path is created when missing. The
    template holds a batch name, an output directory, shared parameters,
    test-mode controls, a system prompt and three example evaluation
    settings. A short usage note is printed afterwards.

    Args:
        args: Command line arguments
        logger: Logger instance
    """
    # Values listed once here so the individual settings below stay short.
    shared_parameters = {
        # NOTE(review): the previous inline comment listed 'standard' or
        # 'deep_research' as the choices, yet the template value is 'react'
        # - confirm the valid agent types.
        'agent_type': 'react',
        'agent_stop_type': 'default',  # 'default' or 'interaction_scaling'
        'agent_stop_kwargs': {
            # For interaction_scaling: force N rounds of interaction
            'interaction_rounds': 5,
            # 'parsing_only', 'llm_only', or 'parsing_and_llm'
            'answer_judge': 'parsing_and_llm',
        },
        'temperature': 0.4,
        'max_tokens': 2048,
        'random_seed': 42,  # Random seed for reproducible inference
        'dataset_dir': './data',
        'search_engine_type': 'chromadb',
        'chromadb_base_path': './databases/chroma_db',
        'collection_name': 'default',
        # Use 'openai-small' or 'openai-large' for OpenAI embeddings
        'embedding_model': 'default',
        'results_per_page': 5,
        'max_documents': 100,
        'num_workers': 4,
        'retry_attempts': 3,
        'retry_delay': 1.0,
        'chunk_size': 10,
        # Chunked checkpoint system parameter: skip already evaluated problems
        'enable_continual_evaluation': True,
    }

    system_prompt = (
        "You are a helpful assistant that answers questions using search tools.\n"
        "\n"
        "Use the search_information tool to find relevant information.\n"
        "Use the next_page tool to see more results from your last search.\n"
        "\n"
        'Your final answer should start with "#### " followed by the answer.'
    )

    # Three example settings: a plain one, one overriding the shared
    # temperature, and one forcing a fixed number of interaction rounds.
    evaluation_settings = [
        {
            'setting_id': 'claude_small_20',
            'model_name': 'claude-3-sonnet-20240229',
            'dataset_type': 'small',
            'num_samples': 20,
        },
        {
            'setting_id': 'gpt4_small_20_cool',
            'model_name': 'gpt-4',
            'dataset_type': 'small',
            'num_samples': 20,
            'temperature': 0.3,  # Override shared temperature
            'metadata': {'experiment': 'lower_temperature'},
        },
        {
            'setting_id': 'claude_interaction_scaling',
            'model_name': 'claude-3-sonnet-20240229',
            'dataset_type': 'small',
            'num_samples': 20,
            'agent_stop_type': 'interaction_scaling',  # Force interaction for N rounds
            'agent_stop_kwargs': {
                'interaction_rounds': 8,  # Override to 8 rounds instead of 5
                'answer_judge': 'parsing_only',  # Use only parsing for answer extraction
            },
            'metadata': {'experiment': 'forced_interaction_scaling'},
        },
    ]

    # Key order matters: the dump below keeps insertion order (sort_keys=False).
    template = {
        'batch_run_name': 'my_evaluation',
        'base_output_dir': './gsm-agent/eval/results',
        'shared_parameters': shared_parameters,
        'evaluation_control': {'test_mode': False, 'test_samples': 3},
        'prompts': {'system_prompt': system_prompt},
        'evaluation_settings': evaluation_settings,
    }

    # Save template
    destination = Path(args.output).expanduser()
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open('w') as stream:
        yaml.dump(template, stream, default_flow_style=False, sort_keys=False, width=120)

    logger.info(f"Generated configuration template: {destination}")
    print("\n".join([
        f"Configuration template saved to: {destination}",
        "\nEdit this file to customize your evaluation settings.",
        "\nKey features:",
        "  - Shared parameters reduce redundancy",
        "  - Each evaluation setting only specifies what's different",
        "  - Simplified internal processing with EvaluationParams",
        "  - Results organized by batch_run_name/setting_id/timestamp/",
    ]))


def main():
    """Parse the command line and run the selected sub-command.

    Two sub-commands are defined: ``evaluate`` (run a batch evaluation) and
    ``generate-config`` (write a configuration template). When no command is
    given the help text is printed and the process exits with status 1. A
    keyboard interrupt or any other exception raised by the sub-command is
    logged and also ends the process with status 1.
    """
    usage_examples = (
        "\nExamples:\n"
        "  # Run batch evaluation (smart skip existing results)\n"
        "  python -m src.main evaluate --config configs/unified_evaluation.yaml\n"
        "  \n"
        "  # Test mode - evaluate only 3 samples per setting\n"
        "  python -m src.main evaluate --config configs/unified_test.yaml --test-mode\n"
        "  \n"
        "  # Generate configuration template\n"
        "  python -m src.main generate-config --output configs/my_config.yaml\n"
        "        "
    )
    parser = argparse.ArgumentParser(
        description="Unified batch evaluation framework for language models",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    commands = parser.add_subparsers(dest='command', help='Command to run')

    # 'evaluate': run a batch evaluation
    evaluate_cmd = commands.add_parser('evaluate', help='Run batch evaluation')
    evaluate_cmd.add_argument('--config', required=True, help='Configuration file path')
    evaluate_cmd.add_argument('--test-mode', action='store_true',
                              help='Run in test mode with few samples')
    evaluate_cmd.add_argument('--num-workers', type=int, help='Number of parallel workers')
    evaluate_cmd.add_argument('--output-dir', help='Base output directory')
    evaluate_cmd.add_argument('--batch-run-name', help='Name for this batch run')
    evaluate_cmd.add_argument('--verbose', action='store_true', help='Enable verbose logging')

    # 'generate-config': write a configuration template
    template_cmd = commands.add_parser('generate-config',
                                       help='Generate configuration template')
    template_cmd.add_argument('--output', required=True, help='Output file path')

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    # Only the 'evaluate' sub-command defines --verbose, hence the default.
    logger = setup_logging(getattr(args, 'verbose', False))

    try:
        handlers = {
            'evaluate': evaluate_batch,
            'generate-config': generate_config_template,
        }
        handler = handlers.get(args.command)
        if handler is None:
            parser.print_help()
            sys.exit(1)
        handler(args, logger)
    except KeyboardInterrupt:
        logger.info("Evaluation interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Evaluation failed: {e}", exc_info=True)
        sys.exit(1)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
