#!/usr/bin/env python3
"""
Unified Evaluation Script for DDR_Bench.

Single entry point for evaluating agent results across all scenarios:
- MIMIC: Evaluate medical insights against QA pairs
- 10-K: Evaluate financial insights against QA pairs
- GLOBEM: Evaluate behavioral insights against QA pairs

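Usage:
    python run_evaluation.py --scenario mimic --qa-file /path/to/qa.json --log-dir /path/to/logs
    python run_evaluation.py --scenario 10k --qa-file /path/to/qa.json --log-dir /path/to/logs
    python run_evaluation.py --scenario globem --qa-file /path/to/qa.json --log-dir /path/to/logs

--qa-file and --log-dir may be omitted when the configuration for the selected
scenario already provides them.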

See README.md for detailed usage instructions.
"""

import argparse
import logging
import os
from pathlib import Path

from config import get_config
from evaluate import UnifiedEvaluator

# Root logging setup: INFO level, records rendered as "<time> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the evaluation script.

    Returns:
        The parser exposing the scenario, path, output, provider, vLLM,
        retry, test-mode and config-file options.
    """
    parser = argparse.ArgumentParser(
        description="DDR_Bench Unified Evaluation Script",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Evaluate MIMIC results
  python run_evaluation.py --scenario mimic --qa-file qa.json --log-dir ./mimic_logs

  # Evaluate 10-K results
  python run_evaluation.py --scenario 10k --qa-file qa.json --log-dir ./10k_logs

  # Evaluate GLOBEM results
  python run_evaluation.py --scenario globem --qa-file qa.json --log-dir ./globem_logs
        """
    )

    # Scenario is mandatory; the two paths may instead come from the config.
    parser.add_argument("--scenario", required=True, choices=["mimic", "10k", "globem"],
                        help="Evaluation scenario")
    parser.add_argument("--qa-file", help="Path to QA file (default: from config)")
    parser.add_argument("--log-dir", help="Path to agent logs directory (default: from config)")

    # Output configuration
    parser.add_argument("--output", "-o", help="Output file path for results")

    # Provider configuration
    parser.add_argument("--provider",
                        choices=["azure", "openai", "vllm"],
                        help="LLM provider for evaluation (default: from config)")
    parser.add_argument("--model",
                        help="Model name for LLM-as-judge (default: from config)")

    # vLLM configuration
    parser.add_argument("--vllm-host", help="vLLM server host (default: localhost)")
    # type=int makes argparse reject a non-numeric port up front instead of
    # letting it be interpolated into the server URL.
    parser.add_argument("--vllm-port", type=int,
                        help="vLLM server port (default: from config)")

    # Evaluation options
    parser.add_argument("--max-retries", type=int,
                        help="Maximum retry attempts for API calls")
    parser.add_argument("--retry-delay", type=float,
                        help="Delay between retries in seconds")
    parser.add_argument("--test-mode", "-t", action="store_true",
                        help="Run in test mode (process only first entity)")

    # Configuration file
    parser.add_argument("--config", help="Path to config.yaml file")

    return parser


def main() -> None:
    """Main entry point for evaluation.

    Parses the command line, loads the configuration, resolves every setting
    with the precedence "CLI argument, then config value, then built-in
    default", prints a summary banner and runs ``UnifiedEvaluator``.

    An explicit ``--max-retries 0`` or ``--retry-delay 0`` is honoured rather
    than being treated as "not given".

    Raises:
        SystemExit: Raised by argparse (status 2) on invalid arguments, or
            when neither the command line nor the config yields a QA file or
            a log directory.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # Load configuration
    config = get_config(args.config)
    scenario_config = config.get_scenario(args.scenario)

    # Get paths from config or args
    qa_file = args.qa_file or scenario_config.qa_file
    log_dir = args.log_dir or scenario_config.log_dir

    if not qa_file:
        parser.error(f"--qa-file is required (not found in config for {args.scenario})")
    if not log_dir:
        parser.error(f"--log-dir is required (not found in config for {args.scenario})")

    # Determine output file: default name is derived from the scenario and
    # the last path component of the log directory.
    output_file = args.output
    if not output_file:
        log_dir_name = Path(log_dir).name
        output_file = f"./{args.scenario}_{log_dir_name}_evaluation_result.json"

    # Resolve evaluation parameters
    provider = args.provider or config.evaluation.provider or "azure"
    model = args.model or config.evaluation.model or "gpt-5-mini"

    # Compare against None rather than relying on truthiness: 0 is a value
    # the user can pass on purpose and must not fall through to the config.
    max_retries = args.max_retries
    if max_retries is None:
        max_retries = config.evaluation.max_retries or 5
    retry_delay = args.retry_delay
    if retry_delay is None:
        retry_delay = config.evaluation.retry_delay or 2.0

    log_level = config.agent.log_level or "INFO"

    # Set log level for current process; unknown level names fall back to INFO.
    os.environ["DDR_LOG_LEVEL"] = log_level
    logging.getLogger().setLevel(getattr(logging, log_level.upper(), logging.INFO))

    vllm_host = args.vllm_host or "localhost"
    vllm_port = args.vllm_port or config.provider.vllm_port or 8000

    # Build vLLM URL
    vllm_url = f"http://{vllm_host}:{vllm_port}/v1/chat/completions"

    print(f"\n{'='*60}")
    print("DDR_Bench Evaluation")
    print(f"Scenario: {args.scenario}")
    print(f"QA File: {qa_file}")
    print(f"Log Directory: {log_dir}")
    print(f"Output: {output_file}")
    print(f"Provider: {provider}")
    print(f"Model: {model}")
    if args.test_mode:
        print("Mode: TEST (first entity only)")
    print(f"{'='*60}\n")

    # Create unified evaluator; the same model name is supplied for both the
    # OpenAI and Azure parameters.
    evaluator = UnifiedEvaluator(
        scenario=args.scenario,
        vllm_url=vllm_url,
        provider=provider,
        openai_model=model,
        azure_model=model,
        max_retries=max_retries,
        retry_delay=retry_delay
    )

    # Run evaluation
    evaluator.run_evaluation(
        qa_file=qa_file,
        logs_dir=log_dir,
        output_file=output_file,
        test_mode=args.test_mode
    )

    print(f"\nEvaluation complete. Results saved to: {output_file}")


# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
