#!/usr/bin/env python3

import argparse
import os
import sys
from pathlib import Path

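# Load API configuration from ../.env before the evaluator module is imported.
# NOTE: the relative path is resolved against the current working directory.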
from dotenv import load_dotenv
load_dotenv('../.env')

from evaluator import UniversalPhysicsEvaluator, safe_print


def parse_args(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional sequence of argument strings to parse. When ``None``
            (the default) argparse reads the process command line, so
            existing ``parse_args()`` callers behave exactly as before.

    Returns:
        argparse.Namespace with the attributes ``results_dir``, ``dataset``,
        ``model_name``, ``nproc``, ``judge_model``, ``no_judge``,
        ``multi_runs``, ``output_dir``, ``verbose`` and ``dry_run``.
    """
    parser = argparse.ArgumentParser(description="HiPhO Physics Competition Evaluation Script - Specially adapted for HiPhO datasets")

    # Basic parameters
    parser.add_argument(
        "--results-dir",
        type=str,
        default="../infer_results",
        help="Inference results directory (default: ../infer_results)"
    )

    parser.add_argument(
        "--dataset",
        type=str,
        choices=[
            'APhO_2025', 'EuPhO_2024', 'EuPhO_2025', 'F=MA_2024', 'F=MA_2025',
            'IPhO_2024', 'IPhO_2025', 'NBPhO_2024', 'NBPhO_2025',
            'PanMechanics_2024', 'PanMechanics_2025', 'PanPhO_2024', 'PanPhO_2025'
        ],
        help="Specify the dataset to evaluate (if not specified, evaluate all available datasets)"
    )

    parser.add_argument(
        "--model-name",
        type=str,
        help="Model name, used for filtering result files and naming output files (e.g., intern-s1, Qwen_Qwen2_5-VL-32B-Instruct)"
    )

    parser.add_argument(
        "--nproc",
        type=int,
        default=4,
        help="Number of parallel processes (default: 4)"
    )

    # Judge model parameters
    parser.add_argument(
        "--judge-model",
        type=str,
        help="Judge model name (e.g., gpt-4o, gemini-2.5-flash)"
    )

    parser.add_argument(
        "--no-judge",
        action="store_true",
        help="Disable Judge model (fine-grained score will be 0, still perform coarse-grained evaluation)"
    )

    parser.add_argument(
        "--multi-runs",
        action="store_true",
        help="Evaluate multiple run results and calculate statistics"
    )

    # Other parameters
    parser.add_argument(
        "--output-dir",
        type=str,
        default="../eval_results",
        help="Evaluation results output directory (default: ../eval_results)"
    )

    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Verbose output mode"
    )

    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Dry run mode, only check datasets without actual evaluation"
    )

    # argparse falls back to sys.argv[1:] when argv is None.
    return parser.parse_args(argv)

def setup_environment(args):
    """Validate the results directory and create the output directory.

    Args:
        args: Parsed arguments; ``args.results_dir`` and ``args.output_dir``
            are read.

    Exits the process with status 1, after printing a message, when the
    results path is missing or is not a directory, or when the output
    directory cannot be created.
    """
    # Check results directory
    results_dir = Path(args.results_dir)
    if not results_dir.exists():
        safe_print(f"❌ Results directory does not exist: {results_dir}")
        sys.exit(1)
    # exists() is also true for a regular file, which cannot hold result files.
    if not results_dir.is_dir():
        safe_print(f"❌ Results path is not a directory: {results_dir}")
        sys.exit(1)

    # Create output directory
    output_dir = Path(args.output_dir)
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        # Covers e.g. a file already occupying the path or missing permissions.
        safe_print(f"❌ Cannot create output directory {output_dir}: {e}")
        sys.exit(1)
    safe_print(f"📁 Output directory: {output_dir}")

def build_judge_kwargs(args):
    """Assemble the keyword arguments passed on for the Judge model.

    Returns an empty dict when ``args.no_judge`` is set. Otherwise the dict
    holds ``'model'`` when ``args.judge_model`` is given and ``'nproc'`` when
    ``args.nproc`` is truthy.
    """
    # --no-judge short-circuits everything else, including nproc.
    if args.no_judge:
        safe_print("⚠️  Judge model disabled, fine-grained score will be 0, still perform coarse-grained evaluation")
        return {}

    kwargs = {}

    if not args.judge_model:
        safe_print("ℹ️  Judge model not specified, fine-grained score will be 0, only use coarse-grained evaluation")
    else:
        kwargs['model'] = args.judge_model
        safe_print(f"🤖 Using Judge model: {args.judge_model}")

        # Only warn about a missing key; the run continues regardless.
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            safe_print("⚠️  Warning: API key not set, Judge model may not work")
            safe_print("   Please set OPENAI_API_KEY in .env file")

    if args.nproc:
        kwargs['nproc'] = args.nproc

    return kwargs

def _print_startup_banner(args):
    """Print the run configuration banner shown before any work starts."""
    safe_print("🚀 HiPhO Physics Competition Evaluation System Starting")
    safe_print("=" * 60)
    safe_print(f"📂 Inference results directory: {args.results_dir}")
    safe_print(f"📊 Number of parallel processes: {args.nproc}")
    safe_print(f"💾 Output directory: {args.output_dir}")
    if args.model_name:
        safe_print(f"🤖 Specified model: {args.model_name}")
    if args.dataset:
        safe_print(f"🎯 Specified dataset: {args.dataset}")
    else:
        safe_print("🎯 Evaluation mode: All available datasets")
    safe_print("=" * 60)


def _run_dry_run(evaluator):
    """List the datasets the evaluator detects, without evaluating them."""
    safe_print("\n🔍 Dry run mode: checking available datasets...")
    available_datasets = evaluator.detect_available_datasets()
    safe_print(f"\n📊 Found {len(available_datasets)} available datasets:")
    for dataset_key in available_datasets:
        config = evaluator.DATASET_CONFIGS[dataset_key]
        safe_print(f"   ✓ {config['display_name']} ({dataset_key})")
    safe_print("\n✅ Dry run completed")


def _evaluate_multi_runs_single(evaluator, dataset, judge_kwargs):
    """Evaluate all runs of one dataset; return False if it has no multiple runs."""
    safe_print(f"\n🔄 Starting multiple runs evaluation: {dataset}")

    # Check if there are multiple runs
    if not evaluator.has_multiple_runs(dataset):
        safe_print(f"⚠️  Dataset {dataset} has no multiple run results")
        return False

    multi_run_results = evaluator.evaluate_multiple_runs(dataset, judge_kwargs)

    if multi_run_results:
        overall = multi_run_results['overall_statistics']
        safe_print("\n🎉 Multiple runs evaluation completed!")
        safe_print(f"🔄 Number of runs: {overall['num_runs']}")
        safe_print(f"📈 Average score rate: {overall['mean_score_rate']:.2f}% ± {overall['std_score_rate']:.2f}%")
    else:
        safe_print(f"❌ Dataset {dataset} multiple runs evaluation failed")
    return True


def _evaluate_multi_runs_all(evaluator, judge_kwargs):
    """Evaluate all runs of every multi-run dataset; return False if none exist."""
    safe_print("\n🌟 Starting multiple runs evaluation for all datasets...")
    multi_run_datasets = [
        dataset_key
        for dataset_key in evaluator.detect_available_datasets()
        if evaluator.has_multiple_runs(dataset_key)
    ]

    if not multi_run_datasets:
        safe_print("❌ No datasets with multiple run results found")
        return False

    safe_print(f"📊 Found {len(multi_run_datasets)} datasets with multiple runs")

    separator = "=" * 60
    all_multi_run_results = {}
    for dataset_key in multi_run_datasets:
        safe_print(f"\n{separator}")
        safe_print(f"🔄 Evaluating multiple runs: {evaluator.DATASET_CONFIGS[dataset_key]['display_name']}")
        safe_print(separator)

        # A failure in one dataset is recorded as None and must not stop the rest.
        try:
            multi_run_results = evaluator.evaluate_multiple_runs(dataset_key, judge_kwargs)
            all_multi_run_results[dataset_key] = multi_run_results

            if multi_run_results:
                overall = multi_run_results['overall_statistics']
                safe_print(f"✅ Completed: Average score rate {overall['mean_score_rate']:.2f}% ± {overall['std_score_rate']:.2f}%")

        except Exception as e:
            safe_print(f"❌ Multiple runs evaluation failed for {dataset_key}: {e}")
            all_multi_run_results[dataset_key] = None

    # Save summary of all multiple run results
    if all_multi_run_results:
        evaluator._save_all_multi_run_summary(all_multi_run_results)
    return True


def _evaluate_single_dataset(evaluator, dataset, judge_kwargs):
    """Run a regular evaluation of one dataset and print its overall score."""
    safe_print(f"\n🎯 Starting single dataset evaluation: {dataset}")
    results = evaluator.evaluate_dataset(dataset, judge_kwargs)

    if results:
        config = evaluator.DATASET_CONFIGS[dataset]
        safe_print(f"\n✅ {config['display_name']} evaluation completed!")
        safe_print(f"🏆 Overall score: {results['total_score']:.2f} / {results['max_possible_score']:.2f} ({results['score_rate']:.2f}%)")
    else:
        safe_print(f"❌ Dataset {dataset} evaluation failed")


def _evaluate_all_datasets(evaluator, judge_kwargs):
    """Run a regular evaluation of every available dataset and print a tally."""
    safe_print("\n🌟 Starting evaluation of all available datasets...")
    all_results = evaluator.evaluate_all_datasets(judge_kwargs)

    if all_results:
        safe_print("\n🎉 All datasets evaluation completed!")
        successful_count = sum(1 for r in all_results.values() if r is not None)
        safe_print(f"📊 Successfully evaluated {successful_count}/{len(all_results)} datasets")
    else:
        safe_print("❌ Failed to successfully evaluate any datasets")


def main():
    """Command line entry point.

    Parses arguments, prints the startup banner, validates the environment,
    builds the evaluator and dispatches to one of: dry run, multi-run
    evaluation (single dataset or all) or regular evaluation (single dataset
    or all).

    Exits with status 1 when the evaluator cannot be constructed, when the
    user interrupts the run, or when an exception escapes the evaluation.
    Returns without the final completion message after a dry run or when a
    multi-run evaluation finds no multiple run results.
    """
    args = parse_args()

    _print_startup_banner(args)

    # Setup environment
    setup_environment(args)

    # Build Judge parameters
    judge_kwargs = build_judge_kwargs(args)

    # Initialize evaluator
    try:
        evaluator = UniversalPhysicsEvaluator(
            results_dir=args.results_dir,
            nproc=args.nproc,
            model_name=args.model_name,
            output_dir=args.output_dir
        )
        safe_print("✅ Evaluator initialized successfully")
    except Exception as e:
        safe_print(f"❌ Evaluator initialization failed: {e}")
        sys.exit(1)

    # Dry run mode: only check datasets
    if args.dry_run:
        _run_dry_run(evaluator)
        return

    # Start evaluation
    try:
        if args.multi_runs:
            if args.dataset:
                evaluated = _evaluate_multi_runs_single(evaluator, args.dataset, judge_kwargs)
            else:
                evaluated = _evaluate_multi_runs_all(evaluator, judge_kwargs)
            # Nothing was evaluated, so skip the completion message below.
            if not evaluated:
                return
        elif args.dataset:
            _evaluate_single_dataset(evaluator, args.dataset, judge_kwargs)
        else:
            _evaluate_all_datasets(evaluator, judge_kwargs)

    except KeyboardInterrupt:
        safe_print("\n⏹️  User interrupted evaluation")
        sys.exit(1)
    except Exception as e:
        safe_print(f"\n❌ Error occurred during evaluation: {e}")
        import traceback
        # The full traceback is only shown with --verbose.
        if args.verbose:
            safe_print(f"Detailed error information:\n{traceback.format_exc()}")
        sys.exit(1)

    safe_print(f"\n🎯 Evaluation completed! Results saved to: {args.output_dir}")

# Run the evaluation CLI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()