#!/usr/bin/env python3
"""
InfiHelper Evaluation Entry Script
Provides flexible parameter options for dataset evaluation
"""

import argparse
import os
import sys
from pathlib import Path
from evaluator import InfiHelperEvaluator

def main():
    """Parse command-line arguments and run the requested evaluation.

    The steps are: handle ``--list-datasets``, check the predictions
    directory, validate the dataset selection, construct the evaluator, and
    dispatch to the single- or multi-dataset runner.

    Returns:
        int: 0 when ``--list-datasets`` was handled or an evaluation run was
        dispatched; 1 when the predictions directory is missing, the dataset
        selection is invalid, or the evaluator could not be constructed.
        Failures of individual dataset evaluations are reported by the
        runner functions and do not change the return value.
    """
    parser = argparse.ArgumentParser(
        description="InfiHelper Dataset Evaluation Tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Usage Examples:
  # Evaluate MATH dataset (using default predictions directory)
  python evaluate.py --dataset math

  # Evaluate all datasets
  python evaluate.py --dataset all

  # Use custom predictions directory
  python evaluate.py --dataset math --predictions-dir custom_predictions

  # Save results to specified directory
  python evaluate.py --dataset gsm8k --results-dir my_results

  # Show detailed information without saving
  python evaluate.py --dataset math --verbose --no-save

  # Evaluate multiple datasets
  python evaluate.py --dataset math,gsm8k,humaneval
        """
    )

    # Dataset parameter
    parser.add_argument(
        '--dataset', '-d',
        type=str,
        default='all',
        help='Dataset name to evaluate. Supported: math, gsm8k, humaneval, mbpp, hotpotqa, drop, all. Multiple datasets separated by commas (default: all)'
    )

    # Predictions directory
    parser.add_argument(
        '--predictions-dir', '-p',
        type=str,
        default='predictions',
        help='Predictions file directory (default: predictions)'
    )

    # Results directory
    parser.add_argument(
        '--results-dir', '-r',
        type=str,
        default=None,
        help='Results save directory. If not specified, results will be saved to default results directory (default: auto-save)'
    )

    # Verbose output
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show detailed evaluation information'
    )

    # Don't save results
    parser.add_argument(
        '--no-save',
        action='store_true',
        help='Don\'t save result files, only display evaluation results'
    )

    # Show available datasets
    parser.add_argument(
        '--list-datasets',
        action='store_true',
        help='Show all available datasets'
    )

    args = parser.parse_args()

    # Listing needs neither a predictions directory nor an evaluator.
    if args.list_datasets:
        show_available_datasets()
        return 0

    # is_dir() rather than exists(): a regular file at this path is not a
    # usable predictions directory either.
    predictions_dir = Path(args.predictions_dir)
    if not predictions_dir.is_dir():
        print(f"Error: predictions directory does not exist or is not a directory: {predictions_dir}")
        print("Please ensure the directory exists or use --predictions-dir to specify the correct path")
        return 1

    # Validate the dataset selection before constructing the evaluator so a
    # typo in --dataset is reported without doing any evaluator setup.
    datasets = parse_datasets(args.dataset)
    if not datasets:
        print("Error: no valid datasets specified")
        return 1

    # Create evaluator
    try:
        evaluator = InfiHelperEvaluator(
            predictions_dir=str(predictions_dir),
            results_dir=args.results_dir or "results"
        )
    except Exception as e:
        print(f"Error creating evaluator: {e}")
        return 1

    # NOTE(review): --no-save is only forwarded to the reporting helpers
    # below; the evaluator is constructed and invoked the same way with or
    # without it. Confirm whether the evaluator writes result files itself.
    if len(datasets) == 1:
        evaluate_single_dataset(evaluator, datasets[0], args.verbose, args.no_save)
    else:
        evaluate_multiple_datasets(evaluator, datasets, args.verbose, args.no_save)
    return 0

def show_available_datasets():
    """Print each selectable dataset key alongside its descriptive title.

    Output goes to stdout: a heading line followed by one bullet per entry,
    with the key left-aligned in a 12-character column.
    """
    catalogue = {
        "math": "MATH Mathematical Competition",
        "gsm8k": "GSM8K Mathematical Reasoning",
        "humaneval": "HumanEval Code Generation",
        "mbpp": "MBPP Code Generation",
        "hotpotqa": "HotpotQA Multi-hop QA",
        "drop": "DROP Reading Comprehension",
        "all": "All datasets",
    }

    print("Available datasets:")
    for key, title in catalogue.items():
        print(f"  - {key:<12}: {title}")

def parse_datasets(dataset_arg):
    """Turn the ``--dataset`` argument into a list of dataset names.

    The argument is a comma-separated list. Names are matched
    case-insensitively, surrounding whitespace and empty entries (for
    example from a trailing comma) are ignored, and repeated names are
    collapsed so no dataset is listed twice.

    Args:
        dataset_arg: Raw argument string, e.g. ``"math,gsm8k"`` or ``"all"``.

    Returns:
        list[str]: The selected dataset names in first-seen order. Every
        supported dataset is returned when ``all`` appears anywhere in the
        list. An empty list is returned when nothing was selected or when
        any name is unknown; in the latter case the unknown names and the
        supported ones are printed.
    """
    valid_datasets = ['math', 'gsm8k', 'humaneval', 'mbpp', 'hotpotqa', 'drop']

    # Normalise each entry and drop blanks; dict.fromkeys removes duplicates
    # while keeping the order in which names were given.
    cleaned = (part.strip().lower() for part in dataset_arg.split(','))
    requested = list(dict.fromkeys(name for name in cleaned if name))

    invalid_datasets = [
        name for name in requested
        if name != 'all' and name not in valid_datasets
    ]
    if invalid_datasets:
        print(f"Invalid datasets: {', '.join(invalid_datasets)}")
        print(f"Available datasets: {', '.join(valid_datasets)}")
        return []

    # 'all' wins over any individual names listed next to it.
    if 'all' in requested:
        return list(valid_datasets)

    return requested

def evaluate_single_dataset(evaluator, dataset_name, verbose, no_save):
    """Run the evaluator on one dataset and print the outcome.

    Args:
        evaluator: Object providing ``evaluate_dataset(name)`` and a
            ``results_dir`` attribute.
        dataset_name: Name of the dataset to evaluate.
        verbose: Passed on to ``display_result``; also enables a traceback
            when the evaluation raises.
        no_save: When true, the "results saved" notice is not printed.

    Any exception raised while evaluating or reporting is caught and printed
    rather than propagated.
    """
    print(f"Starting evaluation for dataset: {dataset_name}", "-" * 50, sep="\n")

    try:
        outcome = evaluator.evaluate_dataset(dataset_name)

        # Report the outcome
        display_result(dataset_name, outcome, verbose)

        if no_save:
            return
        print(f"Detailed results saved to: {evaluator.results_dir}")

    except Exception as exc:
        print(f"Evaluation failed: {exc}")
        if not verbose:
            return
        # Full stack trace only on request.
        import traceback
        traceback.print_exc()

def evaluate_multiple_datasets(evaluator, datasets, verbose, no_save):
    """Evaluate several datasets in turn and print a summary table.

    Args:
        evaluator: Object providing ``evaluate_dataset(name)`` and a
            ``results_dir`` attribute.
        datasets: Dataset names to evaluate, in order.
        verbose: Passed on to ``display_result``; also enables a traceback
            when a dataset's evaluation raises.
        no_save: When true, the "results saved" notice is not printed. The
            summary table is printed either way.

    A failure on one dataset is caught, printed and recorded as an
    ``{"error": ...}`` entry; the remaining datasets are still evaluated.
    """
    print(f"Starting evaluation for {len(datasets)} datasets: {', '.join(datasets)}")
    print("=" * 60)

    results = {}
    for dataset_name in datasets:
        print(f"\nEvaluating {dataset_name}...")
        try:
            result = evaluator.evaluate_dataset(dataset_name)

            results[dataset_name] = result
            display_result(dataset_name, result, verbose)

        except Exception as e:
            print(f"{dataset_name} evaluation failed: {e}")
            # Overwrites any result stored above if the failure came from
            # display_result, so the summary reports it as failed.
            results[dataset_name] = {"error": str(e)}
            if verbose:
                import traceback
                traceback.print_exc()

    # The summary is display output, so it must not depend on --no-save.
    print("\n" + "=" * 60)
    print("Summary Results:")
    print("-" * 60)

    for dataset_name, result in results.items():
        if "error" not in result:
            print(f"{dataset_name:<12}: Accuracy {result['accuracy']:.4f} ({result['correct_samples']}/{result['total_test_samples']})")
        else:
            print(f"{dataset_name:<12}: Evaluation failed - {result['error']}")

    # Same notice the single-dataset runner prints.
    if not no_save:
        print(f"Detailed results saved to: {evaluator.results_dir}")

def display_result(dataset_name, result, verbose):
    """Print a one-line accuracy report for a dataset, plus costs if verbose.

    Args:
        dataset_name: Dataset identifier; printed upper-cased in the report.
        result: Result mapping. If it has an ``"error"`` entry, only that
            message is printed. Otherwise ``"accuracy"``,
            ``"total_test_samples"`` and ``"correct_samples"`` are read, and
            ``"total_cost"`` and ``"avg_cost"`` as well when ``verbose`` is
            true.
        verbose: When true, also print the total and average cost.

    Raises:
        KeyError: If a field needed for the requested output is missing.
    """
    if "error" in result:
        print(f"{dataset_name}: {result['error']}")
        return

    accuracy = result['accuracy']
    total = result['total_test_samples']
    correct = result['correct_samples']

    # Status indicator: PASS from 0.8, WARN from 0.6, FAIL below that.
    if accuracy >= 0.8:
        status = "PASS"
    elif accuracy >= 0.6:
        status = "WARN"
    else:
        status = "FAIL"

    print(f"{status} {dataset_name.upper()}: Accuracy {accuracy:.4f} ({correct}/{total})")

    if verbose:
        # Cost fields are looked up only here, so a result without them can
        # still be reported in non-verbose mode.
        print(f"   Total cost: {result['total_cost']:.4f}")
        print(f"   Average cost: {result['avg_cost']:.4f}")

if __name__ == "__main__":
    # Hand main()'s return value to sys.exit so it becomes the process exit
    # status; a None return maps to status 0.
    sys.exit(main())
