#!/usr/bin/env python3

import json
from typing import Dict, List, Optional, Any

class SimpleEvaluator:
    """Score package recommendations against a task's ground-truth data.

    The evaluator reads ``task_data["ground_truth"]``, which is expected to
    hold:

    * ``feasible_packages`` - a mapping of package ID to a dict with at least
      ``utility_score``, ``is_optimal`` and ``total_cost``;
    * ``utility`` - the utility of the optimal package.

    At construction time it derives four utility thresholds from the distinct
    utility scores of the feasible packages; ``evaluate_recommendation`` uses
    them to bucket a recommendation into a performance tier.
    """

    # Human-readable remark printed for each performance tier. Any tier not
    # listed here falls back to the "bottom 50%" remark.
    _PERFORMANCE_REMARKS = {
        "optimal": "   🏆 OPTIMAL CHOICE! Perfect recommendation.",
        "top_5_percent": "   🥇 Excellent choice - in top 5% of options",
        "top_10_percent": "   🥈 Very good choice - in top 10% of options",
        "top_20_percent": "   🥉 Good choice - in top 20% of options",
        "top_50_percent": "   📈 Reasonable choice - in top 50% of options",
    }
    _DEFAULT_REMARK = "   📉 Poor choice - in bottom 50% of options"

    def __init__(self, task_data: Dict[str, Any]):
        """Load ground truth from *task_data* and precompute tier thresholds.

        Args:
            task_data: Task description; only its ``"ground_truth"`` entry is
                read here. Missing or null entries are treated as empty.
        """
        self.task_data = task_data
        # ``or {}`` (rather than a .get default) also covers keys that are
        # present but explicitly null, e.g. after loading JSON with ``null``.
        self.ground_truth = task_data.get("ground_truth") or {}
        self.feasible_packages = self.ground_truth.get("feasible_packages") or {}
        optimal = self.ground_truth.get("utility")
        self.optimal_utility = 0 if optimal is None else optimal

        # Thresholds are taken over *distinct* utility values, best first, so
        # many packages sharing one score count as a single rank.
        ranked_utilities = sorted(
            {pkg["utility_score"] for pkg in self.feasible_packages.values()},
            reverse=True,
        )

        self.top_5_percent_threshold = self._threshold_at(ranked_utilities, 0.05)
        self.top_10_percent_threshold = self._threshold_at(ranked_utilities, 0.1)
        self.top_20_percent_threshold = self._threshold_at(ranked_utilities, 0.2)
        self.top_50_percent_threshold = self._threshold_at(ranked_utilities, 0.5)

    @staticmethod
    def _threshold_at(ranked_utilities: List[Any], fraction: float) -> Any:
        """Return the utility at position ``int(len * fraction)`` of the ranking.

        Args:
            ranked_utilities: Distinct utility values sorted highest first.
            fraction: Fraction of the ranking (between 0 and 1, exclusive of 1).

        Returns:
            The utility value at that rank, or ``0`` for an empty ranking.
        """
        # NOTE(review): because the index is int(n * fraction), the tier
        # admits ranks 0..index inclusive, i.e. one more distinct value than a
        # strict "top fraction" cut would - confirm this is intended before
        # changing it, since it affects every reported tier.
        if not ranked_utilities:
            return 0
        return ranked_utilities[int(len(ranked_utilities) * fraction)]

    def evaluate_recommendation(self, package_ids: List[str]) -> Dict[str, Any]:
        """Evaluate a SINGLE recommendation (list of package IDs).

        Only the provided ``package_ids`` are considered; nothing about earlier
        recommendations is tracked by this class.

        Args:
            package_ids: Recommended package IDs. IDs that are not keys of the
                feasibility set are counted as invalid.

        Returns:
            A dict whose ``"status"`` is one of:

            * ``"no_recommendation"`` - ``package_ids`` was empty;
            * ``"invalid_recommendation"`` - no ID is in the feasibility set
              (``"invalid_packages"`` is then the list of rejected IDs);
            * ``"success"`` - at least one ID is feasible. The result then
              carries counts (``"invalid_packages"`` is a count here), the
              best valid package, its performance tier and utility metrics.

        Raises:
            KeyError: If a feasible package entry lacks ``utility_score``,
                ``is_optimal`` or ``total_cost``.
        """
        if not package_ids:
            return {
                "status": "no_recommendation",
                "message": "No package IDs found in recommendation"
            }

        # Split the recommendation into feasible and infeasible IDs.
        valid_packages = []
        invalid_packages = []
        for pkg_id in package_ids:
            if pkg_id not in self.feasible_packages:
                invalid_packages.append(pkg_id)
                continue
            pkg_data = self.feasible_packages[pkg_id]
            valid_packages.append({
                "package_id": pkg_id,
                "utility_score": pkg_data["utility_score"],
                "is_optimal": pkg_data["is_optimal"],
                "total_cost": pkg_data["total_cost"]
            })

        if not valid_packages:
            return {
                "status": "invalid_recommendation",
                "message": "None of the recommended packages are in the feasibility set",
                "invalid_packages": invalid_packages
            }

        # The recommendation is scored by its best feasible package; on ties
        # max() keeps the first one in recommendation order.
        best_package = max(valid_packages, key=lambda pkg: pkg["utility_score"])
        best_utility = best_package["utility_score"]

        # Tiers are checked from strictest to loosest; first match wins.
        if best_package["is_optimal"]:
            performance = "optimal"
        else:
            performance = "bottom_50_percent"
            tiers = (
                ("top_5_percent", self.top_5_percent_threshold),
                ("top_10_percent", self.top_10_percent_threshold),
                ("top_20_percent", self.top_20_percent_threshold),
                ("top_50_percent", self.top_50_percent_threshold),
            )
            for label, threshold in tiers:
                if best_utility >= threshold:
                    performance = label
                    break

        # Gap to the optimum; the percentage is only defined for a positive
        # optimal utility and is reported as 0 otherwise.
        utility_gap = self.optimal_utility - best_utility
        if self.optimal_utility > 0:
            utility_gap_percent = utility_gap / self.optimal_utility * 100
        else:
            utility_gap_percent = 0

        return {
            "status": "success",
            "total_recommended": len(package_ids),
            "valid_packages": len(valid_packages),
            "invalid_packages": len(invalid_packages),
            "best_package": best_package,
            "performance": performance,
            "utility_metrics": {
                "best_utility": best_utility,
                "optimal_utility": self.optimal_utility,
                "utility_gap": utility_gap,
                "utility_gap_percent": round(utility_gap_percent, 1),
                "top_5_threshold": self.top_5_percent_threshold,
                "top_10_threshold": self.top_10_percent_threshold,
                "top_20_threshold": self.top_20_percent_threshold,
                "top_50_threshold": self.top_50_percent_threshold
            }
        }

    # Note: evaluate_conversation method removed - we now use tool calls directly

    def print_evaluation_summary(self, evaluation: Dict[str, Any]) -> None:
        """Print a human-readable summary of an evaluation result to stdout.

        Args:
            evaluation: A dict as returned by ``evaluate_recommendation``.
                An unrecognised ``"status"`` is reported as such instead of
                producing an empty summary.
        """
        separator = "=" * 50
        print("\n" + separator)
        print("EVALUATION SUMMARY")
        print(separator)

        status = evaluation.get("status")

        if status == "no_recommendation":
            print("❌ No recommendation found")
            print(f"   {evaluation.get('message', '')}")

        elif status == "invalid_recommendation":
            print("❌ Invalid recommendation")
            print(f"   {evaluation.get('message', '')}")
            if evaluation.get("invalid_packages"):
                print(f"   Invalid packages: {evaluation['invalid_packages']}")

        elif status == "success":
            print("✅ Valid recommendation found")
            print(f"   Total packages recommended: {evaluation['total_recommended']}")
            print(f"   Valid packages: {evaluation['valid_packages']}")
            # Surface infeasible IDs too; otherwise a partly-invalid
            # recommendation looks identical to a fully valid one.
            invalid_count = evaluation.get("invalid_packages", 0)
            if invalid_count:
                print(f"   Invalid packages: {invalid_count}")

            best_pkg = evaluation["best_package"]
            performance = evaluation["performance"]
            metrics = evaluation["utility_metrics"]

            print(f"\n📊 BEST PACKAGE: {best_pkg['package_id']}")
            print(f"   Utility: {best_pkg['utility_score']:.3f}")
            print(f"   Cost: ${best_pkg['total_cost']:.2f}")

            print(f"\n🎯 PERFORMANCE: {performance.upper()}")
            print(self._PERFORMANCE_REMARKS.get(performance, self._DEFAULT_REMARK))

            print("\n📏 UTILITY METRICS:")
            print(f"   Achieved utility: {metrics['best_utility']:.3f}")
            print(f"   Optimal utility:  {metrics['optimal_utility']:.3f}")
            print(f"   Gap: {metrics['utility_gap']:.3f} ({metrics['utility_gap_percent']}%)")
            print(f"   Top 5% threshold:  {metrics['top_5_threshold']:.3f}")
            print(f"   Top 10% threshold: {metrics['top_10_threshold']:.3f}")
            print(f"   Top 20% threshold: {metrics['top_20_threshold']:.3f}")
            print(f"   Top 50% threshold: {metrics['top_50_threshold']:.3f}")

        else:
            print(f"⚠️ Unknown evaluation status: {status!r}")

        print(separator)

# Note: evaluate_task_simple function removed - use SimpleEvaluator.evaluate_recommendation directly 