#!/usr/bin/env python3
"""
Test actual tool execution in the CampusLifeBench Action system
"""

import sys
import os

# Add the src directory to Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
src_path = os.path.join(current_dir, 'Bench-main', 'src')
sys.path.insert(0, src_path)

def test_actual_tool_execution():
    """Test actual tool execution with real environment"""
    print("🔧 Testing Actual Tool Execution:")
    print("=" * 50)
    
    try:
        from tasks.instance.campus_life_bench.environment import CampusEnvironment
        from tasks.instance.campus_life_bench.action_executor import ActionExecutor
        
        # Create environment and executor
        env = CampusEnvironment()
        executor = ActionExecutor(env, ["email", "calendar", "map", "geography"])
        
        print(f"✅ Environment and executor created")
        print(f"   Available actions: {len(executor.get_available_actions())}")
        
        # Test 1: Email sending
        print("\n1. Testing email sending:")
        try:
            result = executor.execute_action('email.send_email(to="test@university.edu", subject="Test Email", body="This is a test email.")')
            print(f"   Status: {result.status}")
            print(f"   Message: {result.message}")
            print(f"   Success: {'✅' if result.is_success() else '❌'}")
        except Exception as e:
            print(f"   Error: {str(e)}")
        
        # Test 2: Geography - get current location
        print("\n2. Testing geography - get current location:")
        try:
            result = executor.execute_action('geography.get_current_location()')
            print(f"   Status: {result.status}")
            print(f"   Message: {result.message}")
            print(f"   Success: {'✅' if result.is_success() else '❌'}")
            if result.data:
                print(f"   Data: {result.data}")
        except Exception as e:
            print(f"   Error: {str(e)}")
        
        # Test 3: Map - find building
        print("\n3. Testing map - find building:")
        try:
            result = executor.execute_action('map.find_building_id(building_name="Grand Central Library")')
            print(f"   Status: {result.status}")
            print(f"   Message: {result.message}")
            print(f"   Success: {'✅' if result.is_success() else '❌'}")
            if result.data:
                print(f"   Data: {result.data}")
        except Exception as e:
            print(f"   Error: {str(e)}")
        
        # Test 4: Calendar - add event
        print("\n4. Testing calendar - add event:")
        try:
            result = executor.execute_action('calendar.add_event(calendar_id="self", event_title="Test Meeting", location="Office", time="Week 1, Monday, 14:00-15:00")')
            print(f"   Status: {result.status}")
            print(f"   Message: {result.message}")
            print(f"   Success: {'✅' if result.is_success() else '❌'}")
            if result.data:
                print(f"   Data: {result.data}")
        except Exception as e:
            print(f"   Error: {str(e)}")
        
        # Test 5: System availability enforcement
        print("\n5. Testing system availability enforcement:")
        limited_executor = ActionExecutor(env, ["email"])  # Only email allowed
        try:
            result = limited_executor.execute_action('map.find_building_id(building_name="Library")')
            print(f"   Status: {result.status}")
            print(f"   Message: {result.message}")
            print(f"   Should fail: {'✅' if result.is_failure() else '❌'}")
        except Exception as e:
            print(f"   Error: {str(e)}")
        
        return True
        
    except Exception as e:
        print(f"❌ Tool execution test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

def test_system_prompt_generation():
    """Test system prompt generation with different configurations"""
    print("\n📝 Testing System Prompt Generation:")
    print("=" * 50)
    
    try:
        from tasks.instance.campus_life_bench.system_prompt_generator import SystemPromptGenerator
        
        generator = SystemPromptGenerator()
        
        # Test different system combinations
        test_combinations = [
            (["email"], "Email-only"),
            (["email", "calendar"], "Email + Calendar"),
            (["map", "geography"], "Navigation"),
            (["email", "calendar", "map", "geography", "reservation"], "Multi-system"),
            (None, "All systems")
        ]
        
        for systems, description in test_combinations:
            print(f"\n{description}:")
            try:
                prompt = generator.generate_prompt(systems)
                print(f"   Length: {len(prompt)} characters")
                
                # Check for expected content
                if systems is None or "email" in (systems or []):
                    has_email = "Email System Tools" in prompt
                    print(f"   Contains Email Tools: {'✅' if has_email else '❌'}")
                
                if systems and "map" in systems:
                    has_map = "Map & Geography Tools" in prompt
                    print(f"   Contains Map Tools: {'✅' if has_map else '❌'}")
                elif systems and "map" not in systems:
                    has_map = "Map & Geography Tools" not in prompt
                    print(f"   Excludes Map Tools: {'✅' if has_map else '❌'}")
                
            except Exception as e:
                print(f"   Error: {str(e)}")
        
        return True
        
    except Exception as e:
        print(f"❌ System prompt generation test failed: {str(e)}")
        return False

def test_task_integration():
    """Test integration with CampusTask"""
    print("\n🎯 Testing Task Integration:")
    print("=" * 50)
    
    try:
        from tasks.instance.campus_life_bench.task import CampusTask, CampusDatasetItem
        
        # Create a test task
        task_data = {
            "task_id": "integration_test",
            "task_type": "email_sending",
            "instruction": "Send a test email",
            "available_systems": ["email", "calendar"]
        }
        
        dataset_item = CampusDatasetItem(**task_data)
        print(f"✅ Dataset item created: {dataset_item.task_id}")
        print(f"   Available systems: {dataset_item.available_systems}")
        
        # Test action parsing
        test_responses = [
            'Action: email.send_email(to="test@test.com", subject="Test", body="Hello")',
            'Action: finish()',
            'Invalid response format'
        ]
        
        for i, response in enumerate(test_responses, 1):
            result = CampusTask._parse_agent_response(response)
            print(f"{i}. Response: {response[:50]}...")
            print(f"   Action: {result.action}")
            print(f"   Content: {result.content}")
        
        return True
        
    except Exception as e:
        print(f"❌ Task integration test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

def test_english_enforcement():
    """Test English-only message enforcement"""
    print("\n🌍 Testing English-Only Enforcement:")
    print("=" * 50)
    
    try:
        from tasks.instance.campus_life_bench.tools import ensure_english_message
        
        # Test valid English messages
        valid_messages = [
            "Email sent successfully to recipient",
            "Building found: Grand Central Library",
            "Navigation completed to target location",
            "Event added to calendar successfully"
        ]
        
        print("Testing valid English messages:")
        for msg in valid_messages:
            try:
                result = ensure_english_message(msg)
                print(f"   ✅ '{msg[:30]}...' -> Valid")
            except Exception as e:
                print(f"   ❌ '{msg[:30]}...' -> Failed: {str(e)}")
                return False
        
        # Test invalid messages (should raise exceptions)
        invalid_messages = [
            "邮件发送成功",  # Chinese
            "Correo enviado exitosamente",  # Spanish with special characters
        ]
        
        print("\nTesting invalid messages (should be rejected):")
        for msg in invalid_messages:
            try:
                result = ensure_english_message(msg)
                print(f"   ❌ '{msg}' -> Should have been rejected but wasn't")
                return False
            except Exception:
                print(f"   ✅ '{msg}' -> Correctly rejected")
        
        return True
        
    except Exception as e:
        print(f"❌ English enforcement test failed: {str(e)}")
        return False

def main():
    """Run all execution tests"""
    print("🚀 CampusLifeBench Actual Execution Tests")
    print("=" * 70)
    
    tests = [
        ("Actual Tool Execution", test_actual_tool_execution),
        ("System Prompt Generation", test_system_prompt_generation),
        ("Task Integration", test_task_integration),
        ("English Enforcement", test_english_enforcement),
    ]
    
    results = []
    
    for test_name, test_func in tests:
        print(f"\n🧪 Running: {test_name}")
        try:
            result = test_func()
            results.append((test_name, result))
            status = "✅ PASS" if result else "❌ FAIL"
            print(f"   Result: {status}")
        except Exception as e:
            print(f"   Result: ❌ ERROR - {str(e)}")
            results.append((test_name, False))
    
    # Summary
    print(f"\n🎯 EXECUTION TEST RESULTS:")
    print("=" * 70)
    
    passed = 0
    for test_name, result in results:
        status = "✅ PASS" if result else "❌ FAIL"
        print(f"{status} {test_name}")
        if result:
            passed += 1
    
    print(f"\n📊 Overall: {passed}/{len(results)} tests passed ({passed/len(results)*100:.1f}%)")
    
    if passed == len(results):
        print("\n🎉 ALL EXECUTION TESTS PASSED!")
        print("✅ The CampusLifeBench Action-based system is fully operational!")
        print("\n🔧 Verified Capabilities:")
        print("   ✅ Real tool execution with environment")
        print("   ✅ Dynamic system prompt generation")
        print("   ✅ Task integration and parsing")
        print("   ✅ English-only message enforcement")
        print("   ✅ System availability enforcement")
        print("   ✅ Parameter parsing and validation")
        print("\n🚀 System is production-ready!")
    else:
        print(f"\n⚠️  {len(results) - passed} tests failed.")
        print("   Please review the failing tests and fix any issues.")

if __name__ == "__main__":
    main()
