
import os
import sys
import json
import time
import asyncio
import argparse
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, asdict
from datetime import datetime
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))

from mcp_agent.app import MCPApp
from mcp_agent.agents.agent import Agent
from mcp_agent.workflows.llm.augmented_llm_openai import OpenAIAugmentedLLM
from mcp_agent.workflows.llm.augmented_llm_plan_execute_simple import SimplePlanThenExecuteLLM
from src.mcp_agent.config_utils import merge_secrets_into_config
from dynamic_evaluation_framework import (
    DynamicEvaluationFramework, 
    AgentExecution, 
    DynamicEvaluationResult
)


@dataclass
class DynamicBenchmarkResult:
    """Outcome of one benchmark task: the agent run paired with its evaluation."""
    # Identifier of the task, copied from the task's 'id' entry
    task_id: str
    # Name of the task, copied from the task's 'name' entry
    task_name: str
    # Mirrors the agent execution's success flag on the normal path;
    # False when running or evaluating the task raised
    success: bool
    # Record of what the agent did while working on the task
    agent_execution: AgentExecution
    # Scores produced by the evaluation framework for this run
    evaluation_result: DynamicEvaluationResult
    # Message of the exception that aborted the benchmark, or None if it completed
    error: Optional[str] = None


class DynamicBenchmarkRunner:
    
    def __init__(self, tasks_file: str = "benchmark_tasks_v4.json"):
        self.tasks_file = tasks_file
        self.tasks = self._load_tasks()
        self.results_dir = Path("dynamic_benchmark_results")
        self.results_dir.mkdir(exist_ok=True)
        
        self.agent_output_dir = Path("agent_outputs")
        self.agent_output_dir.mkdir(exist_ok=True)
        
        self.ground_truth_output_dir = Path("ground_truth_outputs")
        self.ground_truth_output_dir.mkdir(exist_ok=True)
        
        self._setup_config()
        
        self.all_available_servers = self._get_all_servers_from_tasks()
        
        self.evaluation_framework = DynamicEvaluationFramework(
            available_servers=self.all_available_servers,
            config_path=str(self.temp_config_path),
            use_intelligent_selection=False,
            agent_output_dir=str(self.agent_output_dir),
            ground_truth_output_dir=str(self.ground_truth_output_dir)
        )
    
    def _setup_config(self):
        import yaml
        config_path = Path("mcp_agent.config.yaml")
        secrets_path = Path("mcp_agent.secrets.yaml")
        
        with open(config_path, 'r') as f:
            raw_config = yaml.safe_load(f)
        
        with open(secrets_path, 'r') as f:
            secrets = yaml.safe_load(f)
        
        merged_config = merge_secrets_into_config(raw_config, secrets)
        
        for key, value in secrets.items():
            if isinstance(value, str):
                os.environ[key.upper()] = value
            elif isinstance(value, dict):
                for sub_key, sub_value in value.items():
                    if isinstance(sub_value, str):
                        os.environ[f"{key.upper()}_{sub_key.upper()}"] = sub_value
        
        self.temp_config_path = Path("mcp_agent.config.temp.yaml")
        with open(self.temp_config_path, 'w') as f:
            yaml.dump(merged_config, f, default_flow_style=False)
        
        print("✅ Configuration loaded successfully")
    
    def _load_tasks(self) -> List[Dict[str, Any]]:
        try:
            with open(self.tasks_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return data.get('tasks', [])
        except Exception as e:
            print(f"❌ Failed to load task file: {e}")
            return []
    
    def _get_all_servers_from_tasks(self) -> List[str]:
        all_servers = set()
        for task in self.tasks:
            if 'available_servers' in task:
                all_servers.update(task['available_servers'])
        
        servers_list = sorted(list(all_servers))
        print(f"📊 Found {len(servers_list)} unique MCP servers from tasks")
        return servers_list
    
    def _modify_file_paths_for_agent(self, arguments: dict, task_id: str) -> dict:
        if not arguments:
            return arguments
            
        modified_args = arguments.copy()
        
        # File path parameter names that need modification
        path_params = ['path', 'file_path', 'filename', 'filepath', 'file', 'output_path']
        
        for param in path_params:
            if param in modified_args:
                original_path = modified_args[param]
                if original_path:
                    # Parse file path
                    path_obj = Path(original_path)
                    
                    # If absolute path, only take file name
                    if path_obj.is_absolute():
                        filename = path_obj.name
                    else:
                        filename = str(path_obj)
                    
                    # Build new path to agent_outputs directory
                    new_path = self.agent_output_dir / f"task_{task_id}" / filename
                    
                    # Ensure directory exists
                    new_path.parent.mkdir(parents=True, exist_ok=True)
                    
                    modified_args[param] = str(new_path)
                    
                    print(f"🔄 Agent file path modified: {original_path} -> {new_path}")
        
        return modified_args
    
    async def run_agent_execution(self, task: Dict[str, Any], execution_mode: str = "react") -> AgentExecution:
        """Execute Agent task"""
        print(f"\n🤖 Executing Agent: {task['name']} (ID: {task['id']})")
        print(f"📝 Description: {task['description']}")
        print(f"🔧 Execution mode: {execution_mode.upper()}")
        
        # Create task-specific output directory path
        task_output_dir = self.agent_output_dir / f"task_{task['id']}"
        print(f"📁 Agent output directory: {task_output_dir}")
        
        # Get task's available server list
        task_servers = task.get('available_servers', [])
        if not task_servers:
            print("⚠️ Task has no defined available_servers, using all available servers")
            task_servers = self.all_available_servers
        
        print(f"🖥️ Using {len(task_servers)} MCP servers: {task_servers[:5]}{'...' if len(task_servers) > 5 else ''}")
        
        start_time = time.time()
        tools_used = []
        tool_calls = []
        execution_steps = []  # New: Record execution steps
        full_conversation = []  # New: Record full conversation
        reasoning_trace = []  # New: Record reasoning trace
        initial_plan = ""  # New: Record initial plan
        
        try:
            # Create MCP application
            app = MCPApp(
                name=f"agent_{task['id']}",
                settings=str(self.temp_config_path)
            )
            
            async with app.run() as mcp_app:
                # Create Agent, only using task-specified servers
                agent = Agent(
                    name=f"agent_{task['id']}",
                    description=task['description'],
                    server_names=task_servers,  # Use task-specified server list
                    context=mcp_app.context
                )
                
                async with agent:
                    # Create a custom LLM class to track tool calls
                    current_step_index = 0  # Initialize step index
                    
                    class TrackingLLM(OpenAIAugmentedLLM if execution_mode == "react" else SimplePlanThenExecuteLLM):
                        def __init__(self, *args, **kwargs):
                            super().__init__(*args, **kwargs)
                            self._tool_calls = tool_calls
                            self._tools_used = tools_used
                            self._execution_steps = execution_steps
                            self._current_step_index = [current_step_index]  # Use list to allow modification in closure
                            self._start_time = start_time
                            self._task_id = task['id']
                            # Directly use outer class method
                            self._outer_instance = None  # Will be set later
                        
                        async def call_tool(self, request, tool_call_id=None):
                            """Override call_tool method to track tool calls"""
                            tool_name = request.params.name
                            arguments = request.params.arguments
                            call_start = time.time()
                            
                            # Record execution step start
                            step_info = {
                                "step_number": self._current_step_index[0] + 1,
                                "tool_name": tool_name,
                                "arguments": arguments,
                                "timestamp": time.time() - self._start_time,
                                "status": "started"
                            }
                            
                            try:
                                # Modify path for file operation tools
                                modified_arguments = arguments
                                if tool_name and ('write_file' in tool_name or 'create_file' in tool_name or 
                                                'edit_file' in tool_name or 'modify_file' in tool_name or
                                                'save_file' in tool_name or 'output' in tool_name):
                                    # Create modified request
                                    if self._outer_instance:
                                        modified_arguments = self._outer_instance._modify_file_paths_for_agent(arguments, self._task_id)
                                    else:
                                        # Fallback method: directly modify path
                                        modified_arguments = self._modify_file_paths_fallback(arguments, self._task_id)
                                    print(f"🔧 Agent tool call path modified: {tool_name}")
                                    
                                    # Create new request object
                                    from mcp.types import CallToolRequest
                                    modified_request = CallToolRequest(
                                        method=request.method,
                                        params={
                                            "name": tool_name,
                                            "arguments": modified_arguments
                                        }
                                    )
                                    request = modified_request
                                
                                # Call parent class call_tool
                                result = await super().call_tool(request, tool_call_id)
                                
                                self._tools_used.append(tool_name)
                                
                                # Record successful tool call
                                tool_call_record = {
                                    "tool_name": tool_name,
                                    "arguments": modified_arguments if 'modified_arguments' in locals() else arguments,
                                    "result": str(result)[:500] if result else "",  # Limit result length
                                    "success": True,
                                    "duration": time.time() - call_start
                                }
                                self._tool_calls.append(tool_call_record)
                                
                                # Update step information
                                step_info.update({
                                    "status": "completed",
                                    "result": str(result)[:200] if result else "",  # Brief result
                                    "duration": time.time() - call_start,
                                    "success": True
                                })
                                self._execution_steps.append(step_info)
                                self._current_step_index[0] += 1
                                
                                return result
                                
                            except Exception as e:
                                self._tool_calls.append({
                                    "tool_name": tool_name,
                                    "arguments": modified_arguments if 'modified_arguments' in locals() else arguments,
                                    "success": False,
                                    "error": str(e),
                                    "duration": time.time() - call_start
                                })
                                
                                # Record failed step
                                step_info.update({
                                    "status": "failed",
                                    "error": str(e),
                                    "duration": time.time() - call_start,
                                    "success": False
                                })
                                self._execution_steps.append(step_info)
                                
                                raise
                        
                        def _modify_file_paths_fallback(self, arguments: dict, task_id: str) -> dict:
                            """Fallback file path modification method"""
                            if not arguments:
                                return arguments
                                
                            modified_args = arguments.copy()
                            
                            # File path parameter names that need modification
                            path_params = ['path', 'file_path', 'filename', 'filepath', 'file', 'output_path']
                            
                            for param in path_params:
                                if param in modified_args:
                                    original_path = modified_args[param]
                                    if original_path:
                                        # Parse file path
                                        path_obj = Path(original_path)
                                        
                                        # If absolute path, only take file name
                                        if path_obj.is_absolute():
                                            filename = path_obj.name
                                        else:
                                            filename = str(path_obj)
                                        
                                        # Build new path to agent_outputs directory
                                        new_path = Path("agent_outputs") / f"task_{task_id}" / filename
                                        
                                        # Ensure directory exists
                                        new_path.parent.mkdir(parents=True, exist_ok=True)
                                        
                                        modified_args[param] = str(new_path)
                            
                            return modified_args
                    
                    # Set _outer_instance to TrackingLLM instance
                    TrackingLLM._outer_instance = self
                    
                    # Get brief information about available tools
                    available_tools_info = f"You have access to {len(task_servers)} MCP servers with their respective tools"
                    
                    # Select different prompt based on execution mode
                    if execution_mode == "plan_execute":
                        # Plan-Execute mode prompt - emphasize planning before execution
                        agent_prompt = f"""You are a systematic problem solver using MCP (Model Context Protocol) tools. Complete this task using a Plan-then-Execute approach.

TASK INFORMATION:
- User Query: {task['query']}
- Task Description: {task['description']}
- Available MCP Servers: {task_servers}

IMPORTANT FILE HANDLING:
- All files you create will be automatically stored in a dedicated output directory
- You don't need to worry about file path management - the system handles this automatically
- Simply use standard file names (e.g., "report.xlsx", "data.json") in your tool calls

MCP TOOL PLANNING METHODOLOGY:

Phase 1 - TOOL PLANNING (Strategic tool selection):
• Analyze available MCP servers and their tools
• Create a complete, step-by-step plan identifying:
  - Which specific MCP server and tool to use for each step
  - Expected inputs and outputs for each tool
  - Data flow between tool calls
  - Potential fallback tools if primary choice fails
• Consider tool composition - how outputs from one tool feed into another

Phase 2 - EXECUTION (Systematic tool orchestration):
• Execute your tool plan step by step
• Validate each tool's output before proceeding
• Handle tool errors gracefully

Phase 3 - SYNTHESIS (Compile results):
• Integrate outputs from all tool calls
• Ensure the final result addresses all requirements

OUTPUT FORMAT:
Start with: "MCP TOOL EXECUTION PLAN:"
List each step as: "Step N: [Server.Tool] - Purpose - Expected Output"
Then: "EXECUTING TOOL PLAN:" and implement each step.

Remember: Effective MCP tool planning is key to task success.

Begin planning your tool usage:"""
                    else:
                        # ReAct mode prompt - emphasize iteration and exploration
                        agent_prompt = f"""You are an adaptive problem solver using MCP (Model Context Protocol) tools. Complete this task using the ReAct (Reasoning-Acting) approach.

TASK INFORMATION:
- User Query: {task['query']}
- Task Description: {task['description']}
- Available MCP Servers: {task_servers}

IMPORTANT FILE HANDLING:
- All files you create will be automatically stored in a dedicated output directory
- You don't need to worry about file path management - the system handles this automatically
- Simply use standard file names (e.g., "report.xlsx", "data.json") in your tool calls

MCP TOOL PLANNING WITH REACT:

Your approach should dynamically plan and use MCP tools:

1. REASON about tool needs: "To accomplish X, I need server Y's tool Z because..."
2. SELECT appropriate MCP tool: Choose the most suitable server and tool for current need
3. EXECUTE tool call: Use the tool with proper parameters
4. OBSERVE results: Analyze what the tool returned
5. ADAPT your plan: Based on results, decide next tool to use

KEY PRINCIPLES FOR MCP TOOL USAGE:
• Each server has specific capabilities - match server to task
• Tools can fail - have backup strategies
• Tool outputs inform next steps - be responsive
• Some tasks need tool combinations - think holistically
• Explore available tools if unsure of capabilities

TOOL EXECUTION EXAMPLE:
"I need to find restaurants. The google-maps server has search_places tool..."
[IMMEDIATELY CALL google-maps.search_places with location parameters]
"The search returned 20 results. Now I need to save this to a file using filesystem server..."
[IMMEDIATELY CALL filesystem.write_file to save the results]

CRITICAL: You MUST execute tool calls immediately after identifying the need. Do NOT just describe what you plan to do - actually DO IT by making tool calls.

Start by executing your first tool call:"""
                    
                    # Record initial prompt
                    full_conversation.append({
                        "role": "system",
                        "content": agent_prompt,
                        "timestamp": 0
                    })
                    
                    # Select execution mode - using TrackingLLM
                    if execution_mode == "plan_execute":
                        llm = await agent.attach_llm(TrackingLLM)
                    else:
                        llm = await agent.attach_llm(TrackingLLM)
                    
                    # Execute task - let Agent plan freely
                    response = await llm.generate(agent_prompt)
                    
                    # Extract response text
                    response_text = self._extract_response_text(response)
                    
                    # Record complete response
                    full_conversation.append({
                        "role": "assistant",
                        "content": response_text,
                        "timestamp": time.time() - start_time
                    })
                    
                    # Try to extract initial plan and reasoning trace
                    initial_plan, reasoning_trace = self._extract_plan_and_reasoning(response_text, execution_mode)
                    
                    # Convert conversation to string
                    full_conversation_str = "\n\n".join([
                        f"[{msg['role'].upper()}] (T={msg['timestamp']:.2f}s):\n{msg['content']}"
                        for msg in full_conversation
                    ])
                    
                    # Get token usage
                    tokens_used = 0
                    if hasattr(response, 'usage') and response.usage:
                        tokens_used = response.usage.total_tokens
            
            execution_time = time.time() - start_time
            
            return AgentExecution(
                task_id=task['id'],
                tools_used=tools_used,
                tool_calls=tool_calls,
                final_output=response_text,
                execution_time=execution_time,
                success=True,
                tokens_used=tokens_used,
                full_conversation=full_conversation_str,
                initial_plan=initial_plan,
                execution_steps=execution_steps,
                reasoning_trace=reasoning_trace
            )
            
        except Exception as e:
            execution_time = time.time() - start_time
            print(f"❌ Agent execution failed: {e}")
            
            # Convert conversation to string (if available)
            full_conversation_str = ""
            if full_conversation:
                full_conversation_str = "\n\n".join([
                    f"[{msg['role'].upper()}] (T={msg['timestamp']:.2f}s):\n{msg['content']}"
                    for msg in full_conversation
                ])
            
            return AgentExecution(
                task_id=task['id'],
                tools_used=tools_used,
                tool_calls=tool_calls,
                final_output="",
                execution_time=execution_time,
                success=False,
                error=str(e),
                full_conversation=full_conversation_str,
                initial_plan=initial_plan,
                execution_steps=execution_steps,
                reasoning_trace=reasoning_trace
            )
    
    def _extract_response_text(self, response) -> str:
        """Extract response text"""
        if hasattr(response, 'content'):
            if isinstance(response.content, list) and len(response.content) > 0:
                first_content = response.content[0]
                if hasattr(first_content, 'text'):
                    return first_content.text
                else:
                    return str(first_content)
            else:
                return str(response.content)
        else:
            return str(response)
    
    def _extract_plan_and_reasoning(self, response_text: str, execution_mode: str) -> Tuple[str, List[str]]:
        """Extract initial plan and reasoning trace from response text"""
        initial_plan = ""
        reasoning_trace = []
        
        if execution_mode == "plan_execute":
            # Try to extract Plan-Execute mode plan
            if "MCP TOOL EXECUTION PLAN:" in response_text:
                plan_start = response_text.find("MCP TOOL EXECUTION PLAN:")
                plan_end = response_text.find("EXECUTING TOOL PLAN:")
                if plan_end == -1:
                    plan_end = response_text.find("\n\n", plan_start + 100)  # Find next paragraph
                if plan_end != -1:
                    initial_plan = response_text[plan_start:plan_end].strip()
        
        # Extract reasoning trace (ReAct mode or general)
        lines = response_text.split('\n')
        for line in lines:
            line = line.strip()
            # Identify reasoning patterns
            if any(keyword in line.lower() for keyword in ['reason:', 'reasoning:', 'thought:', 'thinking:', 
                                                           'observation:', 'observe:', 'plan:', 'planning:']):
                reasoning_trace.append(line)
            # Identify decision points
            elif any(keyword in line.lower() for keyword in ['i need to', 'i will', 'i should', 'next step',
                                                            'to accomplish', 'based on', 'therefore']):
                if len(line) > 20:  # Avoid too short sentences
                    reasoning_trace.append(line)
        
        return initial_plan, reasoning_trace
    
    async def run_single_task(self, task: Dict[str, Any], execution_mode: str = "react") -> DynamicBenchmarkResult:
        """Execute complete dynamic evaluation for single task"""
        print(f"\n{'='*80}")
        print(f"🎯 Dynamic Benchmark: {task['name']} (ID: {task['id']})")
        print(f"{'='*80}")
        
        try:
            # 1. Execute Agent
            agent_execution = await self.run_agent_execution(task, execution_mode)
            
            # 2. Dynamic evaluation (includes Ground Truth execution and LLM Judge)
            evaluation_result = await self.evaluation_framework.evaluate_task(task, agent_execution)
            
            # 3. Display evaluation result
            self.evaluation_framework.display_evaluation_result(evaluation_result)
            
            return DynamicBenchmarkResult(
                task_id=task['id'],
                task_name=task['name'],
                success=agent_execution.success,
                agent_execution=agent_execution,
                evaluation_result=evaluation_result
            )
            
        except Exception as e:
            print(f"❌ Dynamic evaluation failed: {e}")
            
            # Create failed execution result
            failed_agent_execution = AgentExecution(
                task_id=task['id'],
                tools_used=[],
                tool_calls=[],
                final_output="",
                execution_time=0,
                success=False,
                error=str(e)
            )
            
            # Create failed evaluation result
            failed_evaluation = DynamicEvaluationResult(
                task_id=task['id'],
                toolchain_precision=0.0,
                toolchain_recall=0.0,
                toolchain_f1=0.0,
                toolchain_metrics={},
                llm_content_quality=0.0,
                llm_task_completion=0.0,
                llm_accuracy=0.0,
                llm_overall_score=0.0,
                llm_judge_details={"error": str(e)},
                agent_execution_time=0.0,
                ground_truth_execution_time=0.0,
                time_ratio=0.0,
                agent_tokens_used=0,
                ground_truth_tokens_used=0,
                tokens_ratio=0.0,
                efficiency_metrics={},
                ground_truth_execution=None,
                agent_execution=failed_agent_execution
            )
            
            return DynamicBenchmarkResult(
                task_id=task['id'],
                task_name=task['name'],
                success=False,
                agent_execution=failed_agent_execution,
                evaluation_result=failed_evaluation,
                error=str(e)
            )
    
    async def run_all_tasks(self, execution_mode: str = "react", filter_category: str = None):
        """Run all tasks"""
        # Filter tasks
        tasks_to_run = self.tasks
        if filter_category:
            tasks_to_run = [t for t in self.tasks if t.get('category') == filter_category]
        
        print(f"\n🎯 Preparing to run {len(tasks_to_run)} dynamic evaluation tasks")
        print(f"🔬 Evaluation mode: Ground Truth + Agent + LLM Judge")
        print(f"📋 Using predefined MCP server list (not using intelligent tool selection)")
        
        results = []
        for i, task in enumerate(tasks_to_run, 1):
            print(f"\nProgress: {i}/{len(tasks_to_run)}")
            result = await self.run_single_task(task, execution_mode)
            results.append(result)
            
            # Display brief result
            self._display_result_summary(result)
        
        # Save results
        self._save_results(results)
        
        # Display summary
        self._display_summary(results)
        
        return results
    
    def _display_result_summary(self, result: DynamicBenchmarkResult):
        """Display single task result summary"""
        print(f"\n📊 Task '{result.task_name}' Summary:")
        print(f"   Status: {'✅ Success' if result.success else '❌ Failed'}")
        print(f"   Tool Chain F1: {result.evaluation_result.toolchain_f1:.2%}")
        print(f"   LLM Score: {result.evaluation_result.llm_overall_score:.2%}")
        print(f"   Time Ratio: {result.evaluation_result.time_ratio:.2f}x")
    
    def _display_summary(self, results: List[DynamicBenchmarkResult]):
        """Display overall result summary"""
        print(f"\n\n{'='*80}")
        print("📊 Dynamic Benchmark Summary")
        print(f"{'='*80}")
        
        total_tasks = len(results)
        successful_tasks = sum(1 for r in results if r.success)
        
        print(f"\nTask Completion Rate: {successful_tasks}/{total_tasks} ({successful_tasks/total_tasks:.1%})")
        
        # Calculate average scores
        if results:
            # Metric 1: Tool chain matching
            avg_precision = sum(r.evaluation_result.toolchain_precision for r in results) / len(results)
            avg_recall = sum(r.evaluation_result.toolchain_recall for r in results) / len(results)
            avg_f1 = sum(r.evaluation_result.toolchain_f1 for r in results) / len(results)
            
            # Metric 2: LLM Judge score
            avg_content_quality = sum(r.evaluation_result.llm_content_quality for r in results) / len(results)
            avg_task_completion = sum(r.evaluation_result.llm_task_completion for r in results) / len(results)
            avg_accuracy = sum(r.evaluation_result.llm_accuracy for r in results) / len(results)
            avg_llm_score = sum(r.evaluation_result.llm_overall_score for r in results) / len(results)
            
            # Metric 3: Efficiency metrics
            valid_time_ratios = [r.evaluation_result.time_ratio for r in results if r.evaluation_result.time_ratio != float('inf')]
            avg_time_ratio = sum(valid_time_ratios) / len(valid_time_ratios) if valid_time_ratios else 0
            
            valid_token_ratios = [r.evaluation_result.tokens_ratio for r in results if r.evaluation_result.tokens_ratio != float('inf')]
            avg_token_ratio = sum(valid_token_ratios) / len(valid_token_ratios) if valid_token_ratios else 0
            
            print(f"\n📌 Metric 1: Tool Chain Matching (Average)")
            print(f"  • Precision: {avg_precision:.2%}")
            print(f"  • Recall: {avg_recall:.2%}")
            print(f"  • F1 Score: {avg_f1:.2%}")
            
            print(f"\n🎭 Metric 2: LLM Judge Score (Average)")
            print(f"  • Content Quality: {avg_content_quality:.2%}")
            print(f"  • Task Completion: {avg_task_completion:.2%}")
            print(f"  • Relative Accuracy: {avg_accuracy:.2%}")
            print(f"  • Overall Score: {avg_llm_score:.2%}")
            
            print(f"\n⚡ Metric 3: Efficiency Metrics (Average)")
            if valid_time_ratios:
                print(f"  • Time Ratio: {avg_time_ratio:.2f}x")
            if valid_token_ratios:
                print(f"  • Token Ratio: {avg_token_ratio:.2f}x")
        
        # Tool usage statistics
        all_agent_tools = []
        all_gt_tools = []
        for r in results:
            all_agent_tools.extend(r.agent_execution.tools_used)
            if r.evaluation_result.ground_truth_execution:
                all_gt_tools.extend(r.evaluation_result.ground_truth_execution.tools_used)
        
        from collections import Counter
        agent_tool_counts = Counter(all_agent_tools)
        gt_tool_counts = Counter(all_gt_tools)
        
        print(f"\n🔧 Agent Most Used Tools:")
        for tool, count in agent_tool_counts.most_common(5):
            print(f"  • {tool}: {count} times")
        
        print(f"\n🔬 Ground Truth Most Used Tools:")
        for tool, count in gt_tool_counts.most_common(5):
            print(f"  • {tool}: {count} times")
        
        # Group by category
        category_scores = {}
        for result in results:
            # Find corresponding task to get category
            task = next((t for t in self.tasks if t['id'] == result.task_id), None)
            if task:
                category = task.get('category', 'unknown')
                if category not in category_scores:
                    category_scores[category] = {
                        'f1_scores': [],
                        'llm_scores': [],
                        'time_ratios': []
                    }
                category_scores[category]['f1_scores'].append(result.evaluation_result.toolchain_f1)
                category_scores[category]['llm_scores'].append(result.evaluation_result.llm_overall_score)
                if result.evaluation_result.time_ratio != float('inf'):
                    category_scores[category]['time_ratios'].append(result.evaluation_result.time_ratio)
        
        if category_scores:
            print(f"\n📈 Scores by Category:")
            for category, scores in category_scores.items():
                avg_f1 = sum(scores['f1_scores']) / len(scores['f1_scores'])
                avg_llm = sum(scores['llm_scores']) / len(scores['llm_scores'])
                avg_time = sum(scores['time_ratios']) / len(scores['time_ratios']) if scores['time_ratios'] else 0
                
                print(f"\n  {category} ({len(scores['f1_scores'])} tasks):")
                print(f"    • Tool Chain F1: {avg_f1:.2%}")
                print(f"    • LLM Score: {avg_llm:.2%}")
                if scores['time_ratios']:
                    print(f"    • Time Ratio: {avg_time:.2f}x")
    
    def _save_results(self, results: List[DynamicBenchmarkResult]):
        """Save results to file"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = self.results_dir / f"dynamic_benchmark_results_{timestamp}.json"
        
        # Helper function: Handle infinity values
        def sanitize_for_json(obj):
            """Recursively handle infinity values in object to make it JSON serializable"""
            if isinstance(obj, dict):
                return {k: sanitize_for_json(v) for k, v in obj.items()}
            elif isinstance(obj, list):
                return [sanitize_for_json(v) for v in obj]
            elif isinstance(obj, float):
                if obj == float('inf') or obj == float('-inf'):
                    return None
                elif obj != obj:  # NaN
                    return None
                else:
                    return obj
            else:
                return obj
        
        # Prepare data to save
        results_data = {
            "timestamp": timestamp,
            "tasks_file": self.tasks_file,
            "evaluation_framework": "dynamic_llm_judge_with_predefined_servers",
            "total_tasks": len(results),
            "successful_tasks": sum(1 for r in results if r.success),
            "results": []
        }
        
        for result in results:
            # Find corresponding task to get more information
            task = next((t for t in self.tasks if t['id'] == result.task_id), None)
            
            result_dict = {
                "task_id": result.task_id,
                "task_name": result.task_name,
                "task_query": task.get('query', '') if task else '',
                "task_description": task.get('description', '') if task else '',
                "task_category": task.get('category', 'unknown') if task else 'unknown',
                "task_difficulty": task.get('difficulty', 'unknown') if task else 'unknown',
                "success": result.success,
                "error": result.error,
                
                # Agent execution trajectory - complete record
                "agent_trajectory": {
                    # Basic information
                    "execution_time": result.agent_execution.execution_time,
                    "tokens_used": result.agent_execution.tokens_used,
                    "tools_used": result.agent_execution.tools_used,
                    "tool_calls_count": len(result.agent_execution.tool_calls),
                    
                    # Full conversation history
                    "full_conversation": result.agent_execution.full_conversation,
                    
                    # Initial plan (if any)
                    "initial_plan": result.agent_execution.initial_plan,
                    
                    # Reasoning trace
                    "reasoning_trace": result.agent_execution.reasoning_trace,
                    
                    # Structured execution steps
                    "execution_steps": result.agent_execution.execution_steps,
                    
                    # Detailed tool call records
                    "tool_calls": result.agent_execution.tool_calls,
                    
                    # Final output
                    "final_output": result.agent_execution.final_output,
                    
                    # Execution success status
                    "success": result.agent_execution.success,
                    "error": result.agent_execution.error
                },
                
                # Ground Truth execution trajectory (if available)
                "ground_truth_trajectory": {
                    "execution_time": result.evaluation_result.ground_truth_execution.execution_time,
                    "tokens_used": result.evaluation_result.ground_truth_execution.tokens_used,
                    "tools_used": result.evaluation_result.ground_truth_execution.tools_used,
                    "tool_calls_count": len(result.evaluation_result.ground_truth_execution.tool_calls),
                    
                    # Full conversation history
                    "full_conversation": result.evaluation_result.ground_truth_execution.full_conversation,
                    
                    # Structured execution steps
                    "execution_steps": result.evaluation_result.ground_truth_execution.execution_steps,
                    
                    # Detailed tool call records
                    "tool_calls": result.evaluation_result.ground_truth_execution.tool_calls,
                    
                    # Final output
                    "final_output": result.evaluation_result.ground_truth_execution.final_output,
                    
                    # Execution success status
                    "success": result.evaluation_result.ground_truth_execution.success,
                    "error": result.evaluation_result.ground_truth_execution.error
                } if result.evaluation_result.ground_truth_execution else None,
                
                # Evaluation scores
                "evaluation_scores": {
                    # Metric 1: Tool chain matching
                    "toolchain_metrics": {
                        "precision": result.evaluation_result.toolchain_precision,
                        "recall": result.evaluation_result.toolchain_recall,
                        "f1": result.evaluation_result.toolchain_f1,
                        "details": result.evaluation_result.toolchain_metrics
                    },
                    # Metric 2: LLM Judge score
                    "llm_judge_metrics": {
                        "content_quality": result.evaluation_result.llm_content_quality,
                        "task_completion": result.evaluation_result.llm_task_completion,
                        "accuracy": result.evaluation_result.llm_accuracy,
                        "overall_score": result.evaluation_result.llm_overall_score,
                        "execution_trace_alignment": result.evaluation_result.llm_judge_details.get("raw_alignment_scores", {}).get("execution_trace_alignment", None),
                        "details": result.evaluation_result.llm_judge_details
                    },
                    # Metric 3: Efficiency metrics
                    "efficiency_metrics": sanitize_for_json({
                        "agent_execution_time": result.evaluation_result.agent_execution_time,
                        "ground_truth_execution_time": result.evaluation_result.ground_truth_execution_time,
                        "time_ratio": result.evaluation_result.time_ratio,
                        "agent_tokens": result.evaluation_result.agent_tokens_used,
                        "ground_truth_tokens": result.evaluation_result.ground_truth_tokens_used,
                        "tokens_ratio": result.evaluation_result.tokens_ratio,
                        "details": result.evaluation_result.efficiency_metrics
                    })
                }
            }
            results_data["results"].append(result_dict)
        
        # Add summary statistics
        if results:
            # Calculate average scores
            avg_toolchain_f1 = sum(r["evaluation_scores"]["toolchain_metrics"]["f1"] for r in results_data["results"]) / len(results)
            avg_llm_score = sum(r["evaluation_scores"]["llm_judge_metrics"]["overall_score"] for r in results_data["results"]) / len(results)
            
            # Calculate execution trace alignment average (if available)
            trace_alignments = [r["evaluation_scores"]["llm_judge_metrics"]["execution_trace_alignment"] 
                               for r in results_data["results"] 
                               if r["evaluation_scores"]["llm_judge_metrics"]["execution_trace_alignment"] is not None]
            avg_trace_alignment = sum(trace_alignments) / len(trace_alignments) if trace_alignments else None
            
            results_data["summary"] = {
                "average_toolchain_f1": avg_toolchain_f1,
                "average_llm_score": avg_llm_score,
                "average_execution_trace_alignment": avg_trace_alignment,
                "success_rate": sum(1 for r in results if r.success) / len(results)
            }
        
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(results_data, f, indent=2, ensure_ascii=False)
        
        print(f"\n💾 Dynamic evaluation results saved to: {filename}")
        print(f"   - Contains complete Agent execution trajectory")
        print(f"   - Contains Ground Truth execution trajectory")
        print(f"   - Contains detailed evaluation scores and feedback")


async def main():
    """Parse command-line arguments and run the benchmark.

    Run mode is resolved as follows: ``--task-id`` always selects single-task
    mode; otherwise an explicit ``--mode`` is honoured; otherwise all tasks
    are run.

    In single-task mode the task is looked up by id in the runner's task
    list, executed, and its result saved. In all-task mode the run is
    delegated to ``runner.run_all_tasks`` with the optional category filter.

    Exits with status 1 when ``--mode single`` is given without
    ``--task-id`` or when the requested task id does not exist.
    """
    parser = argparse.ArgumentParser(description='Dynamic MCP Tool Planning Benchmark Runner (using predefined servers)')
    parser.add_argument('--tasks', default='benchmark_tasks_v4.json', help='Task file path')
    parser.add_argument('--mode', choices=['all', 'single'], help='Run mode (auto-detected)')
    parser.add_argument('--task-id', help='Single task ID (for single mode)')
    parser.add_argument('--execution', choices=['react', 'plan_execute'], default='react', help='Execution mode')
    parser.add_argument('--category', help='Filter tasks by category')

    args = parser.parse_args()

    # Auto-detect run mode: a task id implies single mode, then an explicit
    # --mode, and finally the default of running everything.
    if args.task_id:
        mode = 'single'
    else:
        mode = args.mode or 'all'

    # Validate arguments before building the runner: constructing it already
    # creates output directories and sets up the evaluation framework.
    if mode == 'single' and not args.task_id:
        print("❌ single mode requires --task-id")
        sys.exit(1)

    # Create runner
    runner = DynamicBenchmarkRunner(tasks_file=args.tasks)

    print("✅ Using predefined MCP server list - intelligent tool selection disabled")

    if mode != 'single':
        # Run all tasks
        await runner.run_all_tasks(args.execution, args.category)
        return

    print(f"\n🎯 Single task mode: {args.task_id}")

    # Find task
    task = next((t for t in runner.tasks if t['id'] == args.task_id), None)
    if task is None:
        print(f"❌ Task ID not found: {args.task_id}")
        sys.exit(1)

    result = await runner.run_single_task(task, args.execution)

    # Save single task result
    runner._save_results([result])


# Script entry point: drive the async CLI with asyncio.run when this file is
# executed directly (nothing runs on import).
if __name__ == "__main__":
    asyncio.run(main())
