
import json
import time
import asyncio
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass
import os
from pathlib import Path
from openai import AsyncOpenAI

from mcp_agent.app import MCPApp
from mcp_agent.agents.agent import Agent
from mcp_agent.workflows.llm.augmented_llm_openai import OpenAIAugmentedLLM
from mcp_agent.logging.logger import get_logger

logger = get_logger(__name__)

@dataclass
class GroundTruthExecution:
    """Outcome of one Ground Truth (reference) run of a task.

    ``execution_steps`` may be omitted or passed as ``None``; either way the
    instance ends up with its own fresh empty list.
    """
    task_id: str
    tools_used: List[str]  # Names of the tools that were invoked
    tool_calls: List[Dict[str, Any]]  # One record per tool invocation
    final_output: str  # Text of the final response
    execution_time: float  # Wall-clock duration in seconds
    success: bool
    tokens_used: int = 0  # Token count (0 when not reported)
    error: Optional[str] = None  # Error message when the run failed
    full_conversation: str = ""  # Full conversation history
    # None is only a sentinel for "not supplied": a list literal cannot be a
    # dataclass default, so the real default is created in __post_init__.
    execution_steps: Optional[List[Dict[str, Any]]] = None  # Structured execution steps

    def __post_init__(self):
        # Give every instance its own list, including when None was passed explicitly.
        if self.execution_steps is None:
            self.execution_steps = []


@dataclass
class AgentExecution:
    """Outcome of one agent run of a task, including its execution trace.

    ``execution_steps`` and ``reasoning_trace`` may be omitted or passed as
    ``None``; either way each instance ends up with its own fresh empty list.
    """
    task_id: str
    tools_used: List[str]  # Names of the tools that were invoked
    tool_calls: List[Dict[str, Any]]  # One record per tool invocation
    final_output: str  # Text of the final response
    execution_time: float  # Wall-clock duration in seconds
    success: bool
    tokens_used: int = 0  # Token count (0 when not reported)
    error: Optional[str] = None  # Error message when the run failed
    # Full execution trace
    full_conversation: str = ""  # Full conversation history
    initial_plan: str = ""  # Agent's initial plan (if any)
    # None is only a sentinel for "not supplied": a list literal cannot be a
    # dataclass default, so the real defaults are created in __post_init__.
    execution_steps: Optional[List[Dict[str, Any]]] = None  # Structured execution steps
    reasoning_trace: Optional[List[str]] = None  # Reasoning trace

    def __post_init__(self):
        # Give every instance its own lists, including when None was passed explicitly.
        if self.execution_steps is None:
            self.execution_steps = []
        if self.reasoning_trace is None:
            self.reasoning_trace = []


@dataclass
class DynamicEvaluationResult:
    """Combined evaluation record for a single task.

    Bundles the LLM judge scores, the efficiency comparison between the agent
    run and the Ground Truth run, and the two execution records themselves.
    All fields are required; the class defines no defaults.
    """
    task_id: str


    # Metric 1: LLM Judge Score
    llm_content_quality: float      # Content quality
    llm_task_completion: float      # Task completion
    llm_accuracy: float             # Relative accuracy
    llm_overall_score: float        # LLM overall score
    llm_judge_details: Dict[str, Any]  # Detailed feedback

    # Metric 2: Efficiency metrics
    agent_execution_time: float     # Agent execution time (seconds)
    ground_truth_execution_time: float  # Ground Truth execution time (seconds)
    time_ratio: float              # Time ratio (Agent/GT)
    agent_tokens_used: int         # Tokens used by Agent
    ground_truth_tokens_used: int  # Tokens used by Ground Truth
    tokens_ratio: float            # Token ratio (Agent/GT)
    efficiency_metrics: Dict[str, Any]  # Detailed information

    # Execution records the metrics above were derived from
    ground_truth_execution: GroundTruthExecution
    agent_execution: AgentExecution


class GroundTruthExecutor:
    """Ground Truth Executor - Uses strongest model to execute standard tool chain"""
    
    def __init__(self, available_servers: List[str], use_intelligent_selection: bool = True, output_dir: str = ""):
        self.available_servers = available_servers
        self.openai_client = self._setup_openai_client()
        # Ground Truth doesn't need intelligent selection, directly uses predefined tool chain
        self.use_intelligent_selection = False  # Ignore this parameter
        self.tool_selector = None
        self.output_dir = output_dir
    
    def _setup_openai_client(self):
        """Create the ``AsyncOpenAI`` client used by this executor.

        The API key is read from the ``OPENAI_API_KEY`` environment variable.
        When that is unset or empty, ``mcp_agent.secrets.yaml`` in the current
        working directory is consulted: ``openai.api_key`` first, then a
        top-level ``OPENAI_API_KEY`` entry. The file lookup is best-effort and
        never raises; an unexpected failure is reported with a warning.

        Returns:
            ``AsyncOpenAI(api_key=...)`` when a key was found, otherwise
            ``AsyncOpenAI()`` with no arguments.
        """
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            secrets_data = None
            try:
                import yaml
                with open("mcp_agent.secrets.yaml", "r", encoding="utf-8") as f:
                    secrets_data = yaml.safe_load(f)
            except (ImportError, FileNotFoundError):
                # Expected situations: PyYAML is not installed or there is no
                # secrets file. Fall through to the default client silently.
                pass
            except Exception as exc:
                # Unreadable or malformed file: keep going, but say so instead
                # of hiding it. KeyboardInterrupt/SystemExit are not caught.
                print(f"⚠️ Could not read mcp_agent.secrets.yaml: {exc}")

            # safe_load returns None for an empty file and may return a
            # non-mapping, so check the shape before calling .get().
            if isinstance(secrets_data, dict):
                openai_section = secrets_data.get("openai")
                if isinstance(openai_section, dict):
                    api_key = openai_section.get("api_key")
                api_key = api_key or secrets_data.get("OPENAI_API_KEY")

        if api_key:
            return AsyncOpenAI(api_key=api_key)
        return AsyncOpenAI()
    
    def _modify_file_paths_for_gt(self, arguments: dict, task_id: str) -> dict:
        """Modify file path for Ground Truth, store to ground_truth_outputs directory"""
        if not arguments:
            return arguments
            
        modified_args = arguments.copy()
        
        # File path parameter names that need modification
        path_params = ['path', 'file_path', 'filename', 'filepath', 'file', 'output_path']
        
        for param in path_params:
            if param in modified_args:
                original_path = modified_args[param]
                if original_path:
                    # Parse file path
                    path_obj = Path(original_path)
                    
                    # If absolute path, only take file name
                    if path_obj.is_absolute():
                        filename = path_obj.name
                    else:
                        filename = str(path_obj)
                    
                    # Build new path to ground_truth_outputs directory
                    if self.output_dir:
                        new_path = Path(self.output_dir) / f"task_{task_id}" / filename
                    else:
                        # If output directory not set, use original _gt suffix logic
                        if path_obj.suffix:
                            new_name = path_obj.stem + "_gt" + path_obj.suffix
                        else:
                            new_name = path_obj.name + "_gt"
                        new_path = path_obj.parent / new_name
                    
                    # Ensure directory exists
                    if self.output_dir:
                        new_path.parent.mkdir(parents=True, exist_ok=True)
                    
                    modified_args[param] = str(new_path)
                    
                    print(f"🔄 GT file path modified: {original_path} -> {new_path}")
        
        return modified_args
    
    async def execute_ground_truth(self, task: Dict[str, Any], config_path: str) -> GroundTruthExecution:
        """Run the task's Ground Truth tool chain through an LLM-driven agent and record the result.

        Steps:
          1. Work out which servers the tool chain needs.
          2. Start an ``MCPApp`` and an ``Agent`` restricted to those servers.
          3. Replace ``agent.call_tool`` with a tracking wrapper that records
             every call and redirects file-writing tools via
             ``_modify_file_paths_for_gt``.
          4. Send the prompt from ``_build_ground_truth_prompt`` to the
             attached LLM and collect the response.

        Args:
            task: Task definition. This method reads ``name``, ``id``,
                ``description`` and ``ground_truth['tool_chain']`` (a list of
                steps with a ``tool`` key and optional ``purpose``); the
                prompt builder reads further keys.
            config_path: Passed to ``MCPApp`` as ``settings``.

        Returns:
            A ``GroundTruthExecution``. If anything inside the ``try`` block
            raises, the result has ``success=False``, ``error`` set to the
            exception text, empty ``final_output``/``full_conversation``, and
            whatever tool calls and steps were recorded before the failure.

        Note:
            The two ``print`` calls before the ``try`` block index ``task``
            directly, so a task missing ``name`` or
            ``ground_truth['tool_chain']`` raises instead of returning a
            failed result.
        """
        print(f"\n🔬 Executing Ground Truth: {task['name']}")
        print(f"📋 Ground Truth tool chain: {[step['tool'] for step in task['ground_truth']['tool_chain']]}")
        if self.output_dir:
            task_output_dir = Path(self.output_dir) / f"task_{task['id']}"
            print(f"📁 Ground Truth output directory: {task_output_dir}")

        start_time = time.time()
        # Accumulators are created before the try block so the failure path
        # can still return whatever was recorded.
        tools_used = []
        tool_calls = []
        execution_steps = []  # Structured per-call records
        full_conversation = []  # Prompt and response messages

        try:
            # Extract required servers from Ground Truth tool chain
            required_servers = set()
            for step in task['ground_truth']['tool_chain']:
                tool_name = step['tool']
                # Extract server name (tool name format: server.tool or server_tool)
                if '.' in tool_name:
                    server_name = tool_name.split('.')[0]
                elif '_' in tool_name:
                    # For underscore format, need to match available server list
                    for server in self.available_servers:
                        if tool_name.startswith(server + '_'):
                            server_name = server
                            break
                    else:
                        # for/else: no server prefix matched, might be standalone tool
                        server_name = tool_name
                else:
                    server_name = tool_name

                # Only servers from the available list are ever loaded
                if server_name in self.available_servers:
                    required_servers.add(server_name)

            # If no servers identified, add common ones
            if not required_servers:
                # Guess possible required servers based on keywords in the tool name.
                # These guesses are not checked against self.available_servers.
                for step in task['ground_truth']['tool_chain']:
                    tool_name = step['tool'].lower()
                    if 'filesystem' in tool_name or 'write_file' in tool_name:
                        required_servers.add('filesystem')
                    elif 'google' in tool_name or 'maps' in tool_name:
                        required_servers.add('google-maps')
                    elif 'weather' in tool_name:
                        required_servers.add('weather')
                    elif 'sqlite' in tool_name:
                        required_servers.add('sqlite')
                    elif 'duckdb' in tool_name:
                        required_servers.add('duckdb')

            print(f"🎯 Ground Truth only loads required servers: {sorted(required_servers)}")

            # Create dedicated Ground Truth Agent, only using required servers
            app = MCPApp(
                name=f"ground_truth_{task['id']}",
                settings=config_path
            )

            async with app.run() as mcp_app:
                # Create Agent, only using servers required by Ground Truth
                agent = Agent(
                    name=f"ground_truth_agent_{task['id']}",
                    description=f"Ground Truth Executor: {task['description']}",
                    server_names=list(required_servers),  # Only use required servers
                    context=mcp_app.context
                )

                async with agent:
                    # Keep the real call_tool so the wrapper below can delegate to it
                    original_call_tool = agent.call_tool
                    current_step_index = 0  # Index of the next expected tool-chain step

                    async def track_tool_call(tool_name, arguments=None):
                        """Call the real tool, recording the call and redirecting file paths."""
                        nonlocal tools_used, tool_calls, execution_steps, current_step_index
                        call_start = time.time()

                        # Record execution step start
                        step_info = {
                            "step_number": current_step_index + 1,
                            "tool_name": tool_name,
                            "arguments": arguments,
                            "timestamp": time.time() - start_time,
                            "status": "started"
                        }

                        # Attach the Ground Truth step at the same position, if
                        # there is one. Pairing is by call order only; the tool
                        # actually called is not compared to the expected one.
                        if current_step_index < len(task['ground_truth']['tool_chain']):
                            gt_step = task['ground_truth']['tool_chain'][current_step_index]
                            step_info["purpose"] = gt_step.get('purpose', '')
                            step_info["expected_tool"] = gt_step.get('tool', '')

                        try:
                            # Modify path for file operation tools
                            modified_arguments = arguments
                            if tool_name and ('write_file' in tool_name or 'create_file' in tool_name or 
                                            'edit_file' in tool_name or 'modify_file' in tool_name or
                                            'save_file' in tool_name or 'output' in tool_name):
                                modified_arguments = self._modify_file_paths_for_gt(arguments, task['id'])
                                print(f"🔧 GT tool call path modified: {tool_name}")

                            result = await original_call_tool(tool_name, modified_arguments)
                            # tools_used only lists calls that returned without raising
                            tools_used.append(tool_name)

                            # Record successful tool call
                            tool_call_record = {
                                "tool_name": tool_name,
                                "arguments": modified_arguments,  # Save modified parameters
                                "result": str(result)[:500],  # Limit result length
                                "success": True,
                                "duration": time.time() - call_start
                            }
                            tool_calls.append(tool_call_record)

                            # Update step information
                            step_info.update({
                                "status": "completed",
                                "result": str(result)[:200],  # Brief result
                                "duration": time.time() - call_start,
                                "success": True
                            })
                            execution_steps.append(step_info)
                            # The step index advances on success only, so a call
                            # after a failure is paired with the same step again.
                            current_step_index += 1

                            return result
                        except Exception as e:
                            tool_calls.append({
                                "tool_name": tool_name,
                                # modified_arguments is assigned on the first line of the
                                # try block, so the locals() check is a defensive guard.
                                "arguments": modified_arguments if 'modified_arguments' in locals() else arguments,
                                "success": False,
                                "error": str(e),
                                "duration": time.time() - call_start
                            })

                            # Record failed step
                            step_info.update({
                                "status": "failed",
                                "error": str(e),
                                "duration": time.time() - call_start,
                                "success": False
                            })
                            execution_steps.append(step_info)

                            # Re-raise so the caller of call_tool still sees the failure
                            raise

                    # Route every tool call made through this agent via the tracker
                    agent.call_tool = track_tool_call

                    # Attach the OpenAI-backed LLM (the original note says GPT-4o;
                    # the model is not chosen in this method)
                    llm = await agent.attach_llm(OpenAIAugmentedLLM)

                    # Build Ground Truth execution prompt (includes file path modification description)
                    gt_prompt = self._build_ground_truth_prompt(task)

                    # Record initial prompt
                    full_conversation.append({
                        "role": "system",
                        "content": gt_prompt,
                        "timestamp": 0
                    })

                    # Execute task
                    response = await llm.generate(gt_prompt)

                    # Extract response text
                    response_text = self._extract_response_text(response)

                    # Record complete response
                    full_conversation.append({
                        "role": "assistant",
                        "content": response_text,
                        "timestamp": time.time() - start_time
                    })

                    # Convert conversation to string
                    full_conversation_str = "\n\n".join([
                        f"[{msg['role'].upper()}] (T={msg['timestamp']:.2f}s):\n{msg['content']}"
                        for msg in full_conversation
                    ])

                    # Token usage is read only when the response exposes a
                    # truthy `usage` attribute; otherwise it stays 0
                    tokens_used = 0
                    if hasattr(response, 'usage') and response.usage:
                        tokens_used = response.usage.total_tokens

            # Measured after the app and agent contexts have exited
            execution_time = time.time() - start_time

            return GroundTruthExecution(
                task_id=task['id'],
                tools_used=tools_used,
                tool_calls=tool_calls,
                final_output=response_text,
                execution_time=execution_time,
                success=True,
                tokens_used=tokens_used,
                full_conversation=full_conversation_str,
                execution_steps=execution_steps
            )

        except Exception as e:
            execution_time = time.time() - start_time
            print(f"❌ Ground Truth execution failed: {e}")
            logger.error(f"Ground Truth execution failed: {e}", exc_info=True)

            # Failed result: keeps the tool calls and steps recorded so far,
            # but no output text and no conversation
            return GroundTruthExecution(
                task_id=task['id'],
                tools_used=tools_used,
                tool_calls=tool_calls,
                final_output="",
                execution_time=execution_time,
                success=False,
                error=str(e),
                full_conversation="",
                execution_steps=execution_steps
            )
    
    def _build_ground_truth_prompt(self, task: Dict[str, Any]) -> str:
        """Build Ground Truth execution prompt"""
        tool_chain = task['ground_truth']['tool_chain']
        
        # Build tool chain execution guidance
        tool_instructions = []
        for i, step in enumerate(tool_chain, 1):
            tool_instructions.append(
                f"{i}. Use {step['tool']} - {step['purpose']}\n"
                f"   Parameter hints: {step.get('params_hint', 'N/A')}"
            )
        
        prompt = f"""You are an Obedient Execution Engine. Your sole purpose is to execute a pre-defined plan without deviation. You must not add, skip, or re-order any steps.

## TASK CONTEXT
- **User Query:** {task['query']}
- **Task Description:** {task['description']}

## MANDATORY EXECUTION PROTOCOL
You are mandated to execute the following tool chain in the exact sequence provided.

{chr(10).join(tool_instructions)}

## IMPORTANT FILE HANDLING
**CRITICAL**: All files you create will be automatically stored in a dedicated Ground Truth output directory. You don't need to modify file names or paths - the system handles this automatically.

For example:
- If you create "report.md", it will be stored in the Ground Truth directory
- If you create "data.json", it will be stored in the Ground Truth directory
- Simply use standard file names as you would normally

## EXECUTION AND REPORTING FORMAT
For each step in the protocol, you must:
1.  **Announce the Step**: State which step you are about to execute (e.g., "Executing Step 1: ...").
2.  **Execute the Tool Call**: Perform the specified tool call.
3.  **Synthesize Final Answer**: After all steps are completed, provide a final, comprehensive answer.

"""
        
        return prompt
    
    def _extract_response_text(self, response) -> str:
        """Extract response text"""
        if hasattr(response, 'content'):
            if isinstance(response.content, list) and len(response.content) > 0:
                first_content = response.content[0]
                if hasattr(first_content, 'text'):
                    return first_content.text
                else:
                    return str(first_content)
            else:
                return str(response.content)
        else:
            return str(response)


class LLMJudge:
    """LLM Judge - Compare Agent output with Ground Truth result, supports file content comparison"""
    
    def __init__(self):
        """Create the judge together with the OpenAI client stored on ``self.openai_client``."""
        self.openai_client = self._setup_openai_client()
    
    def _setup_openai_client(self):
        """Create the ``AsyncOpenAI`` client used by the judge.

        The API key is read from the ``OPENAI_API_KEY`` environment variable.
        When that is unset or empty, ``mcp_agent.secrets.yaml`` in the current
        working directory is consulted: ``openai.api_key`` first, then a
        top-level ``OPENAI_API_KEY`` entry. The file lookup is best-effort and
        never raises; an unexpected failure is reported with a warning.

        Returns:
            ``AsyncOpenAI(api_key=...)`` when a key was found, otherwise
            ``AsyncOpenAI()`` with no arguments.
        """
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            secrets_data = None
            try:
                import yaml
                with open("mcp_agent.secrets.yaml", "r", encoding="utf-8") as f:
                    secrets_data = yaml.safe_load(f)
            except (ImportError, FileNotFoundError):
                # Expected situations: PyYAML is not installed or there is no
                # secrets file. Fall through to the default client silently.
                pass
            except Exception as exc:
                # Unreadable or malformed file: keep going, but say so instead
                # of hiding it. KeyboardInterrupt/SystemExit are not caught.
                print(f"⚠️ Could not read mcp_agent.secrets.yaml: {exc}")

            # safe_load returns None for an empty file and may return a
            # non-mapping, so check the shape before calling .get().
            if isinstance(secrets_data, dict):
                openai_section = secrets_data.get("openai")
                if isinstance(openai_section, dict):
                    api_key = openai_section.get("api_key")
                api_key = api_key or secrets_data.get("OPENAI_API_KEY")

        if api_key:
            return AsyncOpenAI(api_key=api_key)
        return AsyncOpenAI()
    
    def _collect_execution_artifacts(self, execution: Any, prefix: str) -> Dict[str, Any]:
        """Collect all files and content produced during execution"""
        artifacts = {
            "text_output": execution.final_output,
            "files_created": [],
            "files_modified": [],
            "tool_outputs": []
        }
        
        # Extract file operations from tool calls
        for tool_call in execution.tool_calls:
            tool_name = tool_call.get("tool_name", "")
            arguments = tool_call.get("arguments", {})
            result = tool_call.get("result", "")
            
            # Record tool output
            artifacts["tool_outputs"].append({
                "tool": tool_name,
                "arguments": arguments,
                "result": result,
                "success": tool_call.get("success", False)
            })
            
            # Detect file operations
            if "write_file" in tool_name or "create_file" in tool_name:
                file_path = arguments.get("path") or arguments.get("file_path") or arguments.get("filename")
                if file_path:
                    # Check if file exists
                    full_path = Path(file_path)
                    if full_path.exists():
                        artifacts["files_created"].append({
                            "path": str(full_path),
                            "size": full_path.stat().st_size,
                            "type": self._detect_file_type(full_path)
                        })
            
            elif "modify_file" in tool_name or "update_file" in tool_name:
                file_path = arguments.get("path") or arguments.get("file_path") or arguments.get("filename")
                if file_path:
                    full_path = Path(file_path)
                    if full_path.exists():
                        artifacts["files_modified"].append({
                            "path": str(full_path),
                            "size": full_path.stat().st_size,
                            "type": self._detect_file_type(full_path)
                        })
        
        return artifacts
    
    def _detect_file_type(self, file_path: Path) -> str:
        """Detect file type"""
        suffix = file_path.suffix.lower()
        
        if suffix in ['.txt', '.md', '.json', '.yaml', '.yml', '.csv', '.log']:
            return 'text'
        elif suffix in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg']:
            return 'image'
        elif suffix in ['.pdf']:
            return 'pdf'
        elif suffix in ['.html', '.htm']:
            return 'html'
        elif suffix in ['.py', '.js', '.java', '.cpp', '.c', '.go', '.rs']:
            return 'code'
        else:
            return 'unknown'
    
    async def _upload_file_to_openai(self, file_path: str) -> Optional[str]:
        """Upload file to OpenAI and return file ID"""
        try:
            with open(file_path, 'rb') as file:
                response = await self.openai_client.files.create(
                    file=file,
                    purpose='assistants'
                )
                return response.id
        except Exception as e:
            print(f"⚠️ File upload failed {file_path}: {e}")
            return None
    
    async def _prepare_evaluation_content_with_trace(self, 
                           task: Dict[str, Any], 
                           agent_artifacts: Dict[str, Any],
                           gt_artifacts: Dict[str, Any],
                           agent_execution: AgentExecution,
                           ground_truth_execution: GroundTruthExecution) -> Tuple[List[Dict], List[str]]:
        """Prepare evaluation content, including text and files, and execution trace comparison"""
        messages = []
        file_ids = []
        
        # Base system message
        system_message = {
            "role": "system",
            "content": """You are an advanced AI evaluation expert specializing in comprehensive task completion assessment. You analyze:
1. Execution traces (planning, reasoning, tool usage sequence)
2. Final outputs (text and files)
3. Process efficiency and correctness

Your primary focus is on EXECUTION TRACE ALIGNMENT - how the agent's problem-solving process compares to the ground truth approach.

For tasks WITH tangible outputs (files, data, etc.), evaluate both process AND results.
For tasks WITHOUT tangible outputs (information queries), focus primarily on the execution process."""
        }
        messages.append(system_message)
        
        # Format execution trace
        def format_execution_trace(execution, label):
            trace_parts = []
            
            # If has full conversation history
            if execution.full_conversation:
                trace_parts.append(f"=== {label} Full Conversation ===\n{execution.full_conversation}\n")
            
            # If has initial plan (Agent only)
            if hasattr(execution, 'initial_plan') and execution.initial_plan:
                trace_parts.append(f"=== {label} Initial Plan ===\n{execution.initial_plan}\n")
            
            # If has structured execution steps
            if execution.execution_steps:
                trace_parts.append(f"=== {label} Execution Steps ===")
                for step in execution.execution_steps:
                    trace_parts.append(
                        f"Step {step.get('step_number', '?')}: {step.get('tool_name', 'Unknown tool')}\n"
                        f"  Purpose: {step.get('purpose', 'N/A')}\n"
                        f"  Status: {step.get('status', 'unknown')}\n"
                        f"  Duration: {step.get('duration', 0):.2f}s"
                    )
                    if step.get('error'):
                        trace_parts.append(f"  Error: {step['error']}")
                trace_parts.append("")
            
            # Tool call details
            if execution.tool_calls:
                trace_parts.append(f"=== {label} Tool Calls Detail ===")
                for i, call in enumerate(execution.tool_calls, 1):
                    trace_parts.append(
                        f"{i}. {call['tool_name']}\n"
                        f"   Args: {json.dumps(call.get('arguments', {}), indent=2)}\n"
                        f"   Success: {call.get('success', False)}\n"
                        f"   Result preview: {str(call.get('result', ''))[:100]}..."
                    )
                trace_parts.append("")
            
            return "\n".join(trace_parts)
        
        # Determine if has tangible output
        has_tangible_output = (
            len(agent_artifacts['files_created']) > 0 or 
            len(gt_artifacts['files_created']) > 0 or
            (agent_execution.final_output and len(agent_execution.final_output.strip()) > 50) or
            (ground_truth_execution.final_output and len(ground_truth_execution.final_output.strip()) > 50)
        )
        
        # Build detailed evaluation prompt
        evaluation_prompt = f"""## TASK EVALUATION REQUEST

**Task Information:**
- Name: {task['name']}
- Description: {task['description']}
- User Query: {task['query']}
- Task Type: {'Has Tangible Output' if has_tangible_output else 'Information/Process Only'}

**Evaluation Objective:**
Compare the AI Agent's execution trace and outputs with the Ground Truth reference.

## EXECUTION TRACES

### GROUND TRUTH EXECUTION TRACE
{format_execution_trace(ground_truth_execution, "Ground Truth")}

### AGENT EXECUTION TRACE
{format_execution_trace(agent_execution, "Agent")}

## FINAL OUTPUTS

**Ground Truth Output:**
```
{gt_artifacts['text_output']}
```
Files Created: {len(gt_artifacts['files_created'])}

**Agent Output:**
```
{agent_artifacts['text_output']}
```
Files Created: {len(agent_artifacts['files_created'])}

## EVALUATION CRITERIA

{"### For this task WITH tangible outputs, evaluate BOTH execution trace AND final results:" if has_tangible_output else "### For this task WITHOUT tangible outputs, focus primarily on execution trace:"}

Rate the alignment on a scale of 0.0 to 1.0 for each dimension:

1. **Execution Trace Alignment** (0.0-1.0):
   - Tool selection appropriateness
   - Execution order logic
   - Parameter choices
   - Error handling
   - Overall problem-solving approach
   Weight: {"40%" if has_tangible_output else "60%"}

2. **Content Alignment** (0.0-1.0):
   - Final output correctness
   - Information completeness
   - Format consistency
   Weight: {"30%" if has_tangible_output else "20%"}

3. **Task Completion Alignment** (0.0-1.0):
   - Requirements fulfillment
   - Goal achievement
   Weight: {"20%" if has_tangible_output else "15%"}

4. **Overall Quality** (0.0-1.0):
   - Holistic assessment
   Weight: {"10%" if has_tangible_output else "5%"}

## REQUIRED JSON OUTPUT

Return a JSON object with this exact structure:

```json
{{
    "execution_trace_alignment": 0.85,
    "content_alignment": 0.90,
    "structure_alignment": 0.85,
    "task_completion_alignment": 0.80,
    "overall_quality": 0.85,
    "alignment_summary": "Brief summary focusing on execution trace comparison",
    "key_differences": [
        "Process difference 1",
        "Tool usage difference 2"
    ],
    "strengths": [
        "Good tool selection",
        "Efficient execution"
    ],
    "weaknesses": [
        "Different approach than ground truth",
        "Missing optimization"
    ],
    "detailed_analysis": {{
        "execution_trace_analysis": "Detailed comparison of execution approaches, tool sequences, and decision-making",
        "content_details": "Analysis of output content alignment",
        "structure_details": "Analysis of output structure",
        "task_completion_details": "Analysis of requirement fulfillment",
        "file_analysis": "Analysis of file outputs if applicable"
    }},
    "improvement_suggestions": [
        "Follow ground truth tool sequence more closely",
        "Optimize parameter selection"
    ]
}}
```

IMPORTANT: Focus your analysis on the EXECUTION TRACE - how the agent solved the problem compared to ground truth."""
        
        user_message = {
            "role": "user",
            "content": evaluation_prompt
        }
        messages.append(user_message)
        
        # Upload files (if any)
        all_files = agent_artifacts['files_created'] + gt_artifacts['files_created']
        
        if all_files:
            print(f"📎 Preparing to upload {len(all_files)} files for evaluation...")
            
            file_info = []
            for file_info_dict in all_files:
                file_path = file_info_dict['path']
                file_type = file_info_dict['type']
                
                # Only upload text type files (formats supported by GPT-4o)
                if file_type in ['text', 'code', 'html'] and Path(file_path).exists():
                    file_id = await self._upload_file_to_openai(file_path)
                    if file_id:
                        file_ids.append(file_id)
                        file_info.append(f"- {file_path} (Type: {file_type}, ID: {file_id})")
                        print(f"  ✅ Uploaded: {file_path}")
                    else:
                        print(f"  ❌ Upload failed: {file_path}")
                else:
                    # For files that don't support upload, try to read content
                    if file_type == 'text' and Path(file_path).exists():
                        try:
                            with open(file_path, 'r', encoding='utf-8') as f:
                                content = f.read()
                                if len(content) < 10000:  # Limit file size
                                    file_info.append(f"- {file_path} (Content included below)")
                                    # Add file content to messages
                                    messages.append({
                                        "role": "user",
                                        "content": f"**File Content: {file_path}**\n```\n{content}\n```"
                                    })
                                else:
                                    file_info.append(f"- {file_path} (Too large to include)")
                        except Exception as e:
                            file_info.append(f"- {file_path} (Could not read: {e})")
            
            if file_info:
                files_message = {
                    "role": "user", 
                    "content": f"**Additional Files for Analysis:**\n" + "\n".join(file_info)
                }
                messages.append(files_message)
        
        return messages, file_ids
    
    async def judge_results(self,
                          task: Dict[str, Any],
                          ground_truth_execution: GroundTruthExecution,
                          agent_execution: AgentExecution) -> Dict[str, Any]:
        """Use an LLM to judge the Agent result relative to the Ground Truth run.

        Collects the artifacts of both executions, builds the evaluation
        messages (file content and execution traces), asks ``gpt-4o`` for a
        JSON verdict and converts that verdict to the legacy result layout.

        Args:
            task: Task definition; forwarded to the prompt builder and to the
                fallback evaluator.
            ground_truth_execution: Reference execution to compare against.
            agent_execution: Agent execution being judged.

        Returns:
            A dict with ``content_quality``, ``task_completion``,
            ``accuracy_vs_ground_truth``, textual feedback, ``tokens_used``,
            ``files_analyzed`` and the raw judge output under
            ``raw_alignment_scores``. If any step raises, the result of
            ``self._fallback_evaluation`` is returned instead.
        """

        print(f"🎭 Starting LLM Judge evaluation (supports file content and execution trace comparison)...")

        # Bound before the try block so the `finally` clean-up can always run,
        # even when the failure happens before any file was uploaded.
        file_ids: List[str] = []

        try:
            # Collect execution artifacts
            agent_artifacts = self._collect_execution_artifacts(agent_execution, "agent")
            gt_artifacts = self._collect_execution_artifacts(ground_truth_execution, "gt")

            print(f"📊 Agent artifacts: {len(agent_artifacts['files_created'])} files, {len(agent_artifacts['tool_outputs'])} tool calls")
            print(f"📊 Ground Truth artifacts: {len(gt_artifacts['files_created'])} files, {len(gt_artifacts['tool_outputs'])} tool calls")

            # Prepare evaluation content (includes execution trace)
            messages, file_ids = await self._prepare_evaluation_content_with_trace(
                task, agent_artifacts, gt_artifacts,
                agent_execution, ground_truth_execution
            )

            # Call GPT-4o for evaluation
            response = await self.openai_client.chat.completions.create(
                model="gpt-4o",
                messages=messages,
                temperature=0.1,
                response_format={"type": "json_object"}
            )

            result = json.loads(response.choices[0].message.content)
            result["tokens_used"] = response.usage.total_tokens if response.usage else 0
            result["files_analyzed"] = len(file_ids)

            # The judge may answer with null or a non-object here; treat that as
            # "no details" rather than discarding the whole verdict.
            details = result.get("detailed_analysis")
            if not isinstance(details, dict):
                details = {}

            # Convert to original format for compatibility
            compatible_result = {
                "content_quality": result.get("content_alignment", 0.5),
                "task_completion": result.get("task_completion_alignment", 0.5),
                "accuracy_vs_ground_truth": result.get("overall_quality", 0.5),
                "overall_assessment": result.get("alignment_summary", ""),
                "strengths": result.get("strengths", []),
                "weaknesses": result.get("weaknesses", []),
                "specific_feedback": {
                    "content_quality_details": details.get("content_details", ""),
                    "task_completion_details": details.get("task_completion_details", ""),
                    "accuracy_details": details.get("file_analysis", ""),
                    "structure_details": details.get("structure_details", ""),
                    "execution_trace_analysis": details.get("execution_trace_analysis", "")
                },
                "improvement_suggestions": result.get("improvement_suggestions", []),
                "tokens_used": result["tokens_used"],
                "files_analyzed": result["files_analyzed"],
                "raw_alignment_scores": result  # Save original alignment scores
            }

            print(f"✅ LLM Judge evaluation complete, analyzed {result['files_analyzed']} files and execution traces")
            return compatible_result

        except Exception as e:
            print(f"❌ LLM Judge evaluation failed: {e}")
            import traceback
            traceback.print_exc()
            return self._fallback_evaluation(task, ground_truth_execution, agent_execution)

        finally:
            # Clean up uploaded files on every exit path (success, fallback or
            # cancellation) so failed evaluations do not leave files behind.
            # Deletion stays best-effort: a failure is reported, never raised.
            for file_id in file_ids:
                try:
                    await self.openai_client.files.delete(file_id)
                except Exception as cleanup_error:
                    print(f"  ⚠️ Could not delete uploaded file {file_id}: {cleanup_error}")
    
    def _fallback_evaluation(self,
                           task: Dict[str, Any],
                           ground_truth: GroundTruthExecution,
                           agent: AgentExecution) -> Dict[str, Any]:
        """Return rule-based scores for use when the LLM judge is unavailable.

        Only ``agent.success`` influences the numbers; ``task`` and
        ``ground_truth`` are accepted but not read.

        Returns:
            A dict shaped like the judge result (scores, feedback texts and
            ``tokens_used`` of 0), with fixed placeholder feedback.
        """
        # Simple rule-based evaluation: two fixed score pairs keyed on success.
        if agent.success:
            content_score, completion_score = 0.6, 0.7
            strength_note = "Agent executed successfully"
        else:
            content_score, completion_score = 0.2, 0.3
            strength_note = ""

        placeholder = "Fallback evaluation"
        feedback = {
            f"{aspect}_details": placeholder
            for aspect in ("content_quality", "task_completion", "accuracy", "creativity")
        }

        scores: Dict[str, Any] = {}
        scores["content_quality"] = content_score
        scores["task_completion"] = completion_score
        scores["accuracy_vs_ground_truth"] = 0.5  # Medium accuracy
        scores["creativity_bonus"] = 0.0
        scores["overall_assessment"] = "Evaluation system failed, using fallback scoring"
        scores["strengths"] = [strength_note]
        scores["weaknesses"] = ["LLM Judge unavailable"]
        scores["specific_feedback"] = feedback
        scores["improvement_suggestions"] = ["Fix LLM Judge system"]
        scores["tokens_used"] = 0
        return scores


class ToolchainAnalyzer:
    """Tool Chain Analyzer - Calculate Precision/Recall/F1 metrics.

    Tool names are first normalized (server prefixes removed, known aliases
    unified) so that e.g. ``filesystem_write_file``, ``filesystem.write_file``
    and ``write_file`` all compare equal, then the two tool sets are compared.
    """

    # Known MCP server names, used to recognise server prefixes on tool names.
    # Kept as a public attribute because earlier versions exposed it on the
    # instance after the first normalization call.
    known_servers: List[str] = [
        # Core tools
        "filesystem", "fetch", "playwright", "agentql", "github", "git",
        # Databases
        "sqlite", "duckdb",
        # Communication
        "gmail",
        # Information services
        "weather", "tavily-search", "google-maps",
        # Finance
        "alpha-vantage", "google-news", "exchange-rate", "qr-generator",
        # Tools
        "calculator", "markitdown", "mermaid-doc",
        # Advanced
        "exa", "request", "yt-dlp-server",
        # Testing
        "everything", "sequential-thinking",
        # Memory and time
        "memory", "time",
        # Content and media
        "tmdb-movies", "wikipedia", "spoonacular", "national-parks", "sports-data",
        # Creative
        "image-generation",
        # Utilities
        "ip-geolocation", "giphy", "musicbrainz", "tinyurl", "public-holidays", "arxiv",
        # Communication platforms
        "discord", "slack",
        # Development platforms
        "gitlab", "huggingface", "youtube-data", "notion",
        # Document processing
        "pdf-tools",
        # Computation
        "code-interpreter", "wolfram-alpha",
        # Academic
        "google-scholar",
        # Travel
        "flight-tracker",
        # Terminal and office
        "tmux", "cli-executor", "leetcode", "excel"
    ]

    # Server prefixes that can still be present after the first stripping pass
    # (e.g. "git.git_commit" -> "git_commit").
    _RESIDUAL_PREFIXES = ('git_', 'filesystem_', 'sqlite_', 'request_')

    # Alias -> canonical tool name. The keys double as the list of tool names
    # that are recognised without a server prefix.
    _TOOL_VARIANTS: Dict[str, str] = {
        # Git related
        'git_init': 'init',
        'git_add': 'add',
        'git_commit': 'commit',
        'git_branch': 'branch',
        'git_checkout': 'checkout',
        'git_log': 'log',
        'git_create_branch': 'create_branch',

        # Filesystem related
        'write_file': 'write_file',
        'read_file': 'read_file',
        'edit_file': 'edit_file',
        'list_directory': 'list_directory',
        'move_file': 'move_file',

        # SQLite related
        'create_table': 'create_table',
        'insert': 'insert',
        'query': 'query',
        'read_query': 'query',

        # Request related
        'get': 'get',
        'post': 'post',
        'request': 'get',  # Generic request maps to get

        # Google Maps related
        'geocode': 'geocode',
        'search_places': 'search_places',
        'maps_geocode': 'geocode',
        'maps_search_places': 'search_places',
    }

    def analyze_toolchain(self,
                         ground_truth_tools: List[str],
                         agent_tools: List[str]) -> Dict[str, Any]:
        """Compare the agent's tools with the ground-truth tools.

        Both lists are normalized and de-duplicated before comparison, so
        call order and repeated calls do not affect the result.

        Args:
            ground_truth_tools: Tool names of the reference tool chain.
            agent_tools: Tool names the agent actually used.

        Returns:
            A dict with ``precision``, ``recall`` and ``f1`` (floats), the
            sorted lists ``correct_tools``, ``ground_truth_tools`` and
            ``agent_tools``, and the counts ``num_correct``,
            ``num_ground_truth`` and ``num_agent``. When both sides are empty
            all three scores are 1.0.
        """

        print(f"\n🔍 Tool Chain Analysis:")
        print(f"Ground Truth tools (raw): {ground_truth_tools}")
        print(f"Agent tools (raw): {agent_tools}")

        # Normalize tool names
        gt_tools_normalized = self._normalize_tool_names(ground_truth_tools)
        agent_tools_normalized = self._normalize_tool_names(agent_tools)

        print(f"Ground Truth tools (normalized): {gt_tools_normalized}")
        print(f"Agent tools (normalized): {agent_tools_normalized}")

        # Convert to sets for calculation
        gt_set = set(gt_tools_normalized)
        agent_set = set(agent_tools_normalized)

        # Calculate intersection (tools Agent got right)
        correct_tools = gt_set & agent_set
        num_correct = len(correct_tools)

        print(f"Correct tools (intersection): {correct_tools}")
        print(f"Number of correct tools: {num_correct}")

        # Precision: share of the agent's tools that were expected.
        if agent_set:
            precision = num_correct / len(agent_set)
        else:
            precision = 0.0 if gt_set else 1.0

        # Recall: share of the expected tools the agent used.
        if gt_set:
            recall = num_correct / len(gt_set)
        else:
            recall = 0.0 if agent_set else 1.0

        # F1: harmonic mean, defined as 0 when both components are 0.
        if precision + recall > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0.0

        # Lists are sorted so reports are stable across runs (set iteration
        # order of strings varies between interpreter invocations).
        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "correct_tools": sorted(correct_tools),
            "ground_truth_tools": sorted(gt_set),
            "agent_tools": sorted(agent_set),
            "num_correct": num_correct,
            "num_ground_truth": len(gt_set),
            "num_agent": len(agent_set)
        }

    def _normalize_tool_names(self, tools: List[str]) -> List[str]:
        """Strip server prefixes from tool names and unify known aliases.

        Handled formats: ``server.tool``, ``server_tool``,
        ``server_server_tool`` and bare tool names.

        Args:
            tools: Raw tool names.

        Returns:
            A list of the same length and order with normalized names.
        """
        normalized = []

        # Every server name in its hyphen and underscore spelling.
        server_prefixes = set()
        for server in self.known_servers:
            server_prefixes.update(
                (server, server.replace('-', '_'), server.replace('_', '-'))
            )

        # Longest prefixes first, so "github_" is not shadowed by "git_".
        # Computed once per call instead of once per tool.
        sorted_prefixes = sorted(server_prefixes, key=len, reverse=True)

        for tool in tools:
            original = tool
            tool_name = tool

            # 1. Handle dot-separated format: server.tool_name
            if '.' in tool:
                head, _, tail = tool.partition('.')
                if head in server_prefixes:
                    tool_name = tail  # Everything after the server
                else:
                    tool_name = tool.rsplit('.', 1)[-1]  # Last part only

            # 2. Handle underscore-separated format: server_tool_name or server_server_tool_name
            elif '_' in tool:
                for prefix in sorted_prefixes:
                    marker = prefix + '_'
                    if tool.startswith(marker):
                        remaining = tool[len(marker):]
                        # Duplicate server prefix, like git_git_init
                        if remaining.startswith(marker):
                            remaining = remaining[len(marker):]
                        tool_name = remaining
                        break
                else:
                    # No known server matched. A recognised bare tool name
                    # (e.g. "write_file") is kept whole; previously its first
                    # segment was stripped, so "write_file" and "read_file"
                    # both collapsed to "file". Anything else is assumed to
                    # be "<server>_<tool>".
                    if tool not in self._TOOL_VARIANTS:
                        tool_name = tool.split('_', 1)[1]

            # 3. Further clean up tool name
            # Remove common duplicate prefixes
            if tool_name.startswith('maps_') and 'maps' in original.lower():
                tool_name = tool_name[5:]

            # Remove possible residual server prefixes (at most one)
            for prefix in self._RESIDUAL_PREFIXES:
                if tool_name.startswith(prefix):
                    tool_name = tool_name[len(prefix):]
                    break

            # 4. Final normalization: unify tool name variants
            tool_name = self._standardize_tool_variants(tool_name)

            normalized.append(tool_name)

            # Debug print
            if original != tool_name:
                print(f"  Normalized: '{original}' -> '{tool_name}'")

        return normalized

    def _standardize_tool_variants(self, tool_name: str) -> str:
        """Map a known alias to its canonical tool name.

        Names without an entry in the variants table are returned unchanged.
        """
        return self._TOOL_VARIANTS.get(tool_name, tool_name)


class DynamicEvaluationFramework:
    """Dynamic Evaluation Framework Main Class"""
    
    def __init__(self, available_servers: List[str], config_path: str, use_intelligent_selection: bool = True, 
                 agent_output_dir: str = "agent_outputs", ground_truth_output_dir: str = "ground_truth_outputs"):
        """Set up the framework and its collaborators.

        Args:
            available_servers: Server names handed to the Ground Truth executor.
            config_path: Configuration path passed on when executing Ground Truth.
            use_intelligent_selection: Stored on the instance; the Ground Truth
                executor is always created with intelligent selection disabled.
            agent_output_dir: Directory for agent outputs; created if missing.
            ground_truth_output_dir: Directory for Ground Truth outputs;
                created if missing.
        """
        self.available_servers = available_servers
        self.config_path = config_path
        self.use_intelligent_selection = use_intelligent_selection
        self.agent_output_dir = Path(agent_output_dir)
        self.ground_truth_output_dir = Path(ground_truth_output_dir)

        # Ensure output directories exist. parents=True lets nested paths such
        # as "runs/2024/agent_outputs" work when the parents are missing too.
        self.agent_output_dir.mkdir(parents=True, exist_ok=True)
        self.ground_truth_output_dir.mkdir(parents=True, exist_ok=True)

        # Ground Truth executor doesn't use intelligent selection, directly uses predefined tool chain
        self.ground_truth_executor = GroundTruthExecutor(
            available_servers, 
            use_intelligent_selection=False,
            output_dir=str(self.ground_truth_output_dir)
        )
        self.llm_judge = LLMJudge()
        self.toolchain_analyzer = ToolchainAnalyzer()
    
    async def evaluate_task(self,
                          task: Dict[str, Any],
                          agent_execution: AgentExecution) -> DynamicEvaluationResult:
        """Evaluate a single task.

        Runs the Ground Truth tool chain, compares the agent's tools with the
        task's reference tool chain, asks the LLM judge for alignment scores
        and derives time/token efficiency figures.

        Args:
            task: Task definition. Reads ``task["name"]``, ``task["id"]`` and
                ``task["ground_truth"]["tool_chain"]`` (each step needs a
                ``"tool"`` entry).
            agent_execution: The agent run to evaluate.

        Returns:
            The populated ``DynamicEvaluationResult``. If the Ground Truth
            execution reports failure, the result of
            ``self._fallback_evaluation`` is returned instead.
        """

        print(f"\n🎯 Dynamic Evaluation: {task['name']}")

        # 1. Execute Ground Truth
        ground_truth_execution = await self.ground_truth_executor.execute_ground_truth(
            task, self.config_path
        )

        if not ground_truth_execution.success:
            print(f"⚠️ Ground Truth execution failed, using fallback evaluation")
            return self._fallback_evaluation(task, agent_execution)

        # 2. Tool chain analysis
        gt_tools = [step["tool"] for step in task["ground_truth"]["tool_chain"]]
        toolchain_analysis = self.toolchain_analyzer.analyze_toolchain(
            gt_tools, agent_execution.tools_used
        )

        # 3. LLM Judge evaluation
        llm_judge_result = await self.llm_judge.judge_results(
            task, ground_truth_execution, agent_execution
        )

        # 4. Calculate the LLM judge score.
        # NOTE(review): tool-chain and judge scores are reported separately;
        # no combined 30%/70% score is computed in this method.

        # Adjust weights based on whether task has tangible output
        has_tangible_output = (
            len(agent_execution.final_output.strip()) > 50 or
            any('write_file' in call.get('tool_name', '') for call in agent_execution.tool_calls)
        )

        # Falls back to a neutral 0.5 when the judge gave no trace alignment
        # (e.g. when the judge's own fallback result is in use).
        trace_alignment = llm_judge_result.get("raw_alignment_scores", {}).get("execution_trace_alignment", 0.5)

        if has_tangible_output:
            # Tasks with tangible output: execution trace 40%, content 30%, task completion 20%, overall quality 10%
            w_trace, w_content, w_completion, w_accuracy = 0.40, 0.30, 0.20, 0.10
        else:
            # Tasks without tangible output: execution trace 60%, content 20%, task completion 15%, overall quality 5%
            w_trace, w_content, w_completion, w_accuracy = 0.60, 0.20, 0.15, 0.05

        llm_judge_score = (
            trace_alignment * w_trace +
            llm_judge_result["content_quality"] * w_content +
            llm_judge_result["task_completion"] * w_completion +
            llm_judge_result["accuracy_vs_ground_truth"] * w_accuracy
        )

        # Calculate efficiency metrics. A ratio is inf when the Ground Truth
        # baseline is 0, i.e. there is nothing to compare against.
        gt_time = ground_truth_execution.execution_time
        gt_tokens = ground_truth_execution.tokens_used
        time_ratio = agent_execution.execution_time / gt_time if gt_time > 0 else float('inf')
        tokens_ratio = agent_execution.tokens_used / gt_tokens if gt_tokens > 0 else float('inf')

        def _efficiency(ratio: float) -> float:
            # Inverse of the ratio. Reported as 0.0 when it cannot be computed:
            # inf means no baseline, and 0 means the agent value was 0 (for
            # tokens that is the dataclass default), which previously raised
            # ZeroDivisionError.
            if ratio > 0 and ratio != float('inf'):
                return 1.0 / ratio
            return 0.0

        efficiency_metrics = {
            "agent_time": agent_execution.execution_time,
            "ground_truth_time": gt_time,
            "time_ratio": time_ratio,
            "agent_tokens": agent_execution.tokens_used,
            "ground_truth_tokens": gt_tokens,
            "tokens_ratio": tokens_ratio,
            "time_efficiency": _efficiency(time_ratio),
            "token_efficiency": _efficiency(tokens_ratio)
        }

        return DynamicEvaluationResult(
            task_id=task["id"],
            toolchain_precision=toolchain_analysis["precision"],
            toolchain_recall=toolchain_analysis["recall"],
            toolchain_f1=toolchain_analysis["f1"],
            toolchain_metrics=toolchain_analysis,
            llm_content_quality=llm_judge_result["content_quality"],
            llm_task_completion=llm_judge_result["task_completion"],
            llm_accuracy=llm_judge_result["accuracy_vs_ground_truth"],
            llm_overall_score=llm_judge_score,
            llm_judge_details=llm_judge_result,
            agent_execution_time=agent_execution.execution_time,
            ground_truth_execution_time=ground_truth_execution.execution_time,
            time_ratio=time_ratio,
            agent_tokens_used=agent_execution.tokens_used,
            ground_truth_tokens_used=ground_truth_execution.tokens_used,
            tokens_ratio=tokens_ratio,
            efficiency_metrics=efficiency_metrics,
            ground_truth_execution=ground_truth_execution,
            agent_execution=agent_execution
        )
    
    def _fallback_evaluation(self, task: Dict[str, Any], agent_execution: AgentExecution) -> DynamicEvaluationResult:
        """Build a result when the Ground Truth execution failed.

        Every score is set to one base value (0.5 if the agent succeeded,
        0.2 otherwise) because there is no reference run to compare against.

        Args:
            task: Task definition; only ``task["id"]`` is read.
            agent_execution: The agent run being evaluated.

        Returns:
            A ``DynamicEvaluationResult`` with placeholder scores, an empty
            failed ``GroundTruthExecution`` and efficiency figures marked as
            not computable (ratios ``inf``, efficiencies 0.0).
        """
        base_score = 0.5 if agent_execution.success else 0.2

        # Analysed once against an empty reference. Calling it twice, as
        # before, also printed the analyzer's debug output twice.
        toolchain_metrics = self.toolchain_analyzer.analyze_toolchain(
            [], agent_execution.tools_used
        )

        # Same keys as the efficiency dict built by evaluate_task. Previously
        # this field was filled with a tool-chain analysis dict by mistake.
        efficiency_metrics = {
            "agent_time": agent_execution.execution_time,
            "ground_truth_time": 0,
            "time_ratio": float('inf'),
            "agent_tokens": agent_execution.tokens_used,
            "ground_truth_tokens": 0,
            "tokens_ratio": float('inf'),
            "time_efficiency": 0.0,
            "token_efficiency": 0.0
        }

        return DynamicEvaluationResult(
            task_id=task["id"],
            toolchain_precision=base_score,
            toolchain_recall=base_score,
            toolchain_f1=base_score,
            toolchain_metrics=toolchain_metrics,
            llm_content_quality=base_score,
            llm_task_completion=base_score,
            llm_accuracy=base_score,
            llm_overall_score=base_score,
            llm_judge_details={"error": "Ground Truth execution failed"},
            agent_execution_time=agent_execution.execution_time,
            ground_truth_execution_time=0,
            time_ratio=float('inf'),
            # The agent's token count is known even though Ground Truth failed.
            agent_tokens_used=agent_execution.tokens_used,
            ground_truth_tokens_used=0,
            # inf (not 0) marks "no baseline", matching time_ratio and the
            # value evaluate_task produces when Ground Truth tokens are 0.
            tokens_ratio=float('inf'),
            efficiency_metrics=efficiency_metrics,
            ground_truth_execution=GroundTruthExecution(
                task_id=task["id"],
                tools_used=[],
                tool_calls=[],
                final_output="",
                execution_time=0,
                success=False,
                error="Execution failed"
            ),
            agent_execution=agent_execution
        )
    
    def display_evaluation_result(self, result: DynamicEvaluationResult):
        """Print a human-readable report of one evaluation result.

        Sections, in order: tool chain matching, LLM judge scores, efficiency
        metrics, then whichever optional feedback entries are present in
        ``result.llm_judge_details``. Nothing is returned.
        """
        separator = "=" * 60
        details = result.llm_judge_details
        metrics = result.toolchain_metrics

        print(f"\n📊 Dynamic Evaluation Result - Task: {result.task_id}")
        print(separator)

        # Metric 1: Tool chain matching
        print("\n📌 Metric 1: Tool Chain Matching")
        for label, value in (
            ("Precision", result.toolchain_precision),
            ("Recall", result.toolchain_recall),
            ("F1 Score", result.toolchain_f1),
        ):
            print(f"  • {label}: {value:.2%}")
        if metrics:
            print(f"  • Correct Tools: {metrics['num_correct']}/{metrics['num_agent']} (Agent) vs {metrics['num_ground_truth']} (GT)")

        # Metric 2: LLM Judge Score
        print("\n🎭 Metric 2: LLM Judge Score")

        # Execution trace alignment is only shown when raw judge scores exist.
        if "raw_alignment_scores" in details:
            trace_score = details["raw_alignment_scores"].get("execution_trace_alignment", 0)
            print(f"  • Execution Trace Alignment: {trace_score:.2%} ⭐")  # Mark as important metric

        for label, value in (
            ("Content Quality", result.llm_content_quality),
            ("Task Completion", result.llm_task_completion),
            ("Relative Accuracy", result.llm_accuracy),
            ("Overall Score", result.llm_overall_score),
        ):
            print(f"  • {label}: {value:.2%}")

        # Metric 3: Efficiency metrics
        print("\n⚡ Metric 3: Efficiency Metrics")
        print(f"  • Agent Execution Time: {result.agent_execution_time:.2f}s")
        print(f"  • Ground Truth Time: {result.ground_truth_execution_time:.2f}s")
        print(f"  • Time Ratio: {result.time_ratio:.2f}x")
        print(f"  • Agent Tokens: {result.agent_tokens_used:,}")
        print(f"  • Ground Truth Tokens: {result.ground_truth_tokens_used:,}")
        # The token ratio line is skipped when the ratio is infinite.
        if result.tokens_ratio != float('inf'):
            print(f"  • Token Ratio: {result.tokens_ratio:.2f}x")

        # Execution trace analysis (if available), cut to 200 characters.
        if "specific_feedback" in details:
            trace_analysis = details["specific_feedback"].get("execution_trace_analysis")
            if trace_analysis:
                print("\n🔍 Execution Trace Analysis:")
                if len(trace_analysis) > 200:
                    shown = f"{trace_analysis[:200]}..."
                else:
                    shown = trace_analysis
                print(f"  {shown}")

        # Detailed feedback
        if "overall_assessment" in details:
            print(f"\n💡 Overall Assessment: {details['overall_assessment']}")

        # Strengths and weaknesses share one layout; empty entries are skipped.
        for key, heading in (
            ("strengths", "\n✅ Strengths:"),
            ("weaknesses", "\n❌ Weaknesses:"),
        ):
            if key in details:
                print(heading)
                for entry in details[key]:
                    if entry:
                        print(f"  • {entry}")

        print(separator)
