#!/usr/bin/env python3
"""
OfficeArena Benchmark Runner

This script runs benchmarks for OfficeArena with configurable options for:
1. Task registry path
2. Model(s) to evaluate
3. Evaluation mode (execute, verify, or evaluate)
4. OneDrive path configuration
5. Local results path

Usage:
    python -m officearena.run_benchmark --registry task_registry --models cua --mode evaluate --onedrive-path /OfficeArena --results-dir results\\cua_evaluate_1758610249 --num-concurrent 3 --resume
"""

import argparse
import concurrent.futures
import json
import os
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

# Load environment variables helper
from officearena.utils.load_dotenv import try_load_dotenv

# Load environment variables before importing components
try_load_dotenv()

# Import OfficeArena components (executed after code; ignore E402 for these lines)
from officearena.agents.claude import ClaudeAgent  # noqa: E402
from officearena.agents.config import (  # noqa: E402
    ClaudeConfig,
    CUAConfig,
    OpenCUAConfig,
    Qwen3VLConfig,
    UITARSConfig,
)
from officearena.agents.CUA import CUA  # noqa: E402
from officearena.agents.opencua import OpenCUA  # noqa: E402
from officearena.agents.qwen3vl import Qwen3VL  # noqa: E402
from officearena.agents.UITARS import UITARS  # noqa: E402
from officearena.environments import TaskExecutor  # noqa: E402


def get_available_models() -> Dict[str, Dict[str, str]]:
    """Return the registry of supported models, keyed by their command-line alias.

    Every entry carries the agent family (``type``), the backend ``model_name``
    and the names of the environment variables that hold its credentials or
    endpoint. OpenCUA entries additionally carry ``cot_level`` and
    ``use_old_sys_prompt``. A fresh dictionary is built on every call.
    """

    def uitars(model_id, endpoint_env, token_env):
        # UITARS models are addressed by endpoint URL + token rather than API key.
        return {
            "type": "uitars",
            "model_name": model_id,
            "endpoint_env": endpoint_env,
            "token_env": token_env,
        }

    def keyed(kind, model_id, api_key_env, base_url_env, **extra):
        # Models authenticated with an API key and an optional base URL;
        # ``extra`` holds family-specific settings appended after the common keys.
        entry = {
            "type": kind,
            "model_name": model_id,
            "api_key_env": api_key_env,
            "base_url_env": base_url_env,
        }
        entry.update(extra)
        return entry

    def anthropic(model_id):
        # All Claude variants share the same credential variables.
        return keyed("anthropic", model_id, "ANTHROPIC_API_KEY", "ANTHROPIC_BASE_URL")

    def opencua(model_id, api_key_env, base_url_env, use_old_sys_prompt):
        return keyed(
            "opencua",
            model_id,
            api_key_env,
            base_url_env,
            cot_level="l2",
            use_old_sys_prompt=use_old_sys_prompt,
        )

    registry = {}
    registry["uitars"] = uitars("uitars-v1", "UITARS_ENDPOINT_URL", "UITARS_TOKEN")
    registry["uitars70b"] = uitars("uitars-70b", "UITARS_ENDPOINT_URL2", "UITARS_TOKEN2")
    registry["claude-4-sonnet"] = anthropic("claude-sonnet-4-20250514")
    registry["claude-4-opus"] = anthropic("claude-opus-4-1-20250805")
    registry["claude-4-5-opus"] = anthropic("claude-opus-4-5-20251101")
    registry["cua"] = keyed(
        "cua",
        "computer-use-preview",
        "CUA_API_KEY",
        "CUA_BASE_URL",
        endpoint_env="CUA_ENDPOINT",
    )
    registry["qwen3vl-8b"] = keyed(
        "qwen3vl",
        "openai/Qwen/Qwen3-VL-8B-Instruct",
        "QWEN3VL_8B_API_KEY",
        "QWEN3VL_8B_BASE_URL",
    )
    registry["qwen3vl-32b"] = keyed(
        "qwen3vl",
        "openai/Qwen/Qwen3-VL-32B-Instruct",
        "DASHSCOPE_API_KEY",
        "DASHSCOPE_BASE_URL",
    )
    registry["opencua-7b"] = opencua("custom_openai/opencua-7b", "OPENCUA7B_API_KEY", "OPENCUA7B_BASE_URL", True)
    registry["opencua-32b"] = opencua("custom_openai/opencua-32b", "OPENCUA32B_API_KEY", "OPENCUA32B_BASE_URL", True)
    registry["opencua-72b"] = opencua("openai/OpenCUA-72B", "OPENCUA72B_API_KEY", "OPENCUA72B_BASE_URL", False)
    return registry


def create_agent(model_name: str, display_size: Dict[str, int] = None):
    """
    Build the agent object that corresponds to a registered model alias.

    Args:
        model_name: Alias of the model, as listed by get_available_models()
        display_size: Mapping with "width" and "height" keys; when omitted the
            registry entry's display_width/display_height (default 1024x768) is used

    Returns:
        Configured agent instance (UITARS, ClaudeAgent, CUA, Qwen3VL or OpenCUA)

    Raises:
        ValueError: If the alias is unknown, a declared API key variable is
            unset or empty, or the registry entry has an unrecognised type
    """
    registry = get_available_models()
    spec = registry.get(model_name)
    if spec is None:
        raise ValueError(f"Model '{model_name}' not available. Available models: {list(registry.keys())}")

    def env_for(field):
        # Value of the environment variable named by spec[field]; None when the
        # entry does not declare that field (or the variable is unset).
        return os.getenv(spec[field]) if field in spec else None

    if display_size is None:
        display_size = {
            "width": spec.get("display_width", 1024),
            "height": spec.get("display_height", 768),
        }

    # An entry that names an API key variable requires it to be set and non-empty.
    api_key = env_for("api_key_env")
    if "api_key_env" in spec and not api_key:
        raise ValueError(f"API key not found in environment variable: {spec['api_key_env']}")

    # The base URL is optional even when the entry declares a variable for it.
    base_url = env_for("base_url_env")

    def screen():
        # Display keyword arguments shared by every config class.
        return {"display_width": display_size["width"], "display_height": display_size["height"]}

    kind = spec["type"]

    if kind == "uitars":
        # UITARS uses an endpoint URL and token instead of api_key/base_url.
        endpoint_url = env_for("endpoint_env")
        token = env_for("token_env")
        uitars_config = UITARSConfig(
            model_name=spec["model_name"],
            endpoint_url=endpoint_url or "",
            token=token,
            **screen(),
        )
        return UITARS(config=uitars_config)

    if kind == "anthropic":
        claude_config = ClaudeConfig(
            model_name=spec["model_name"],
            api_key=api_key,
            base_url=base_url,
            **screen(),
        )
        return ClaudeAgent(config=claude_config)

    if kind == "cua":
        # CUA additionally accepts an endpoint read from the environment.
        cua_config = CUAConfig(
            model_name=spec["model_name"],
            api_key=api_key,
            base_url=base_url,
            endpoint=env_for("endpoint_env"),
            api_version="2025-04-01-preview",
            **screen(),
        )
        return CUA(config=cua_config)

    if kind == "qwen3vl":
        qwen_config = Qwen3VLConfig(
            model_name=spec["model_name"],
            api_key=api_key,
            base_url=base_url,
            **screen(),
        )
        return Qwen3VL(config=qwen_config)

    if kind == "opencua":
        opencua_config = OpenCUAConfig(
            model_name=spec["model_name"],
            api_key=api_key,
            base_url=base_url,
            cot_level=spec.get("cot_level", "l2"),
            use_old_sys_prompt=spec.get("use_old_sys_prompt", True),
            **screen(),
        )
        return OpenCUA(config=opencua_config)

    raise ValueError(f"Unknown model type: {spec['type']}")


def _is_finished_result(result) -> bool:
    """Return True when a result record has a score and is not an infrastructure failure."""
    if not isinstance(result, dict):
        return False
    execution_status = result.get("execution_status", "")
    # Infrastructure failures are never "done": they must be retried on resume.
    if execution_status and execution_status.startswith("infrastructure failure"):
        return False
    # A task counts as completed only once it has been scored.
    return result.get("score") is not None


def _completed_from_results_file(file_path: Path) -> set:
    """Return the completed task IDs recorded in an aggregated ``*_results.json`` file."""
    finished = set()
    try:
        with open(file_path, "r") as f:
            results = json.load(f)
        for task_id, result in results.items():
            if _is_finished_result(result):
                finished.add(task_id)
    except Exception as e:
        # Best effort: an unreadable or malformed file must not abort the resume scan.
        print(f"Warning: Could not read results file {file_path}: {e}")
    return finished


def _completed_from_task_dir(task_dir: Path) -> Optional[str]:
    """Return the task ID (the directory name) if ``task_dir`` holds a completed result, else None."""
    # Backup copies created on earlier resumes are not live results.
    if not task_dir.is_dir() or "_backup_" in task_dir.name:
        return None
    result_file = task_dir / "result_evaluate.json"
    if not result_file.exists():
        return None
    try:
        with open(result_file, "r") as f:
            result = json.load(f)
        return task_dir.name if _is_finished_result(result) else None
    except Exception as e:
        print(f"Warning: Could not read task result file {result_file}: {e}")
        return None


def _scan_model_dir(model_dir: Path) -> set:
    """Collect completed task IDs from one model results directory (sharded and unsharded layouts)."""
    finished = set()

    def scan_task_dirs(parent: Path, skip_shards: bool) -> None:
        for task_dir in parent.iterdir():
            if skip_shards and task_dir.name.startswith("shard_"):
                continue
            task_id = _completed_from_task_dir(task_dir)
            if task_id:
                finished.add(task_id)

    # Sharded layout: <model_dir>/shard_N/<task_id>/result_evaluate.json
    for shard_dir in model_dir.glob("shard_*"):
        if not shard_dir.is_dir():
            continue
        scan_task_dirs(shard_dir, skip_shards=False)
        # Legacy aggregated results files stored inside the shard directory.
        for results_file in shard_dir.glob("*_results.json"):
            finished.update(_completed_from_results_file(results_file))

    # Single-shard layout: task folders live directly under the model directory,
    # so an interrupted non-concurrent run can also be resumed.
    scan_task_dirs(model_dir, skip_shards=True)

    # Aggregated results files written at the end of a run.
    for results_file in model_dir.glob("*_results.json"):
        finished.update(_completed_from_results_file(results_file))

    return finished


def get_completed_tasks(results_dir: str, model_name: str) -> set:
    """
    Get the set of task IDs that have been completed for a specific model.

    Three on-disk layouts are recognised inside a model results directory:
    per-task ``result_evaluate.json`` files under ``shard_*`` subdirectories,
    per-task ``result_evaluate.json`` files directly under the model directory
    (single-shard runs), and aggregated ``*_results.json`` files at either level.
    Task folders whose name contains ``_backup_`` are ignored.

    Args:
        results_dir: Results directory path. Either the model directory itself
            (its name contains the model name) or a parent holding model directories
        model_name: Name of the model to check (matched case-insensitively as a
            substring of the directory name)

    Returns:
        Set of completed task IDs. A task is completed when its record has a
        non-None score and its execution_status does not start with
        "infrastructure failure". Empty when results_dir is missing or not a directory.
    """
    results_path = Path(results_dir)

    # Covers both a missing path and a path that points at a regular file.
    if not results_path.is_dir():
        return set()

    wanted = model_name.lower()
    # NOTE(review): substring matching means "cua" also matches "opencua-*" directories
    # and "uitars" matches "uitars70b"; kept as-is to stay consistent with
    # validate_resume_compatibility.
    if wanted in results_path.name.lower():
        # results_dir is already a specific model directory (e.g., results\uitars_evaluate_1758452218)
        return _scan_model_dir(results_path)

    # Otherwise look for matching model subdirectories.
    completed_tasks = set()
    for model_dir in results_path.iterdir():
        if model_dir.is_dir() and wanted in model_dir.name.lower():
            completed_tasks.update(_scan_model_dir(model_dir))
    return completed_tasks


def validate_resume_compatibility(results_dir: str, model_name: str) -> bool:
    """
    Check that an existing results directory can be resumed with the given model.

    The directory must exist and its final path component must contain the
    model name (compared case-insensitively).

    Args:
        results_dir: Path to the results directory being resumed
        model_name: Name of the model about to be run

    Returns:
        True when both checks pass

    Raises:
        ValueError: If the directory is missing or its name does not contain the model name
    """
    target = Path(results_dir)
    if not target.exists():
        raise ValueError(f"Results directory does not exist: {results_dir}")

    dir_name = target.name
    name_matches = model_name.lower() in dir_name.lower()
    if not name_matches:
        message = (
            f"Results directory '{dir_name}' does not match model '{model_name}'. "
            "Resume requires using results from the same model."
        )
        raise ValueError(message)

    print(f"✅ Resume compatibility validated for model '{model_name}' with directory '{dir_name}'")
    return True


def _filter_tasks_by_id(tasks: Dict[str, Any], task_filter: Optional[str]) -> Dict[str, Any]:
    """Keep only the tasks whose ID contains ``task_filter`` (case-insensitive); no filter keeps all."""
    if not task_filter:
        return tasks
    needle = task_filter.lower()
    return {task_id: task for task_id, task in tasks.items() if needle in task_id.lower()}


def _confirm_with_user(tasks: Dict[str, Any], heading: str, notes: List[str]) -> bool:
    """Print the sorted task list plus ``notes`` and ask for confirmation; True means proceed."""
    print(heading)
    for i, task_id in enumerate(sorted(tasks), 1):
        print(f"   {i:3d}. {task_id}")
    for note in notes:
        print(note)

    try:
        answer = input("\nDo you want to continue? (y/N): ").strip().lower()
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C or a closed stdin counts as "no".
        print("\n❌ Operation cancelled by user")
        return False

    if answer not in ("y", "yes"):
        print("❌ Operation cancelled by user")
        return False
    return True


def _backup_existing_task_dirs(parent_dir: Path, task_ids: List[str]) -> None:
    """Rename ``parent_dir/<task_id>`` folders to ``<task_id>_backup_<timestamp>`` before a re-run."""
    for task_id in task_ids:
        task_dir = parent_dir / task_id
        if not (task_dir.exists() and task_dir.is_dir()):
            continue
        # Timestamp suffix keeps successive backups of the same task apart.
        backup_name = f"{task_id}_backup_{int(time.time())}"
        try:
            task_dir.rename(parent_dir / backup_name)
            print(f"   📁 Backed up existing task folder: {task_id} -> {backup_name}")
        except Exception as e:
            # Best effort: a failed backup must not stop the run.
            print(f"   ⚠️ Warning: Could not backup task folder {task_id}: {e}")


def _summarize_result(result: Any) -> Dict[str, Any]:
    """Convert one executor result object into the JSON-serialisable record stored per task."""
    return {
        "success": result.success,
        "score": result.score,
        "details": result.details or result.reason,
        "evaluation_time_seconds": result.evaluation_time_seconds,
        "timestamp": datetime.now().isoformat(),
    }


def _record_results(task_results: List[Any], model_results: Dict[str, Dict[str, Any]]) -> None:
    """Store each result in ``model_results`` (keyed by task ID) and print a one-line status for it."""
    for result in task_results:
        model_results[result.task_id] = _summarize_result(result)
        status = "✅ PASSED" if result.success else "❌ FAILED"
        score_str = f"{result.score:.2f}" if result.score is not None else "N/A"
        time_str = f"{result.evaluation_time_seconds:.1f}s" if result.evaluation_time_seconds is not None else "N/A"
        print(f"   📋 {result.task_id}: {status} - Score: {score_str} - Time: {time_str}")
        if not result.success and result.details:
            print(f"      Details: {result.details}")


def _average_score(scores) -> Optional[float]:
    """Average the non-None scores (zeros included); None when no score is available."""
    known = [score for score in scores if score is not None]
    if not known:
        return None
    return sum(known) / len(known)


def _close_agent(agent: Any) -> None:
    """Close an agent, reporting (rather than hiding) any failure during shutdown."""
    if agent is None:
        return
    try:
        agent.close()
    except Exception as e:
        # Cleanup problems must not mask the task results, but should be visible.
        print(f"   ⚠️ Warning: Could not close agent cleanly: {e}")


def run_benchmark(
    registry_path: str,
    models: List[str],
    mode: str,
    onedrive_path: str,
    results_dir: str,
    task_filter: Optional[str] = None,
    timeout_minutes: int = 15,
    save_screenshots: bool = True,
    conversion_mode: str = "online",
    use_cached_original_images: bool = True,
    num_concurrent: int = 1,
    resume: bool = False,
) -> Dict[str, Dict[str, Any]]:
    """
    Run benchmark for specified models in the given mode.

    In "verify" mode no agent is created: tasks are loaded, the user is asked to
    confirm, and results are returned under the single key "verification".
    In the other modes each model is processed in turn; its tasks are split
    round-robin into up to ``num_concurrent`` shards that run in parallel threads,
    each with its own agent and TaskExecutor. Per-model results are written to
    ``<model results dir>/<mode>_results.json``.

    Args:
        registry_path: Path to task registry
        models: List of model names to evaluate (empty for verify mode)
        mode: Processing mode ("execute", "verify", or "evaluate")
        onedrive_path: OneDrive root path
        results_dir: Local results directory
        task_filter: Optional task ID filter (substring match)
        timeout_minutes: Timeout for each task evaluation
        save_screenshots: Whether to save screenshots
        conversion_mode: Conversion mode for Office files
        use_cached_original_images: Whether to use cached original images
        num_concurrent: Number of concurrent task shards
        resume: Whether to resume from existing results (skip completed tasks)

    Returns:
        Dictionary of results by model and task. A model whose run raised is
        mapped to ``{"error": ..., "timestamp": ...}``; a run the user declined
        is mapped to ``{"cancelled": True, "timestamp": ...}``.

    Raises:
        ValueError: If CLIENT_ID is not set, or (resume mode) the results
            directory is missing or does not match the model
    """
    results: Dict[str, Dict[str, Any]] = {}

    # Get client ID
    client_id = os.getenv("CLIENT_ID")
    if not client_id:
        raise ValueError("CLIENT_ID not found in environment variables")

    # Create results directory
    results_path = Path(results_dir)
    results_path.mkdir(parents=True, exist_ok=True)

    # For verify mode, we don't need models - just load tasks and verify
    if mode == "verify":
        print(f"\n{'='*60}")
        print("🔍 Verifying existing results")
        print(f"{'='*60}")

        try:
            # Verification works directly against the given results directory.
            with TaskExecutor(
                client_id=client_id,
                onedrive_root_path=onedrive_path,
                results_dir=str(results_path),
                cleanup_temp_folder=False,
                enable_logging=True,
            ) as executor:
                print(f"Loading tasks from: {registry_path}")
                tasks = _filter_tasks_by_id(executor.load_tasks(registry_path), task_filter)

                print(f"Found {len(tasks)} tasks to verify")

                # List tasks and prompt user to continue
                if tasks:
                    notes = [f"\n🔄 About to verify {len(tasks)} tasks"]
                    if not _confirm_with_user(tasks, "\n📋 Tasks to be verified:", notes):
                        return {"verification": {"cancelled": True, "timestamp": datetime.now().isoformat()}}

                # Run verification mode
                verify_results = executor.evaluate_all_tasks(
                    tasks=tasks,
                    agent=None,  # Not needed for verify mode
                    timeout_minutes=timeout_minutes,
                    save_screenshots=False,  # No screenshots for verify-only
                    task_filter=None,  # Already filtered above
                    mode="verify",
                    conversion_mode=conversion_mode,
                    use_cached_original_images=use_cached_original_images,
                )

                results["verification"] = {result.task_id: _summarize_result(result) for result in verify_results}

                # Print verification summary
                print("\n📊 Verification Summary:")
                total_tasks = len(verify_results)
                passed_tasks = sum(1 for r in verify_results if r.success)
                print(f"   Total tasks: {total_tasks}")
                print(f"   Passed: {passed_tasks}")
                print(f"   Failed: {total_tasks - passed_tasks}")

                # Zero is a legitimate score and must count towards the average;
                # only tasks without any score are left out.
                avg_score = _average_score(r.score for r in verify_results)
                if avg_score is not None:
                    print(f"   Average score: {avg_score:.2f}")

        except Exception as e:
            print(f"❌ Verification failed for: {str(e)}")
            import traceback

            traceback.print_exc()

        return results

    # Resume only applies to the modes that produce per-task result folders.
    resuming = resume and mode in ["execute", "evaluate"]

    def process_model(model_name: str) -> Dict[str, Dict[str, Any]]:
        print(f"\n{'='*60}")
        print(f"🤖 Processing model: {model_name} (mode: {mode})")
        print(f"{'='*60}")

        model_results: Dict[str, Dict[str, Any]] = {}

        # Handle resume mode - reuse existing directory or create new one
        if resuming:
            # Validate compatibility with existing results (raises ValueError on mismatch)
            validate_resume_compatibility(results_dir, model_name)
            model_results_dir = Path(results_dir)
            print(f"📁 Resume mode: Reusing existing directory: {model_results_dir}")
        else:
            # Create model-specific results directory
            model_results_dir = results_path / f"{model_name}_{mode}_{int(time.time())}"
            model_results_dir.mkdir(exist_ok=True)
            print(f"📁 Created new results directory: {model_results_dir}")

        try:
            # Initialize task executor with per-task logging
            with TaskExecutor(
                client_id=client_id,
                onedrive_root_path=onedrive_path,
                results_dir=str(model_results_dir),
                cleanup_temp_folder=False,
                enable_logging=True,
            ) as executor:
                print(f"Loading tasks from: {registry_path}")
                tasks = _filter_tasks_by_id(executor.load_tasks(registry_path), task_filter)

                # Handle resume mode - skip completed tasks
                if resuming:
                    print(f"🔍 Resume mode: Checking for completed tasks in {results_dir}")
                    completed_tasks = get_completed_tasks(results_dir, model_name)
                    print(f"🔍 Found {len(completed_tasks)} completed tasks")

                    if completed_tasks:
                        original_count = len(tasks)
                        tasks = {task_id: task for task_id, task in tasks.items() if task_id not in completed_tasks}
                        skipped_count = original_count - len(tasks)
                        print(f"📋 Resume mode: Skipping {skipped_count} completed tasks, processing {len(tasks)} remaining tasks")
                        # Sort before slicing so the preview is the same on every run.
                        preview = ", ".join(sorted(completed_tasks)[:5])
                        print(f"   Completed tasks: {preview}{'...' if len(completed_tasks) > 5 else ''}")
                    else:
                        print(f"⚠️  No completed tasks found - will process all {len(tasks)} tasks")
                        print("   Make sure the results directory contains shard_* subdirectories with *_results.json files")

                print(f"Found {len(tasks)} tasks to process")

                # Calculate shard count early for user prompt; never more shards than tasks.
                task_ids = sorted(tasks.keys())
                shard_count = max(1, min(num_concurrent, len(task_ids)))

                # List tasks and prompt user to continue
                if tasks:
                    notes = [f"\n🔄 About to process {len(tasks)} tasks with model '{model_name}' in {mode} mode"]
                    if num_concurrent > 1:
                        notes.append(f"   Using {shard_count} concurrent shards")
                    if resume:
                        notes.append("   Resume mode: Only processing remaining tasks")
                    if not _confirm_with_user(tasks, "\n📋 Tasks to be processed:", notes):
                        return {model_name: {"cancelled": True, "timestamp": datetime.now().isoformat()}}

                # Shard tasks across num_concurrent and run each shard via evaluate_all_tasks
                if shard_count == 1:
                    # Single shard: run with the model-level executor; task folders
                    # live directly under the model results directory.
                    if resuming:
                        _backup_existing_task_dirs(model_results_dir, task_ids)

                    # verify mode returned earlier, so an agent is always required here.
                    agent = create_agent(model_name)
                    try:
                        shard_results = executor.evaluate_all_tasks(
                            tasks=tasks,
                            agent=agent,
                            timeout_minutes=timeout_minutes,
                            save_screenshots=save_screenshots,
                            task_filter=None,
                            mode=mode,
                            conversion_mode=conversion_mode,
                            use_cached_original_images=use_cached_original_images,
                            config=agent.agent_config,
                        )
                    finally:
                        _close_agent(agent)
                    _record_results(shard_results, model_results)
                else:
                    # Round-robin assignment keeps shard sizes within one task of each other.
                    shards: List[List[str]] = [[] for _ in range(shard_count)]
                    for i, tid in enumerate(task_ids):
                        shards[i % shard_count].append(tid)

                    print(f"🧵 Running {len(task_ids)} tasks for model {model_name} in {shard_count} parallel shards")

                    def run_shard(shard_index: int, shard_task_ids: List[str]) -> List[Any]:
                        # Each shard writes into its own directory (reused as-is when resuming).
                        shard_dir = model_results_dir / f"shard_{shard_index}"
                        shard_dir.mkdir(exist_ok=True)

                        # Backup existing task folders if they exist and we're re-running tasks
                        if resuming:
                            _backup_existing_task_dirs(shard_dir, shard_task_ids)

                        # One agent and one executor per shard; nothing is shared across threads.
                        agent = create_agent(model_name)
                        try:
                            with TaskExecutor(
                                client_id=client_id,
                                onedrive_root_path=onedrive_path,
                                results_dir=str(shard_dir),
                                cleanup_temp_folder=False,
                                enable_logging=True,
                            ) as shard_executor:
                                # The full task mapping is passed; task_filter restricts it to this shard's IDs.
                                return shard_executor.evaluate_all_tasks(
                                    tasks=tasks,
                                    agent=agent,
                                    timeout_minutes=timeout_minutes,
                                    save_screenshots=save_screenshots,
                                    task_filter=shard_task_ids,
                                    mode=mode,
                                    conversion_mode=conversion_mode,
                                    use_cached_original_images=use_cached_original_images,
                                    config=agent.agent_config,
                                )
                        finally:
                            _close_agent(agent)

                    with concurrent.futures.ThreadPoolExecutor(max_workers=shard_count) as pool:
                        futures = [pool.submit(run_shard, idx, shard_ids) for idx, shard_ids in enumerate(shards) if shard_ids]
                        for fut in concurrent.futures.as_completed(futures):
                            # fut.result() re-raises a shard's exception; it is handled by the outer except.
                            _record_results(fut.result(), model_results)

                # Save model results to file
                results_file = model_results_dir / f"{mode}_results.json"

                # In resume mode, merge with existing results (new results win on conflict)
                if resume and results_file.exists():
                    try:
                        with open(results_file, "r") as f:
                            existing_results = json.load(f)
                        existing_results.update(model_results)
                        model_results = existing_results
                        print(f"📋 Merged results with existing file: {results_file}")
                    except Exception as e:
                        print(f"Warning: Could not merge with existing results: {e}")

                with open(results_file, "w") as f:
                    json.dump(model_results, f, indent=2, ensure_ascii=False)

                # Print model summary
                print(f"\n📊 Summary for {model_name} ({mode} mode):")
                total_tasks = len(model_results)
                passed_tasks = sum(1 for r in model_results.values() if r.get("success", False))
                print(f"   Total tasks: {total_tasks}")
                print(f"   Passed: {passed_tasks}")
                print(f"   Failed: {total_tasks - passed_tasks}")

                # Report the average whenever at least one task was scored, even if every score is 0.
                avg_score = _average_score(r.get("score") for r in model_results.values())
                if avg_score is not None:
                    print(f"   Average score: {avg_score:.2f}")

        except Exception as e:
            print(f"❌ {mode.title()} failed for {model_name}: {str(e)}")
            import traceback

            traceback.print_exc()

            # Store error result marker
            model_results = {"error": str(e), "timestamp": datetime.now().isoformat()}

        return {model_name: model_results}

    # Process models sequentially; concurrency happens within each model via num_concurrent
    for model_name in models:
        results.update(process_model(model_name))

    return results


def run_evaluation(
    registry_path: str,
    models: List[str],
    onedrive_path: str,
    results_dir: str,
    task_filter: Optional[str] = None,
    timeout_minutes: int = 15,
    save_screenshots: bool = True,
    conversion_mode: str = "online",
    use_cached_original_images: bool = True,
    resume: bool = False,
) -> Dict[str, Dict[str, any]]:
    """
    Run evaluation for specified models.

    Models are processed one after another. For each model a new
    ``<model_name>_<unix timestamp>`` directory is created under ``results_dir``,
    every selected task is evaluated through a ``TaskExecutor``, and the per-task
    results are written to ``evaluation_results.json`` inside that directory.

    A failure in a single task is recorded as a failed result (score 0.0 plus an
    ``"error"`` message) and does not stop the run. A failure at model level
    (agent creation, task loading, saving results, ...) is stored under the
    ``"error"`` key of that model's result dictionary.

    Args:
        registry_path: Path to task registry
        models: List of model names to evaluate
        onedrive_path: OneDrive root path
        results_dir: Local results directory (created if missing)
        task_filter: Optional task ID filter (case-insensitive substring match)
        timeout_minutes: Timeout for each task evaluation
        save_screenshots: Whether to save screenshots
        conversion_mode: Conversion mode for Office files
        use_cached_original_images: Whether to use cached original images
        resume: Whether to resume from existing results (skip the tasks reported
            as completed by ``get_completed_tasks(results_dir, model_name)``)

    Returns:
        Dictionary of results by model and task

    Raises:
        ValueError: If the CLIENT_ID environment variable is not set.
    """
    results = {}

    # Get client ID
    client_id = os.getenv("CLIENT_ID")
    if not client_id:
        raise ValueError("CLIENT_ID not found in environment variables")

    # Create results directory
    results_path = Path(results_dir)
    results_path.mkdir(parents=True, exist_ok=True)

    for model_name in models:
        print(f"\n{'='*60}")
        print(f"🤖 Evaluating model: {model_name}")
        print(f"{'='*60}")

        # Registered up front so partial results survive a model-level failure.
        model_results = {}
        results[model_name] = model_results

        # Create model-specific results directory
        model_results_dir = results_path / f"{model_name}_{int(time.time())}"
        model_results_dir.mkdir(exist_ok=True)

        try:
            # Initialize agent
            print(f"Initializing {model_name} agent...")
            agent = create_agent(model_name)

            # Initialize task executor with per-task logging
            with TaskExecutor(
                client_id=client_id,
                onedrive_root_path=onedrive_path,
                results_dir=str(model_results_dir),
                cleanup_temp_folder=False,
                enable_logging=True,
            ) as executor:
                print(f"Loading tasks from: {registry_path}")
                tasks = executor.load_tasks(registry_path)

                # Filter tasks if specified
                if task_filter:
                    needle = task_filter.lower()
                    tasks = {task_id: task for task_id, task in tasks.items() if needle in task_id.lower()}

                # Handle resume mode - skip completed tasks
                if resume:
                    completed_tasks = get_completed_tasks(results_dir, model_name)
                    if completed_tasks:
                        original_count = len(tasks)
                        tasks = {task_id: task for task_id, task in tasks.items() if task_id not in completed_tasks}
                        skipped_count = original_count - len(tasks)
                        print(f"📋 Resume mode: Skipping {skipped_count} completed tasks, processing {len(tasks)} remaining tasks")
                        # Sort before slicing so the preview is deterministic.
                        preview = ", ".join(sorted(completed_tasks)[:5])
                        more = "..." if len(completed_tasks) > 5 else ""
                        print(f"   Completed tasks: {preview}{more}")

                print(f"Found {len(tasks)} tasks to evaluate")

                # Evaluate each task
                for i, (task_id, task) in enumerate(tasks.items(), 1):
                    print(f"\n📋 [{i}/{len(tasks)}] Evaluating task: {task_id}")
                    print(f"   Goal: {task.goal}")

                    try:
                        start_time = time.time()
                        result = executor.evaluate_task(
                            task=task,
                            agent=agent,
                            timeout_minutes=timeout_minutes,
                            save_screenshots=save_screenshots,
                            conversion_mode=conversion_mode,
                            use_cached_original_images=use_cached_original_images,
                            config=agent.agent_config,
                        )

                        evaluation_time = time.time() - start_time

                        # Store results
                        model_results[task_id] = {
                            "success": result.success,
                            "score": result.score,
                            "details": result.details,
                            "evaluation_time_seconds": round(evaluation_time, 2),
                            "timestamp": datetime.now().isoformat(),
                        }

                        # Print result
                        status = "✅ PASSED" if result.success else "❌ FAILED"
                        print(f"   {status} - Score: {result.score:.2f} - Time: {evaluation_time:.1f}s")

                        if not result.success and result.details:
                            print(f"   Details: {result.details}")

                    except Exception as e:
                        # One broken task must not abort the whole model run.
                        print(f"   ❌ ERROR: {str(e)}")
                        model_results[task_id] = {
                            "success": False,
                            "score": 0.0,
                            "error": str(e),
                            "timestamp": datetime.now().isoformat(),
                        }

                # Save model results to file. ensure_ascii=False emits raw non-ASCII
                # characters, so the encoding is pinned to UTF-8 instead of relying on
                # the platform default (which may be unable to encode them).
                results_file = model_results_dir / "evaluation_results.json"
                with open(results_file, "w", encoding="utf-8") as f:
                    json.dump(model_results, f, indent=2, ensure_ascii=False)

                # Print model summary
                print(f"\n📊 Summary for {model_name}:")
                total_tasks = len(model_results)
                passed_tasks = sum(1 for r in model_results.values() if r.get("success", False))
                if total_tasks > 0:
                    avg_score = sum(r.get("score", 0) for r in model_results.values()) / total_tasks
                    pass_rate = passed_tasks / total_tasks * 100
                else:
                    # Nothing was evaluated (empty filter match or everything already
                    # completed); report zeros instead of dividing by zero.
                    avg_score = 0
                    pass_rate = 0.0

                print(f"   Tasks passed: {passed_tasks}/{total_tasks} ({pass_rate:.1f}%)")
                print(f"   Average score: {avg_score:.3f}")
                print(f"   Results saved to: {results_file}")

        except Exception as e:
            print(f"❌ Error evaluating model {model_name}: {str(e)}")
            model_results["error"] = str(e)

    return results


def run_verification(registry_path: str, results_dir: str, task_filter: Optional[str] = None) -> Dict[str, Dict[str, any]]:
    """
    Placeholder for verifying previously produced evaluation results.

    NOTE: Verification is not implemented. The function is kept so the CLI's
    verify mode has something to call; it only reports which result
    sub-directories it finds. ``registry_path`` and ``task_filter`` are
    accepted but not used.

    Args:
        registry_path: Path to task registry
        results_dir: Directory containing evaluation results
        task_filter: Optional task ID filter

    Returns:
        A dictionary with one empty entry per non-hidden sub-directory of
        ``results_dir``; empty when the directory does not exist.
    """
    print("\n🔍 Verification mode not implemented yet")
    print(f"Results directory: {results_dir}")

    found = {}
    root = Path(results_dir)

    if root.exists():
        # Only enumerate candidates; nothing is actually verified yet.
        for entry in root.iterdir():
            if not entry.is_dir():
                continue
            if entry.name.startswith("."):
                continue
            print(f"Would verify results for: {entry.name}")
            found[entry.name] = {}
    else:
        print(f"Results directory does not exist: {results_dir}")

    return found


def main():
    """
    Command-line entry point for the benchmark runner.

    Parses the command line, validates the arguments, prints the effective
    configuration, and hands off to ``run_benchmark``. Returns normally after
    ``--list-models`` or a completed run; exits with status 1 on invalid
    arguments, on Ctrl-C, or when the benchmark raises.
    """
    parser = argparse.ArgumentParser(
        description="Run OfficeArena benchmarks with configurable options",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run full evaluation (execute + verify) with UITARS
  python run_benchmark.py --registry task_registry/ --models uitars --mode evaluate
  
  # Execute tasks only with multiple models
  python run_benchmark.py --registry task_registry/ --models uitars claude-3-5-sonnet --mode execute
  
  # Resume evaluation from existing results (skip completed tasks)
  python -m officearena.run_benchmark --registry task_registry --models uitars --mode evaluate --resume --results-dir results/uitars_evaluate_1758452218 --num-concurrent 3
  
  # Verify existing execution results only
  python run_benchmark.py --registry task_registry/ --mode verify --results-dir results/
  
  # Filter tasks by substring
  python run_benchmark.py --registry task_registry/ --models cua --mode execute --task-filter "Obesity"
        """,
    )

    parser.add_argument(
        "--registry",
        type=str,
        help="Path to the task registry directory",
    )

    parser.add_argument(
        "--models",
        nargs="+",
        help="Model(s) to evaluate. Available: " + ", ".join(get_available_models().keys()),
    )

    parser.add_argument(
        "--mode",
        choices=["execute", "verify", "evaluate"],
        default="evaluate",
        help="Task processing mode: execute (run tasks only), verify (verify only), or evaluate (both) (default: evaluate)",
    )

    parser.add_argument(
        "--onedrive-path",
        type=str,
        default="/OfficeArena",
        help="OneDrive root path (default: /OfficeArena)",
    )

    parser.add_argument(
        "--results-dir",
        type=str,
        default="evaluation_results",
        help="Local directory for results (default: evaluation_results)",
    )

    parser.add_argument("--task-filter", type=str, help="Filter tasks by substring match in task ID")

    parser.add_argument(
        "--timeout",
        type=int,
        default=15,
        help="Timeout in minutes for each task (default: 15)",
    )

    parser.add_argument("--no-screenshots", action="store_true", help="Disable screenshot saving")

    parser.add_argument("--list-models", action="store_true", help="List available models and exit")

    parser.add_argument(
        "--num-concurrent",
        type=int,
        default=1,
        help="Number of concurrent shards of tasks per model (default: 1)",
    )

    parser.add_argument(
        "--conversion-mode",
        type=str,
        choices=["online", "com", "libreoffice+ghostcript", "libreoffice+poppler"],
        default="online",
        help="Conversion mode for Office files: online or offline (default: online)",
    )

    parser.add_argument(
        "--use-cached-original-images",
        action="store_true",
        help="Use cached original images for comparison instead of re-downloading, stored as filename.zip",
    )

    parser.add_argument(
        "--resume",
        action="store_true",
        help="Resume evaluation from existing results directory (skip completed tasks). "
        "Requires --results-dir to point to existing model directory (e.g., results/uitars_evaluate_1758452218). "
        "Only works with single model and --mode=execute or --mode=evaluate",
    )

    args = parser.parse_args()

    # Handle list models (needs no other argument, so it runs before validation)
    if args.list_models:
        print("Available models:")
        for model_name, config in get_available_models().items():
            print(f"  {model_name}: {config['model_name']} ({config['type']})")
        return

    # Validate required arguments
    if not args.registry:
        print("Error: --registry is required")
        parser.print_help()
        sys.exit(1)

    # execute and evaluate both need at least one model. This must run before the
    # --resume checks below: args.models is None when --models is omitted, and
    # those checks call len() on it.
    if args.mode in ["execute", "evaluate"] and not args.models:
        print("❌ Error: Models must be specified for execute and evaluate modes")
        sys.exit(1)

    # Validate resume mode
    if args.resume and args.mode not in ["execute", "evaluate"]:
        print("Error: --resume can only be used with --mode=execute or --mode=evaluate")
        sys.exit(1)

    if args.resume and not Path(args.results_dir).exists():
        print(f"Error: Results directory does not exist for resume mode: {args.results_dir}")
        sys.exit(1)

    if args.resume and len(args.models) > 1:
        print("Error: --resume can only be used with a single model. Multiple models require separate result directories.")
        sys.exit(1)

    # Validate models
    if args.models:
        available_models = get_available_models()
        invalid_models = [m for m in args.models if m not in available_models]
        if invalid_models:
            print(f"Error: Invalid models: {invalid_models}")
            print(f"Available models: {list(available_models.keys())}")
            sys.exit(1)

    # Validate registry path
    if not Path(args.registry).exists():
        print(f"Error: Registry path does not exist: {args.registry}")
        sys.exit(1)

    print("🚀 OfficeArena Benchmark Runner")
    print(f"Registry: {args.registry}")
    print(f"Mode: {args.mode}")
    print(f"Models: {args.models if args.models else 'N/A'}")
    print(f"OneDrive path: {args.onedrive_path}")
    print(f"Results directory: {args.results_dir}")
    if args.resume:
        print("Resume mode: Enabled (skipping completed tasks)")
    if args.task_filter:
        print(f"Task filter: {args.task_filter}")
    if args.num_concurrent > 1:
        print(f"Concurrent shards: {args.num_concurrent}")

    # Run benchmark
    try:
        # For verify mode, we don't need models
        if args.mode == "verify" and args.models:
            print("⚠️ Warning: Models specified for verify mode will be ignored")

        print(f"\n🔄 Running in '{args.mode}' mode...")
        results = run_benchmark(
            registry_path=args.registry,
            models=args.models if args.mode != "verify" else [],
            mode=args.mode,
            onedrive_path=args.onedrive_path,
            results_dir=args.results_dir,
            task_filter=args.task_filter,
            timeout_minutes=args.timeout,
            save_screenshots=not args.no_screenshots,
            conversion_mode=args.conversion_mode,
            use_cached_original_images=args.use_cached_original_images,
            # Clamp so a zero or negative value still runs with one shard.
            num_concurrent=max(1, args.num_concurrent),
            resume=args.resume,
        )

        # Print final summary
        if results:
            print("\n🎯 Final Results Summary:")
            for model_name, model_results in results.items():
                # A model-level failure is reported through an "error" key
                # instead of per-task entries.
                if "error" in model_results:
                    print(f"   {model_name}: ❌ ERROR - {model_results['error']}")
                else:
                    total = len(model_results)
                    passed = sum(1 for r in model_results.values() if r.get("success", False))
                    print(f"   {model_name}: {passed}/{total} tasks passed")

        print("\n🎉 Benchmark completed successfully!")

    except KeyboardInterrupt:
        print("\n⚠️ Benchmark interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Benchmark failed: {str(e)}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


# Run the CLI only when this module is executed as a script (directly or via
# ``python -m``); importing the module does not start a benchmark.
if __name__ == "__main__":
    main()
