"""
Task Executor for running evaluations across task registries.

This module provides orchestration for running multiple tasks from a task registry,
managing OfficeArena environments per task, and collecting results.
"""

import json
import shutil
import tempfile
import traceback
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

from ..agents.base import BaseAgent
from ..agents.config import AgentConfig
from ..environments.office_env import OfficeArena
from ..environments.task import Task, load_task_registry_from_directory
from ..utils.load_dotenv import try_load_dotenv
from ..verify.ppt import PPTVerifier


@dataclass
class EvaluationResult:
    """Outcome record for one task run.

    ``task_id``, ``goal`` and ``success`` are mandatory; every other field
    defaults to ``None``.
    """

    # --- identity and headline outcome ---
    task_id: str
    goal: str
    success: bool

    # --- grading output and diagnostics ---
    score: Optional[float] = None
    reason: Optional[str] = None
    details: Optional[str] = None
    error_message: Optional[str] = None

    # --- run bookkeeping ---
    evaluation_time_seconds: Optional[float] = None
    agent_steps: Optional[int] = None
    screenshots_saved: Optional[List[str]] = None

    # Execution outcome, e.g. 'success', 'out of steps',
    # 'infrastructure failure: reason'.
    execution_status: Optional[str] = None
    # Verification outcome, e.g. 'success', 'failed: image missing',
    # 'failed: file missing', 'rubric error'.
    verification_status: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain ``dict`` suitable for JSON serialization."""
        as_mapping = asdict(self)
        return as_mapping


class TaskExecutor:
    """
    Orchestrates evaluation of multiple tasks from a task registry.

    Creates OfficeArena environments per task and manages the complete
    evaluation workflow including setup, execution, verification, and cleanup.

    Can be used as a context manager to ensure cleanup of temp folders.
    """

    def __init__(
        self,
        client_id: str,
        onedrive_root_path: str = "/OfficeArena",
        results_dir: Optional[str] = None,
        cleanup_temp_folder: bool = False,
        enable_logging: bool = True,
    ):
        """
        Set up the executor and the directory that will hold all results.

        Args:
            client_id: OneDrive client ID for file operations
            onedrive_root_path: Root path in OneDrive for file storage
            results_dir: Local directory for results; when empty or None a fresh
                temp directory is created instead
            cleanup_temp_folder: Whether ``cleanup`` may delete the temp directory
                (only relevant when no ``results_dir`` was given)
            enable_logging: Whether to enable detailed logging for each task
        """
        # Pull in environment variables before anything else runs.
        try_load_dotenv()

        self.client_id = client_id
        self.onedrive_root_path = onedrive_root_path
        self.cleanup_temp_folder = cleanup_temp_folder
        self.enable_logging = enable_logging

        # `temp_folder` always mirrors `results_dir`; it is kept as a separate
        # attribute for backward compatibility.
        if not results_dir:
            scratch = Path(tempfile.mkdtemp(prefix="officearena_eval_"))
            self.temp_folder = scratch
            self.results_dir = scratch
            self.is_temp_dir = True
        else:
            chosen = Path(results_dir)
            self.results_dir = chosen
            chosen.mkdir(exist_ok=True, parents=True)
            self.temp_folder = chosen
            self.is_temp_dir = False

        print(f"Results directory: {self.results_dir}")
        keeps_temp_folder = self.is_temp_dir and not self.cleanup_temp_folder
        if keeps_temp_folder:
            print("Temp folder will be preserved after completion")

        # Accumulates every result produced through this executor.
        self.results: List[EvaluationResult] = []

    def load_tasks(self, task_registry_path: str, verifier_class: type = PPTVerifier) -> Dict[str, Task]:
        """
        Read a task registry directory and return its tasks.

        Args:
            task_registry_path: Path to directory containing task registry JSON
            verifier_class: Verifier class handed to the registry loader for grading

        Returns:
            The mapping returned by ``load_task_registry_from_directory``
        """
        print(f"Loading tasks from {task_registry_path}...")
        registry = load_task_registry_from_directory(task_registry_path, verifier_class)
        task_ids = list(registry.keys())
        print(f"Loaded {len(registry)} tasks: {task_ids}")
        return registry

    def execute_task(self, task: Task, agent: BaseAgent, timeout_minutes: int = 10, save_screenshots: bool = True, save_online_images: bool = True, config: Optional[AgentConfig] = None) -> EvaluationResult:
        """
        Execute a single task with the given agent (no grading).

        The task file is uploaded to OneDrive under a unique name, the agent is
        run against it in a fresh OfficeArena environment, and the outcome is
        written to ``task_execution_info.json`` in the task's results directory
        so that ``verify_task`` can pick it up later.

        Args:
            task: Task to execute
            agent: Agent to perform the task
            timeout_minutes: Maximum time for task execution. NOTE: accepted for
                interface compatibility; this method does not currently use it.
            save_screenshots: Whether to save screenshots during execution
            save_online_images: Forwarded unchanged to ``OfficeArena.evaluate_agent``
            config: Optional agent config; when given, its ``display_width`` and
                ``display_height`` set the environment resolution, otherwise
                1024x768 is used

        Returns:
            EvaluationResult with execution outcomes. ``score`` is never set here.
            Any failure (including upload errors) is reported through
            ``execution_status`` starting with ``"infrastructure failure"``
            rather than raised.
        """
        start_time = datetime.now()
        print(f"\n{'='*60}")
        print(f"Executing Task: {task.task_id}")
        print(f"Goal: {task.goal}")
        print(f"File: {task.file_path}")
        print(f"{'='*60}")

        # Create task-specific results directory
        task_results_dir = self.results_dir / task.task_id
        task_results_dir.mkdir(exist_ok=True)

        env = None  # Stays None if construction fails so `finally` can skip close()
        try:
            # Set up per-task log directory
            log_dir = str(task_results_dir / "logs") if self.enable_logging else None

            # Create OfficeArena environment for this task
            env = OfficeArena(
                client_id=self.client_id,
                onedrive_root_path=self.onedrive_root_path,
                enable_logging=self.enable_logging,
                log_dir=log_dir,
                resolution=(config.display_width, config.display_height) if config else (1024, 768),
            )

            # Ensure we have the local file path
            if not task.file_path:
                raise ValueError(f"Task {task.task_id} missing file_path")

            local_file_path = Path(task.file_path)
            if not local_file_path.exists():
                raise FileNotFoundError(f"Task file not found: {local_file_path}")

            print(f"Setting up environment for file: {local_file_path}")

            # Upload file to OneDrive for task evaluation
            from ..utils.onedrive import OneDriveClient

            onedrive_client = OneDriveClient(client_id=self.client_id, root_path=self.onedrive_root_path)

            # Create a unique remote path for this task with timestamp to avoid locks
            import time

            timestamp = int(time.time())
            file_extension = local_file_path.suffix
            remote_filename = f"{task.task_id}_{timestamp}_{local_file_path.stem}{file_extension}"
            remote_path = f"tasks/{remote_filename}"

            # Set up screenshot directory if needed
            screenshots_dir = str(task_results_dir / "screenshots") if save_screenshots else None

            print(f"Uploading file to OneDrive: {remote_path}")
            try:
                onedrive_client.upload_file(str(local_file_path), remote_path)
            except Exception as upload_error:
                # Upload failures are infrastructure failures, not agent failures.
                # Classify them by keyword so the status says what went wrong.
                lowered = str(upload_error).lower()
                if any(marker in lowered for marker in ("rate", "429", "throttl")):
                    error_msg = f"OneDrive rate limit exceeded during file upload: {upload_error}"
                elif any(marker in lowered for marker in ("network", "connection", "timeout")):
                    error_msg = f"Network error during file upload: {upload_error}"
                elif any(marker in lowered for marker in ("auth", "token", "401", "403")):
                    error_msg = f"Authentication error during file upload: {upload_error}"
                else:
                    error_msg = f"OneDrive API error during file upload: {upload_error}"

                execution_status = f"infrastructure failure: {error_msg}"
                print(f"❌ {error_msg}")

                # Only fields that EvaluationResult actually defines may be passed;
                # unknown keywords would raise TypeError and the outer handler
                # would then replace this classification with a generic one.
                return EvaluationResult(
                    task_id=task.task_id,
                    goal=task.goal,
                    success=False,
                    score=None,
                    reason=execution_status,
                    error_message=error_msg,
                    evaluation_time_seconds=(datetime.now() - start_time).total_seconds(),
                    execution_status=execution_status,
                    verification_status=None,
                )

            print("Running agent execution...")

            # Run the agent evaluation (this includes start_evaluation internally)
            agent_result = env.evaluate_agent(
                agent=agent,
                file_path=remote_path,
                task_instruction=task.goal,
                save_screenshots=save_screenshots,
                save_online_images=save_online_images,
                screenshot_dir=screenshots_dir,
            )

            print("Agent execution completed.")

            # Calculate execution time
            execution_time = (datetime.now() - start_time).total_seconds()

            # Get downloaded file path for later verification if needed
            downloaded_file_path = agent_result.get("downloaded_file_path")
            if downloaded_file_path and Path(downloaded_file_path).exists():
                print(f"Agent downloaded file to: {downloaded_file_path}")
            else:
                print("No file was downloaded by the agent")

            # Determine execution status from agent result
            execution_status = agent_result.get("execution_status", "success")
            verification_status = agent_result.get("verification_status")

            # Extract agent steps from result
            agent_steps = agent_result.get("steps_taken", 0)

            # Store task result info for potential later verification
            task_info_file = task_results_dir / "task_execution_info.json"
            with open(task_info_file, "w", encoding="utf-8") as f:
                json.dump(
                    {
                        "task_id": task.task_id,
                        "remote_path": remote_path,
                        "local_file_path": str(local_file_path),
                        "downloaded_file_path": downloaded_file_path,
                        "execution_time_seconds": execution_time,
                        "success": agent_result.get("success", False),
                        "execution_status": execution_status,
                        "verification_status": verification_status,
                        "agent_steps": agent_steps,
                        "agent_result": agent_result,
                        "timestamp": timestamp,
                    },
                    f,
                    indent=2,
                )

            # Check if this is an infrastructure failure
            is_infrastructure_failure = bool(execution_status) and execution_status.startswith("infrastructure failure")

            return EvaluationResult(
                task_id=task.task_id,
                goal=task.goal,
                success=bool(not is_infrastructure_failure and agent_result.get("success", False)),
                reason="Task execution completed successfully" if not is_infrastructure_failure else execution_status,
                evaluation_time_seconds=execution_time,
                agent_steps=agent_steps,
                screenshots_saved=agent_result.get("screenshots", []) if save_screenshots else None,
                execution_status=execution_status,
                verification_status=verification_status,
            )

        except Exception as e:
            error_msg = f"Error executing task {task.task_id}: {str(e)}"
            print(f"❌ {error_msg}")
            print(f"Error details: {traceback.format_exc()}")

            execution_time = (datetime.now() - start_time).total_seconds()

            # Anything that escapes the steps above is reported as an
            # infrastructure failure; network-looking errors get a more specific label.
            lowered = str(e).lower()
            execution_status = "infrastructure failure: task execution error"
            if "connection" in lowered or "network" in lowered or "timeout" in lowered:
                execution_status = f"infrastructure failure: network error - {str(e)}"

            return EvaluationResult(
                task_id=task.task_id,
                goal=task.goal,
                success=False,
                error_message=error_msg,
                evaluation_time_seconds=execution_time,
                execution_status=execution_status,
                verification_status=None,
            )

        finally:
            # Runs on every exit path, including the early upload-failure return.
            if env:
                env.close()

    def verify_task(
        self, task: Task, task_execution_info_path: Optional[str] = None, conversion_mode: str = "online", use_cached_original_images: bool = True, config: Optional[AgentConfig] = None
    ) -> EvaluationResult:
        """
        Verify the results of a previously executed task.

        Reads the execution info JSON written by ``execute_task``, locates (or
        downloads from OneDrive) the modified file, and grades it with
        ``task.grader``.

        Args:
            task: Task to verify
            task_execution_info_path: Path to task execution info JSON file (if None, looks in results_dir)
            conversion_mode: Forwarded unchanged to ``task.grader.evaluate``
            use_cached_original_images: Forwarded unchanged to ``task.grader.evaluate``
            config: Accepted for interface symmetry with ``execute_task``; this
                method does not currently use it

        Returns:
            EvaluationResult with verification outcomes. Failures are reported
            through ``verification_status`` / ``error_message`` rather than raised.
        """
        start_time = datetime.now()
        print(f"\n{'='*60}")
        print(f"Verifying Task: {task.task_id}")
        print(f"{'='*60}")

        # Create task-specific results directory
        task_results_dir = self.results_dir / task.task_id
        task_results_dir.mkdir(exist_ok=True)

        try:
            # Load execution info
            if task_execution_info_path:
                info_file = Path(task_execution_info_path)
            else:
                info_file = task_results_dir / "task_execution_info.json"

            if not info_file.exists():
                raise FileNotFoundError(f"Task execution info not found: {info_file}")

            with open(info_file, "r", encoding="utf-8") as f:
                execution_info = json.load(f)

            local_file_path = Path(execution_info["local_file_path"])
            downloaded_file_path = execution_info.get("downloaded_file_path")

            # Explicit state for the download fallback, so later branches never
            # have to guess whether a download was attempted or why it failed.
            modified_file_path: Optional[Path] = None
            download_error_msg: Optional[str] = None

            # Get modified file for verification
            if downloaded_file_path and Path(downloaded_file_path).exists():
                modified_file_path = Path(downloaded_file_path)
                print(f"Using downloaded file: {modified_file_path}")
            else:
                # Fallback: download the file manually
                print("Downloading file for verification...")
                download_dir = task_results_dir / "screenshots"
                download_dir.mkdir(exist_ok=True)

                from ..utils.onedrive import OneDriveClient

                download_client = OneDriveClient(client_id=self.client_id, root_path=self.onedrive_root_path)
                try:
                    downloaded_path = download_client.download_file(execution_info["remote_path"], str(download_dir))
                    modified_file_path = Path(downloaded_path)
                    print(f"Downloaded file to: {modified_file_path}")
                except Exception as download_error:
                    # Download failures during verification are not execution
                    # failures (verification can be rerun separately); they only
                    # mark this verification attempt as errored.
                    lowered = str(download_error).lower()
                    if any(marker in lowered for marker in ("rate", "429", "throttl")):
                        download_error_msg = f"OneDrive rate limit exceeded during verification download: {download_error}"
                    elif any(marker in lowered for marker in ("network", "connection", "timeout")):
                        download_error_msg = f"Network error during verification download: {download_error}"
                    elif any(marker in lowered for marker in ("auth", "token", "401", "403")):
                        download_error_msg = f"Authentication error during verification download: {download_error}"
                    else:
                        download_error_msg = f"OneDrive API error during verification download: {download_error}"

                    # Don't return early - continue with no modified file for verification
                    print(f"❌ Verification download failed: {download_error_msg}")
                    print("Verification will be skipped due to download failure")

            # Verify the task completion using the grader
            if task.grader and modified_file_path is not None:
                print("Running verification with task grader...")
                try:
                    verification_result = task.grader.evaluate(
                        original_file_path=str(local_file_path),
                        modified_file_path=str(modified_file_path),
                        include_reason=True,
                        conversion_mode=conversion_mode,
                        use_cached_original_images=use_cached_original_images,
                    )

                    if isinstance(verification_result, tuple):
                        score, reason = verification_result
                    else:
                        score = verification_result
                        reason = "No detailed reason provided"

                    print(f"✅ Verification Score: {score}")
                    print(f"📝 Verification Reason: {reason}")

                    success = int(score) == 1 if score is not None else False
                    verification_status = "success" if success else "failed: rubric evaluation"

                except Exception as verification_error:
                    # Any grader failure is a rubric error with no score; only the
                    # reason text distinguishes rate limiting from other errors.
                    error_str = str(verification_error).lower()
                    if any(keyword in error_str for keyword in ["rate", "429", "throttl", "quota", "limit"]):
                        reason = f"Rate limiting error during verification: {verification_error}"
                    else:
                        reason = f"Verification error: {verification_error}"

                    verification_status = "rubric error"
                    score = None
                    success = False
                    print(f"❌ Verification failed: {reason}")

                task.grader.save_as_file(task_results_dir / "scored_rubric.json")
            elif task.grader:
                # A grader exists but there is nothing to grade: the download failed.
                print("⚠️ Skipping verification - modified file not available")
                score = None
                success = False
                reason = "Verification skipped - modified file download failed"
                verification_status = "rubric error"
            else:
                print("⚠️  No grader available for verification")
                score = None
                reason = "No grader configured for this task"
                success = False
                verification_status = "rubric error"

            # Calculate verification time
            verification_time = (datetime.now() - start_time).total_seconds()

            return EvaluationResult(
                task_id=task.task_id,
                goal=task.goal,
                success=success,
                score=score,
                reason=reason,
                # Surface the classified download error instead of discarding it.
                error_message=download_error_msg,
                evaluation_time_seconds=verification_time,
                details=f"Verified using grader: {task.grader.__class__.__name__ if task.grader else 'None'}",
                verification_status=verification_status,
            )

        except Exception as e:
            error_msg = f"Error verifying task {task.task_id}: {str(e)}"
            print(f"❌ {error_msg}")
            print(f"Error details: {traceback.format_exc()}")

            verification_time = (datetime.now() - start_time).total_seconds()

            # Default to "rubric error"; rate-limit/quota errors keep that status
            # even if the message also mentions a missing image or file.
            lowered = str(e).lower()
            verification_status = "rubric error"
            if not ("rate limit" in lowered or "quota" in lowered):
                is_absent = "missing" in lowered or "not found" in lowered
                if "image" in lowered and is_absent:
                    verification_status = "failed: image missing"
                elif "file" in lowered and is_absent:
                    verification_status = "failed: file missing"

            return EvaluationResult(
                task_id=task.task_id,
                goal=task.goal,
                success=False,
                error_message=error_msg,
                evaluation_time_seconds=verification_time,
                verification_status=verification_status,
                score=None,  # Infrastructure failures should have null score
            )

    def evaluate_task(
        self,
        task: Task,
        agent: BaseAgent,
        timeout_minutes: int = 10,
        save_screenshots: bool = True,
        conversion_mode: str = "online",
        use_cached_original_images: bool = True,
        config: Optional[AgentConfig] = None,
    ) -> EvaluationResult:
        """
        Execute and verify a single task with the given agent (convenience method).

        Verification is skipped only when execution reports an infrastructure
        failure; an agent that ran out of steps or otherwise fell short is still
        graded on what it produced.

        Args:
            task: Task to evaluate
            agent: Agent to perform the task
            timeout_minutes: Maximum time for task execution (passed through to ``execute_task``)
            save_screenshots: Whether to save screenshots during evaluation
            conversion_mode: Passed to ``verify_task``; online images are saved
                during execution only when this is ``"online"``
            use_cached_original_images: Passed to ``verify_task``
            config: Optional agent config passed to both stages

        Returns:
            EvaluationResult with both execution and verification outcomes, or
            the bare execution result when execution had an infrastructure failure
        """
        start_time = datetime.now()
        print(f"\n{'='*60}")
        print(f"Evaluating Task (Execute + Verify): {task.task_id}")
        print(f"Goal: {task.goal}")
        print(f"File: {task.file_path}")
        print(f"{'='*60}")

        try:
            # Execute the task. save_online_images follows conversion_mode, the
            # same rule evaluate_all_tasks applies in "execute" mode.
            execution_result = self.execute_task(
                task,
                agent,
                timeout_minutes,
                save_screenshots,
                save_online_images=conversion_mode == "online",
                config=config,
            )

            # Only skip verification if this was an infrastructure failure
            # Even if the agent ran out of steps or failed, we should still verify what they accomplished
            status = execution_result.execution_status
            is_infrastructure_failure = bool(status) and status.startswith("infrastructure failure")

            if is_infrastructure_failure:
                print("❌ Task execution had infrastructure failure, skipping verification")
                return execution_result

            # Verify the task (even if execution wasn't fully successful)
            verification_result = self.verify_task(task, conversion_mode=conversion_mode, use_cached_original_images=use_cached_original_images, config=config)

            # Combine results
            total_time = (datetime.now() - start_time).total_seconds()

            return EvaluationResult(
                task_id=task.task_id,
                goal=task.goal,
                success=verification_result.success,
                score=verification_result.score,
                reason=verification_result.reason,
                details=f"Execution: {execution_result.reason}; Verification: {verification_result.details}",
                error_message=verification_result.error_message,
                evaluation_time_seconds=total_time,
                agent_steps=execution_result.agent_steps,
                screenshots_saved=execution_result.screenshots_saved,
                execution_status=execution_result.execution_status,
                verification_status=verification_result.verification_status,
            )

        except Exception as e:
            error_msg = f"Error in full evaluation of task {task.task_id}: {str(e)}"
            print(f"❌ {error_msg}")
            # Match execute_task/verify_task, which also print the traceback.
            print(f"Error details: {traceback.format_exc()}")

            total_time = (datetime.now() - start_time).total_seconds()

            return EvaluationResult(
                task_id=task.task_id,
                goal=task.goal,
                success=False,
                error_message=error_msg,
                evaluation_time_seconds=total_time,
            )

    def evaluate_all_tasks(
        self,
        tasks: Dict[str, Task],
        agent: BaseAgent,
        timeout_minutes: int = 10,
        save_screenshots: bool = True,
        task_filter: Optional[List[str]] = None,
        mode: str = "evaluate",  # "execute", "verify", or "evaluate"
        conversion_mode: str = "online",
        use_cached_original_images: bool = True,
        config: Optional[AgentConfig] = None,
    ) -> List[EvaluationResult]:
        """
        Run every (optionally filtered) task in the chosen mode and summarize.

        Each result is appended to ``self.results``, written to
        ``<results_dir>/<task_id>/result_<mode>.json``, and included in
        ``<results_dir>/summary_<mode>.json``.

        Args:
            tasks: Dictionary of tasks to process
            agent: Agent to perform the tasks (not needed for verify mode)
            timeout_minutes: Maximum time per task
            save_screenshots: Whether to save screenshots (execute/evaluate modes)
            task_filter: Optional list of strings; a task runs when any of them
                is a substring of its task ID
            mode: "execute" (run only), "verify" (verify only), or "evaluate" (both).
                Any other value yields one error result per task.
            conversion_mode: Passed to verification; in execute mode it also
                decides whether online images are saved (only for "online")
            use_cached_original_images: Passed to verification
            config: Optional agent config passed to each stage

        Returns:
            List of evaluation results, one per processed task
        """
        print(f"\n🔄 Processing {len(tasks)} tasks in '{mode}' mode...")

        # Filter tasks if specified
        if task_filter:
            filtered_tasks = {task_id: task for task_id, task in tasks.items() if any(f in task_id for f in task_filter)}
            print(f"Filtered to {len(filtered_tasks)} tasks matching filter: {task_filter}")
        else:
            filtered_tasks = tasks

        results: List[EvaluationResult] = []
        total_tasks = len(filtered_tasks)

        for i, (task_id, task) in enumerate(filtered_tasks.items(), 1):
            print(f"\n📋 Task {i}/{total_tasks}: {task_id}")

            try:
                if mode == "execute":
                    result = self.execute_task(task, agent, timeout_minutes, save_screenshots, save_online_images=conversion_mode == "online", config=config)
                elif mode == "verify":
                    result = self.verify_task(task, conversion_mode=conversion_mode, use_cached_original_images=use_cached_original_images, config=config)
                elif mode == "evaluate":
                    # Forward the verification options; without them evaluate
                    # mode would silently fall back to evaluate_task's defaults.
                    result = self.evaluate_task(
                        task,
                        agent,
                        timeout_minutes,
                        save_screenshots,
                        conversion_mode=conversion_mode,
                        use_cached_original_images=use_cached_original_images,
                        config=config,
                    )
                else:
                    raise ValueError(f"Invalid mode: {mode}. Must be 'execute', 'verify', or 'evaluate'")
            except Exception as e:
                error_msg = f"Error processing task {task_id}: {str(e)}"
                print(f"❌ {error_msg}")

                error_result = EvaluationResult(
                    task_id=task_id,
                    goal=getattr(task, "goal", "Unknown"),
                    success=False,
                    error_message=error_msg,
                    evaluation_time_seconds=0.0,
                )
                results.append(error_result)
                self.results.append(error_result)
                continue

            results.append(result)
            self.results.append(result)

            # Persisting is kept apart from running the task: a write failure
            # must not add a second (error) entry for a task already recorded.
            try:
                task_results_dir = self.results_dir / task.task_id
                task_results_dir.mkdir(exist_ok=True)
                result_file = task_results_dir / f"result_{mode}.json"
                with open(result_file, "w", encoding="utf-8") as f:
                    json.dump(result.to_dict(), f, indent=2)
            except (OSError, TypeError, ValueError) as save_error:
                print(f"⚠️  Could not save result for task {task_id}: {save_error}")

            print(f"✅ Task {task_id} completed ({'✅ Success' if result.success else '❌ Failed'})")

        # Aggregate once and reuse for both the summary file and the console.
        successful_count = sum(1 for r in results if r.success)
        failed_count = len(results) - successful_count
        # `is not None` rather than truthiness: a score of 0 is a real score
        # and must count toward the average.
        scored = [r.score for r in results if r.score is not None]
        average_score = sum(scored) / len(scored) if scored else None

        # Save summary results
        summary = {
            "mode": mode,
            "total_tasks": total_tasks,
            "successful_tasks": successful_count,
            "failed_tasks": failed_count,
            "average_score": average_score,
            "results": [r.to_dict() for r in results],
        }
        summary_file = self.results_dir / f"summary_{mode}.json"
        with open(summary_file, "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2)

        print(f"\n📊 {mode.title()} Summary:")
        print(f"   Total tasks: {total_tasks}")
        print(f"   Successful: {successful_count}")
        print(f"   Failed: {failed_count}")
        if average_score is not None:
            print(f"   Average score: {average_score:.3f}")

        return results

    def save_results(self, filename: str) -> None:
        """
        Save evaluation results to a specified file.

        Args:
            filename: Path to save the results JSON file
        """
        results_file = Path(filename)
        results_file.parent.mkdir(exist_ok=True, parents=True)

        summary = {
            "timestamp": datetime.now().isoformat(),
            "total_tasks": len(self.results),
            "successful_tasks": sum(1 for r in self.results if r.success),
            "failed_tasks": sum(1 for r in self.results if not r.success),
            "average_score": None,
            "results": [r.to_dict() for r in self.results],
        }

        # Calculate average score for successful tasks with scores
        scores = [r.score for r in self.results if r.success and r.score is not None]
        if scores:
            summary["average_score"] = sum(scores) / len(scores)

        with open(results_file, "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2)

        print(f"Results saved to: {results_file}")

    def save_summary_results(self) -> None:
        """Save summary of all evaluation results."""
        summary_file = self.results_dir / "evaluation_summary.json"

        summary = {
            "timestamp": datetime.now().isoformat(),
            "total_tasks": len(self.results),
            "successful_tasks": sum(1 for r in self.results if r.success),
            "failed_tasks": sum(1 for r in self.results if not r.success),
            "average_score": None,
            "results": [r.to_dict() for r in self.results],
        }

        # Calculate average score for successful tasks with scores
        scores = [r.score for r in self.results if r.success and r.score is not None]
        if scores:
            summary["average_score"] = sum(scores) / len(scores)

        with open(summary_file, "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2)

        print("\n📊 Evaluation Summary:")
        print(f"Total tasks: {summary['total_tasks']}")
        print(f"Successful: {summary['successful_tasks']}")
        print(f"Failed: {summary['failed_tasks']}")
        if summary["average_score"] is not None:
            print(f"Average score: {summary['average_score']:.2f}")
        print(f"Results saved to: {summary_file}")

    def get_results_summary(self) -> Dict[str, Any]:
        """Get a summary of evaluation results."""
        successful = [r for r in self.results if r.success]
        failed = [r for r in self.results if not r.success]

        scores = [r.score for r in successful if r.score is not None]

        return {
            "total_tasks": len(self.results),
            "successful_tasks": len(successful),
            "failed_tasks": len(failed),
            "average_score": sum(scores) / len(scores) if scores else None,
            "min_score": min(scores) if scores else None,
            "max_score": max(scores) if scores else None,
            "results_directory": str(self.temp_folder),
        }

    def cleanup(self) -> None:
        """
        Clean up the temp folder if cleanup_temp_folder is True.
        """
        if self.cleanup_temp_folder and self.is_temp_dir and self.temp_folder.exists():
            try:
                shutil.rmtree(self.temp_folder)
                print(f"Cleaned up temp folder: {self.temp_folder}")
            except Exception as e:
                print(f"Warning: Failed to clean up temp folder {self.temp_folder}: {e}")

    def __del__(self):
        """
        Destructor to clean up temp folder if needed.
        """
        try:
            self.cleanup()
        except Exception:
            # Ignore cleanup errors in destructor
            pass

    def __enter__(self):
        """Enter the ``with`` block; returns this executor itself so it can be bound with ``as``."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit with cleanup."""
        self.cleanup()
        return False
