"""
Data evaluators for WebArena Verified expected_data field validation.

This module provides individual evaluators for different types of expected data
validation (exact_match, must_include, template_match) and a
registry system to manage them.
"""

import logging
import re
import unicodedata
from abc import ABC, abstractmethod
from collections import Counter
from collections.abc import Callable
from typing import Any, Optional

from .models import AllocationResource
from jinja2 import Template

from .types import (
    EvalFunc,
    WebArenaTask,
    WebarenaTaskEvalResult,
    WebArenaTaskResponse,
)

logger = logging.getLogger(__name__)


def unicode_to_ascii(unicode_string: str) -> str:
    """
    Return the closest ASCII rendering of ``unicode_string``.

    The text is decomposed with NFKD so that accented letters split into a
    base letter plus combining marks (e.g. "ó" becomes "o" + accent); every
    code point that is not ASCII is then dropped.
    """
    decomposed = unicodedata.normalize("NFKD", unicode_string)
    # "ignore" silently discards anything outside the ASCII range.
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode("utf-8")


def _convert_to_ascii(data: Any) -> Any:
    """
    Return a copy of ``data`` with every string converted to ASCII.

    Lists and dictionaries are walked recursively. Dictionary keys are kept
    as they are; only the values are converted. Any other type is returned
    unchanged.
    """
    if isinstance(data, dict):
        converted = {}
        for key, value in data.items():
            converted[key] = _convert_to_ascii(value)
        return converted
    if isinstance(data, list):
        return list(map(_convert_to_ascii, data))
    return unicode_to_ascii(data) if isinstance(data, str) else data


class ArrayComparisonHelper:
    """Helper class for common array comparison and assertion logic.

    Every public method is static and returns a ``WebarenaTaskEvalResult``
    with ``score=1.0`` on a match and ``score=0.0`` otherwise; the
    ``assertion_msgs`` list carries the human-readable diagnostics.
    """

    @staticmethod
    def _multiset_difference(left: list[Any], right: list[Any]) -> list[Any]:
        """Return the items of ``left`` not covered by ``right``, honouring duplicates."""
        try:
            left_counts = Counter(left)
            right_counts = Counter(right)
        except TypeError:
            # Counter needs hashable items. For unhashable values (lists,
            # dicts, ...) fall back to equality-based removal.
            remaining = list(right)
            leftover = []
            for item in left:
                try:
                    remaining.remove(item)
                except ValueError:
                    leftover.append(item)
            return leftover

        leftover = []
        for item, count in left_counts.items():
            shortfall = count - right_counts.get(item, 0)
            if shortfall > 0:
                leftover.extend([item] * shortfall)
        return leftover

    @staticmethod
    def _diff_by_count(
        expected_data: list[Any],
        actual_results: list[Any],
        normalized_expected: list[Any],
        normalized_actual: list[Any],
    ) -> tuple[bool, list[Any], list[Any]]:
        """Multiset comparison by exact equality; returns (matched, missing, extra).

        Raises ``TypeError`` when a normalized value is unhashable.
        """
        expected_counter = Counter(normalized_expected)
        actual_counter = Counter(normalized_actual)
        if expected_counter == actual_counter:
            return True, [], []

        missing_items = []
        extra_items = []

        # Expected values that occur fewer times in the actual results.
        for item, count in expected_counter.items():
            actual_count = actual_counter.get(item, 0)
            if actual_count < count:
                # Report the original (un-normalized) values for readability.
                orig_values = [
                    orig
                    for orig, norm in zip(expected_data, normalized_expected)
                    if norm == item
                ]
                missing_items.extend(orig_values[: count - actual_count])

        # Actual values that occur more often than expected.
        for item, count in actual_counter.items():
            expected_count = expected_counter.get(item, 0)
            if count > expected_count:
                orig_values = [
                    orig
                    for orig, norm in zip(actual_results, normalized_actual)
                    if norm == item
                ]
                extra_items.extend(orig_values[: count - expected_count])

        return False, missing_items, extra_items

    @staticmethod
    def _diff_by_matching(
        expected_data: list[Any],
        actual_results: list[Any],
        normalized_expected: list[Any],
        normalized_actual: list[Any],
        comparison_func: Callable[[Any, Any], bool],
    ) -> tuple[bool, list[Any], list[Any]]:
        """Pair expected with actual values via ``comparison_func``; returns (matched, missing, extra).

        A maximum one-to-one pairing is computed with augmenting paths, so the
        result does not depend on element order even when the comparator is
        not transitive (for example a tolerance-based comparison). The original
        lists are assumed to be index-aligned with their normalized versions.
        """
        # Evaluate the comparator exactly once per (expected, actual) pair.
        compatible = [
            [bool(comparison_func(exp, act)) for act in normalized_actual]
            for exp in normalized_expected
        ]
        owner_of_actual: dict[int, int] = {}  # actual index -> expected index

        def try_assign(exp_idx: int, visited: set[int]) -> bool:
            for act_idx, is_match in enumerate(compatible[exp_idx]):
                if not is_match or act_idx in visited:
                    continue
                visited.add(act_idx)
                current_owner = owner_of_actual.get(act_idx)
                # Take a free slot, or re-home the current owner elsewhere.
                if current_owner is None or try_assign(current_owner, visited):
                    owner_of_actual[act_idx] = exp_idx
                    return True
            return False

        unmatched_expected = [
            exp_idx
            for exp_idx in range(len(normalized_expected))
            if not try_assign(exp_idx, set())
        ]
        unmatched_actual = [
            act_idx
            for act_idx in range(len(normalized_actual))
            if act_idx not in owner_of_actual
        ]

        matched = not unmatched_expected and not unmatched_actual
        missing_items = [expected_data[i] for i in unmatched_expected]
        extra_items = [actual_results[i] for i in unmatched_actual]
        return matched, missing_items, extra_items

    @staticmethod
    def create_length_mismatch_result(
        expected_data: list[Any],
        actual_results: list[Any],
        task: WebArenaTask,
        task_type: str,
    ) -> WebarenaTaskEvalResult:
        """
        Create a standardized length mismatch result with detailed missing/extra item information.

        When more values were expected than received, the missing values are
        listed; otherwise the surplus actual values are listed. Values are
        compared by equality with duplicates taken into account. Unhashable
        values are supported.

        Args:
            expected_data: Expected values to report (``evaluate_array_comparison``
                passes the normalized values here)
            actual_results: Actual values to report
            task: The WebArena task
            task_type: Type identifier for the result

        Returns:
            WebarenaTaskEvalResult with score 0.0 and detailed length mismatch information
        """
        error_msgs = [
            f"Length mismatch: Expected {len(expected_data)} values, got {len(actual_results)} values",
            f"Expected values: {expected_data}",
            f"Actual values: {actual_results}",
        ]

        if len(expected_data) > len(actual_results):
            missing_items = ArrayComparisonHelper._multiset_difference(
                expected_data, actual_results
            )
            missing_count = len(missing_items)
            error_msgs.append(
                f"Missing {missing_count} expected values: {missing_items}"
            )
        else:
            extra_items = ArrayComparisonHelper._multiset_difference(
                actual_results, expected_data
            )
            extra_count = len(extra_items)
            error_msgs.append(f"Extra {extra_count} actual values: {extra_items}")

        return WebarenaTaskEvalResult(
            score=0.0,
            assertion_msgs=error_msgs,
            task_id=task.task_id,
            task_description=task.intent,
            task_type=task_type,
        )

    @staticmethod
    def compare_ordered_arrays(
        expected_data: list[Any],
        actual_results: list[Any],
        normalized_expected: list[Any],
        normalized_actual: list[Any],
        task: WebArenaTask,
        task_type: str,
        comparison_func: Optional[Callable[[Any, Any], bool]] = None,
    ) -> WebarenaTaskEvalResult:
        """
        Compare two arrays in ordered fashion and return appropriate result.

        Args:
            expected_data: Original expected data (used in messages)
            actual_results: Original actual results (used in messages)
            normalized_expected: Normalized expected values for comparison
            normalized_actual: Normalized actual values for comparison
            task: The WebArena task
            task_type: Type identifier for the result
            comparison_func: Function to compare normalized values; ``None``
                (the default) means plain equality

        Returns:
            WebarenaTaskEvalResult with success or detailed mismatch information
        """
        # zip() would silently ignore the tail of the longer list and could
        # report a match on a common prefix, so reject unequal lengths first.
        if len(normalized_expected) != len(normalized_actual):
            return ArrayComparisonHelper.create_length_mismatch_result(
                normalized_expected, normalized_actual, task, task_type
            )

        if comparison_func is None:

            def comparison_func(x: Any, y: Any) -> bool:
                return x == y

        # Positions whose normalized values do not compare equal.
        mismatched_positions = {
            index
            for index, (exp, act) in enumerate(
                zip(normalized_expected, normalized_actual)
            )
            if not comparison_func(exp, act)
        }

        if not mismatched_positions:
            return WebarenaTaskEvalResult(
                score=1.0,
                assertion_msgs=[
                    f"Ordered {task_type} match successful. Expected values: {expected_data}, Actual values: {actual_results}"
                ],
                task_id=task.task_id,
                task_description=task.intent,
                task_type=task_type,
            )

        # Describe each mismatch using the original (un-normalized) values.
        mismatches = [
            f"Position {i}: Expected '{exp_orig}' but got '{act_orig}'"
            for i, (exp_orig, act_orig) in enumerate(
                zip(expected_data, actual_results)
            )
            if i in mismatched_positions
        ]

        return WebarenaTaskEvalResult(
            score=0.0,
            assertion_msgs=[
                f"Ordered {task_type} match failed. Expected values: {expected_data}, Actual values: {actual_results}",
                "Mismatches: " + "; ".join(mismatches),
            ],
            task_id=task.task_id,
            task_description=task.intent,
            task_type=task_type,
        )

    @staticmethod
    def compare_unordered_arrays(
        expected_data: list[Any],
        actual_results: list[Any],
        normalized_expected: list[Any],
        normalized_actual: list[Any],
        task: WebArenaTask,
        task_type: str,
        comparison_func: Optional[Callable[[Any, Any], bool]] = None,
    ) -> WebarenaTaskEvalResult:
        """
        Compare two arrays in unordered fashion and return appropriate result.

        Without ``comparison_func`` the normalized values are compared as
        multisets by exact equality. With ``comparison_func`` the values are
        paired one-to-one using that comparator, so custom equality (as used
        for ordered comparison) is honoured here as well.

        Args:
            expected_data: Original expected data (used in messages)
            actual_results: Original actual results (used in messages)
            normalized_expected: Normalized expected values for comparison
            normalized_actual: Normalized actual values for comparison
            task: The WebArena task
            task_type: Type identifier for the result
            comparison_func: Optional function to compare normalized values

        Returns:
            WebarenaTaskEvalResult with success or detailed missing/extra information
        """
        outcome = None
        if comparison_func is None:
            try:
                outcome = ArrayComparisonHelper._diff_by_count(
                    expected_data,
                    actual_results,
                    normalized_expected,
                    normalized_actual,
                )
            except TypeError:
                # Unhashable normalized values cannot be counted; pair them
                # up by plain equality instead.
                def comparison_func(x: Any, y: Any) -> bool:
                    return x == y

        if outcome is None:
            outcome = ArrayComparisonHelper._diff_by_matching(
                expected_data,
                actual_results,
                normalized_expected,
                normalized_actual,
                comparison_func,
            )

        matched, missing_items, extra_items = outcome

        if matched:
            return WebarenaTaskEvalResult(
                score=1.0,
                assertion_msgs=[
                    f"Unordered {task_type} match successful. Expected values: {expected_data}, Actual values: {actual_results}"
                ],
                task_id=task.task_id,
                task_description=task.intent,
                task_type=task_type,
            )

        error_msgs = [
            f"Unordered {task_type} match failed. Expected values: {expected_data}, Actual values: {actual_results}"
        ]
        if missing_items:
            error_msgs.append(f"Missing expected items: {missing_items}")
        if extra_items:
            error_msgs.append(f"Extra actual items: {extra_items}")

        return WebarenaTaskEvalResult(
            score=0.0,
            assertion_msgs=error_msgs,
            task_id=task.task_id,
            task_description=task.intent,
            task_type=task_type,
        )

    @staticmethod
    def evaluate_array_comparison(
        expected_data: list[Any],
        actual_results: list[Any],
        task: WebArenaTask,
        task_type: str,
        normalize_func: Callable[[Any], Any],
        ordered: bool = False,
        comparison_func: Optional[Callable[[Any, Any], bool]] = None,
    ) -> WebarenaTaskEvalResult:
        """
        Complete array comparison workflow with normalization and comparison.

        Steps: normalize both lists, fail on a length mismatch, then compare
        either position by position (``ordered=True``) or as multisets.

        Args:
            expected_data: Original expected data list
            actual_results: Original actual results list
            task: The WebArena task
            task_type: Type identifier for the result
            normalize_func: Function to normalize individual values; a
                ``ValueError`` it raises is reported as a normalization error
            ordered: Whether to perform ordered comparison
            comparison_func: Function to compare normalized values; ``None``
                (the default) means plain equality. Applied to both the
                ordered and the unordered comparison.

        Returns:
            WebarenaTaskEvalResult with appropriate success or failure information
        """
        try:
            # Normalize all values
            normalized_expected = [normalize_func(item) for item in expected_data]
            normalized_actual = [normalize_func(item) for item in actual_results]
        except ValueError as e:
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[f"Normalization error: {str(e)}"],
                task_id=task.task_id,
                task_description=task.intent,
                task_type=task_type,
            )

        # Check if lengths match
        if len(normalized_expected) != len(normalized_actual):
            return ArrayComparisonHelper.create_length_mismatch_result(
                normalized_expected, normalized_actual, task, task_type
            )

        # Compare based on ordered flag
        if ordered:
            return ArrayComparisonHelper.compare_ordered_arrays(
                expected_data,
                actual_results,
                normalized_expected,
                normalized_actual,
                task,
                task_type,
                comparison_func,
            )

        return ArrayComparisonHelper.compare_unordered_arrays(
            expected_data,
            actual_results,
            normalized_expected,
            normalized_actual,
            task,
            task_type,
            comparison_func,
        )


class DataEvaluator(ABC):
    """Interface that every expected-data evaluator implements."""

    @abstractmethod
    async def evaluate(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """
        Score a task result against the expected data.

        Args:
            task: The WebArena task under evaluation
            task_result: The response produced by running the task
            resource: The resource allocated for the run
            eval_func: The evaluation specification; the evaluators in this
                module read its ``expected_data`` and ``eval_params``
            **kwargs: Evaluator-specific extras

        Returns:
            WebarenaTaskEvalResult: The outcome of the evaluation
        """

    @abstractmethod
    def can_evaluate(self, type: str) -> bool:
        """
        Tell whether this evaluator handles the given evaluation type.

        Args:
            type: The evaluation type being asked about

        Returns:
            bool: True when the type is supported by this evaluator
        """

    @property
    @abstractmethod
    def evaluator_type(self) -> str:
        """
        Identify this evaluator.

        Returns:
            str: The evaluator's type identifier (e.g., "text", "numeric")
        """


class TextMatchEvaluator(DataEvaluator):
    """Evaluator for text type validation.

    Values are compared as trimmed strings, lowercased unless the evaluation
    parameters set ``case_sensitive``.
    """

    async def evaluate(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        ordered: bool = False,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """Evaluate using text matching with list comparison.

        Args:
            task: The WebArena task being evaluated
            task_result: The task response; ``task_result.response.results``
                supplies the actual values (a falsy value is treated as ``[]``)
            resource: The allocated resource (not used by this evaluator)
            eval_func: Supplies ``expected_data`` (must be a non-empty list)
                and the optional ``eval_params["case_sensitive"]`` flag
            ordered: Whether positions must match as well as values
            **kwargs: Ignored

        Returns:
            WebarenaTaskEvalResult: score 1.0 on a match, 0.0 otherwise. Any
            exception is converted into a score-0.0 result rather than raised.
        """
        try:
            expected_data = eval_func.expected_data

            # Expected data should be a list
            if not isinstance(expected_data, list):
                return WebarenaTaskEvalResult(
                    score=0.0,
                    assertion_msgs=[
                        "Text match evaluation requires expected_data to be a list, got: "
                        + str(type(expected_data))
                    ],
                    task_id=task.task_id,
                    task_description=task.intent,
                    task_type="text",
                )

            if not expected_data:
                return WebarenaTaskEvalResult(
                    score=0.0,
                    assertion_msgs=["No expected values provided for text matching"],
                    task_id=task.task_id,
                    task_description=task.intent,
                    task_type="text",
                )

            # Get actual results from task_result
            actual_results = (
                task_result.response.results if task_result.response.results else []
            )

            # The flag is the same for every element, so resolve it once
            # instead of on each normalization call.
            case_sensitive = False
            if eval_func.eval_params:
                case_sensitive = eval_func.eval_params.get("case_sensitive", False)

            def normalize_with_case_sensitivity(text):
                return self._normalize_text(text, case_sensitive)

            return ArrayComparisonHelper.evaluate_array_comparison(
                expected_data,
                actual_results,
                task,
                "text",
                normalize_with_case_sensitivity,
                ordered=ordered,
            )

        except Exception as e:
            # Best-effort boundary: report the failure as a zero score, but
            # keep the traceback available at debug level.
            logger.debug("Error in text match evaluation: %s", e, exc_info=True)
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[f"Text match evaluation error: {str(e)}"],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="text",
            )

    def _normalize_text(self, value, case_sensitive: bool = False) -> str:
        """
        Normalize a value to text for comparison.
        Handles case-insensitive or case-sensitive text comparison based on parameter.

        Note that ``None`` normalizes to ``""``, whereas a value whose string
        form is empty or whitespace-only is rejected.

        Args:
            value: The value to normalize (can be any type)
            case_sensitive: If True, preserve case; if False, convert to lowercase

        Returns:
            str: The normalized text value (trimmed, optionally lowercased)

        Raises:
            ValueError: If the value cannot be converted to text
        """
        try:
            # Handle None as empty string
            if value is None:
                return ""

            # Convert to string and normalize
            text = str(value).strip()

            # Handle empty strings
            if not text:
                raise ValueError("Cannot convert empty string to text")

            # Apply case normalization based on parameter
            if not case_sensitive:
                text = text.lower()

            return text

        except (ValueError, TypeError) as e:
            # Re-raise with the offending value; chain the cause explicitly.
            raise ValueError(f"Cannot convert '{value}' to text: {str(e)}") from e

    def can_evaluate(self, type: str) -> bool:
        """Check if this evaluator can handle the given type."""
        return type == "text"

    @property
    def evaluator_type(self) -> str:
        """Return the type identifier for this evaluator."""
        return "text"


class NumberMatchEvaluator(DataEvaluator):
    """Evaluator for numeric type validation.

    Values are converted to ``float`` before comparison, so ``1``, ``"1"``
    and ``"01"`` are all considered equal.
    """

    async def evaluate(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        ordered: bool = False,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """Evaluate using number matching with list comparison.

        Args:
            task: The WebArena task being evaluated
            task_result: The task response; ``task_result.response.results``
                supplies the actual values (a falsy value is treated as ``[]``)
            resource: The allocated resource (not used by this evaluator)
            eval_func: Supplies ``expected_data`` (must be a non-empty list)
            ordered: Whether positions must match as well as values
            **kwargs: Ignored

        Returns:
            WebarenaTaskEvalResult: score 1.0 on a match, 0.0 otherwise. Any
            exception is converted into a score-0.0 result rather than raised.
        """
        try:
            expected_data = eval_func.expected_data

            # Expected data should be a list
            if not isinstance(expected_data, list):
                return WebarenaTaskEvalResult(
                    score=0.0,
                    assertion_msgs=[
                        "Number match evaluation requires expected_data to be a list, got: "
                        + str(type(expected_data))
                    ],
                    task_id=task.task_id,
                    task_description=task.intent,
                    task_type="numeric",
                )

            if not expected_data:
                return WebarenaTaskEvalResult(
                    score=0.0,
                    assertion_msgs=["No expected values provided for number matching"],
                    task_id=task.task_id,
                    task_description=task.intent,
                    task_type="numeric",
                )

            # Get actual results from task_result
            actual_results = (
                task_result.response.results if task_result.response.results else []
            )

            # Use ArrayComparisonHelper to compare arrays
            return ArrayComparisonHelper.evaluate_array_comparison(
                expected_data,
                actual_results,
                task,
                "numeric",
                self._normalize_number,
                ordered=ordered,
            )

        except Exception as e:
            # Best-effort boundary: report the failure as a zero score, but
            # keep the traceback available at debug level.
            logger.debug("Error in number match evaluation: %s", e, exc_info=True)
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[f"Number match evaluation error: {str(e)}"],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="numeric",
            )

    def _normalize_number(self, value) -> float:
        """
        Normalize a value to a float for numeric comparison.
        Handles int, str, and mixed formats like 1, "1", "01", etc.

        Args:
            value: The value to normalize (can be int, float, str)

        Returns:
            float: The normalized numeric value

        Raises:
            ValueError: If the value cannot be converted to a number
        """
        try:
            # Handle None
            if value is None:
                raise ValueError("Cannot convert None to number")

            # Go through str() so ints, floats and numeric strings (including
            # ones with leading zeros or surrounding spaces) share one path.
            str_value = str(value).strip()

            # Handle empty strings
            if not str_value:
                raise ValueError("Cannot convert empty string to number")

            # Convert to float for comparison (handles int, float, and numeric strings)
            return float(str_value)

        except (ValueError, TypeError) as e:
            # Re-raise with the offending value; chain the cause explicitly.
            raise ValueError(f"Cannot convert '{value}' to number: {str(e)}") from e

    def can_evaluate(self, type: str) -> bool:
        """Check if this evaluator can handle the given type."""
        return type == "numeric"

    @property
    def evaluator_type(self) -> str:
        """Return the type identifier for this evaluator."""
        return "numeric"


class CurrencyMatchEvaluator(DataEvaluator):
    """Evaluator for currency type validation.

    Values are reduced to a plain ``float`` amount (currency symbols,
    whitespace and commas removed) before comparison.
    """

    # Characters stripped before parsing: common currency symbols
    # ($, €, £, ¥, ₹, ₽, ¢), whitespace and commas. Compiled once here
    # rather than on every normalization call.
    _CURRENCY_NOISE = re.compile(r"[\$€£¥₹₽¢\s,]")

    async def evaluate(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        ordered: bool = False,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """Evaluate using currency matching with list comparison.

        Args:
            task: The WebArena task being evaluated
            task_result: The task response; ``task_result.response.results``
                supplies the actual values (a falsy value is treated as ``[]``)
            resource: The allocated resource (not used by this evaluator)
            eval_func: Supplies ``expected_data`` (must be a non-empty list)
            ordered: Whether positions must match as well as values
            **kwargs: Ignored

        Returns:
            WebarenaTaskEvalResult: score 1.0 on a match, 0.0 otherwise. Any
            exception is converted into a score-0.0 result rather than raised.
        """
        try:
            expected_data = eval_func.expected_data

            # Expected data should be a list
            if not isinstance(expected_data, list):
                return WebarenaTaskEvalResult(
                    score=0.0,
                    assertion_msgs=[
                        "Currency match evaluation requires expected_data to be a list, got: "
                        + str(type(expected_data))
                    ],
                    task_id=task.task_id,
                    task_description=task.intent,
                    task_type="currency",
                )

            if not expected_data:
                return WebarenaTaskEvalResult(
                    score=0.0,
                    assertion_msgs=[
                        "No expected values provided for currency matching"
                    ],
                    task_id=task.task_id,
                    task_description=task.intent,
                    task_type="currency",
                )

            # Get actual results from task_result
            actual_results = (
                task_result.response.results if task_result.response.results else []
            )

            # Use ArrayComparisonHelper to compare arrays
            return ArrayComparisonHelper.evaluate_array_comparison(
                expected_data,
                actual_results,
                task,
                "currency",
                self._normalize_currency,
                ordered=ordered,
            )

        except Exception as e:
            # Best-effort boundary: report the failure as a zero score, but
            # keep the traceback available at debug level.
            logger.debug("Error in currency match evaluation: %s", e, exc_info=True)
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[f"Currency match evaluation error: {str(e)}"],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="currency",
            )

    def _normalize_currency(self, value) -> float:
        """
        Normalize a value to a currency amount for comparison.
        Handles various formats like "$3", "3", "$3.00", "3.00", "€3", etc.
        Note: This is format normalization only, no currency exchange.

        A value consisting solely of stripped characters (e.g. just "$")
        normalizes to ``0.0``.

        Args:
            value: The value to normalize (can be str, int, float with currency symbols)

        Returns:
            float: The normalized currency amount

        Raises:
            ValueError: If the value cannot be converted to a valid currency amount
        """
        try:
            # Handle None
            if value is None:
                raise ValueError("Cannot convert None to currency")

            # Convert to string and normalize
            str_value = str(value).strip()

            # Handle empty strings
            if not str_value:
                raise ValueError("Cannot convert empty string to currency")

            # Remove common currency symbols, spaces, and commas
            cleaned = CurrencyMatchEvaluator._CURRENCY_NOISE.sub("", str_value)

            # Handle empty string after cleaning (e.g., just "$")
            if not cleaned:
                return 0.0

            # Parse as float
            return float(cleaned)

        except (ValueError, TypeError) as e:
            # Re-raise with the offending value; chain the cause explicitly.
            raise ValueError(f"Cannot convert '{value}' to currency: {str(e)}") from e

    def can_evaluate(self, type: str) -> bool:
        """Check if this evaluator can handle the given type."""
        return type == "currency"

    @property
    def evaluator_type(self) -> str:
        """Return the type identifier for this evaluator."""
        return "currency"


class MonthMatchEvaluator(DataEvaluator):
    """Evaluator for month type validation.

    Values are converted to a month number (1-12) before comparison, so
    ``1``, ``"01"``, ``"Jan"`` and ``"January"`` are all considered equal.
    """

    # Lower-case month names and abbreviations mapped to their month number.
    # Built once here rather than on every normalization call.
    _MONTH_NAMES = {
        "january": 1,
        "jan": 1,
        "february": 2,
        "feb": 2,
        "march": 3,
        "mar": 3,
        "april": 4,
        "apr": 4,
        "may": 5,
        "june": 6,
        "jun": 6,
        "july": 7,
        "jul": 7,
        "august": 8,
        "aug": 8,
        "september": 9,
        "sep": 9,
        "sept": 9,
        "october": 10,
        "oct": 10,
        "november": 11,
        "nov": 11,
        "december": 12,
        "dec": 12,
    }

    async def evaluate(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        ordered: bool = False,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """Evaluate using month matching with list comparison.

        Args:
            task: The WebArena task being evaluated
            task_result: The task response; ``task_result.response.results``
                supplies the actual values (a falsy value is treated as ``[]``)
            resource: The allocated resource (not used by this evaluator)
            eval_func: Supplies ``expected_data`` (must be a non-empty list)
            ordered: Whether positions must match as well as values
            **kwargs: Ignored

        Returns:
            WebarenaTaskEvalResult: score 1.0 on a match, 0.0 otherwise. Any
            exception is converted into a score-0.0 result rather than raised.
        """
        try:
            expected_data = eval_func.expected_data

            # Expected data should be a list
            if not isinstance(expected_data, list):
                return WebarenaTaskEvalResult(
                    score=0.0,
                    assertion_msgs=[
                        "Month match evaluation requires expected_data to be a list, got: "
                        + str(type(expected_data))
                    ],
                    task_id=task.task_id,
                    task_description=task.intent,
                    task_type="month",
                )

            if not expected_data:
                return WebarenaTaskEvalResult(
                    score=0.0,
                    assertion_msgs=["No expected values provided for month matching"],
                    task_id=task.task_id,
                    task_description=task.intent,
                    task_type="month",
                )

            # Get actual results from task_result
            actual_results = (
                task_result.response.results if task_result.response.results else []
            )

            # Use ArrayComparisonHelper to compare arrays
            return ArrayComparisonHelper.evaluate_array_comparison(
                expected_data,
                actual_results,
                task,
                "month",
                self._normalize_month,
                ordered=ordered,
            )

        except Exception as e:
            # Best-effort boundary: report the failure as a zero score, but
            # keep the traceback available at debug level.
            logger.debug("Error in month match evaluation: %s", e, exc_info=True)
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[f"Month match evaluation error: {str(e)}"],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="month",
            )

    def _normalize_month(self, value) -> int:
        """
        Normalize a value to a month number (1-12) for comparison.
        Handles int, str, and month names like "January", "Jan", "1", etc.

        Args:
            value: The value to normalize (can be int, str, month name)

        Returns:
            int: The normalized month number (1-12)

        Raises:
            ValueError: If the value cannot be converted to a valid month,
                including integers outside the 1-12 range
        """
        try:
            # Handle None
            if value is None:
                raise ValueError("Cannot convert None to month")

            # Convert to string and normalize
            str_value = str(value).strip().lower()

            # Handle empty strings
            if not str_value:
                raise ValueError("Cannot convert empty string to month")

            # Try parsing as number first. Only the int() conversion is
            # guarded, so the range error below is not swallowed and reaches
            # the caller with its specific message.
            try:
                month_num = int(str_value)
            except ValueError:
                month_num = None

            if month_num is not None:
                if 1 <= month_num <= 12:
                    return month_num
                raise ValueError(
                    f"Month number must be between 1-12, got {month_num}"
                )

            # Try parsing as month name
            month_names = MonthMatchEvaluator._MONTH_NAMES
            if str_value in month_names:
                return month_names[str_value]

            raise ValueError(f"Cannot parse '{value}' as month")

        except (ValueError, TypeError) as e:
            # Re-raise with the offending value; chain the cause explicitly.
            raise ValueError(f"Cannot convert '{value}' to month: {str(e)}") from e

    def can_evaluate(self, type: str) -> bool:
        """Check if this evaluator can handle the given type."""
        return type == "month"

    @property
    def evaluator_type(self) -> str:
        """Return the type identifier for this evaluator."""
        return "month"


class DurationMatchEvaluator(DataEvaluator):
    """Evaluator for duration type validation."""

    async def evaluate(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        ordered: bool = False,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """
        Score a task response by comparing its durations with the expected list.

        ``eval_func.expected_data`` must be a non-empty list; otherwise a
        zero-score result describing the problem is returned. The expected
        values and ``task_result.response.results`` are handed to
        ``ArrayComparisonHelper.evaluate_array_comparison`` with
        ``_normalize_duration`` as the normalizer and ``_durations_equal`` as
        the comparison function. Any exception raised along the way is logged
        at debug level and reported as a zero-score result.

        Args:
            task: Task under evaluation; supplies ``task_id`` and ``intent``.
            task_result: Response whose ``response.results`` are compared.
            resource: Not used by this evaluator.
            eval_func: Supplies ``expected_data``.
            ordered: Forwarded to the array comparison helper.
            **kwargs: Ignored.

        Returns:
            WebarenaTaskEvalResult: The evaluation outcome.
        """

        def _zero_score(message: str) -> WebarenaTaskEvalResult:
            # Every failure path reports the same task metadata with score 0.
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[message],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="duration",
            )

        try:
            expected = eval_func.expected_data

            # Guard: only list-shaped expectations are compared.
            if not isinstance(expected, list):
                return _zero_score(
                    "Duration match evaluation requires expected_data to be a list, got: "
                    + str(type(expected))
                )

            # Guard: an empty expectation list is reported as a failure.
            if not expected:
                return _zero_score("No expected values provided for duration matching")

            # Falsy results (None, empty) are treated as an empty list.
            observed = task_result.response.results or []

            return ArrayComparisonHelper.evaluate_array_comparison(
                expected,
                observed,
                task,
                "duration",
                self._normalize_duration,
                ordered=ordered,
                comparison_func=self._durations_equal,
            )
        except Exception as e:
            logger.debug(f"Error in duration match evaluation: {e}")
            return _zero_score(f"Duration match evaluation error: {str(e)}")

    def _normalize_duration(self, value) -> float:
        """
        Normalize a value to duration in seconds for comparison.
        Handles various formats like "5 minutes", "1 hour", "30s", "2.5", etc.

        Args:
            value: The value to normalize (can be int, float, str with units)

        Returns:
            float: The normalized duration in seconds

        Raises:
            ValueError: If the value cannot be converted to a valid duration
        """
        try:
            # Handle None
            if value is None:
                raise ValueError("Cannot convert None to duration")

            # Convert to string and normalize
            str_value = str(value).strip().lower()

            # Handle empty strings
            if not str_value:
                raise ValueError("Cannot convert empty string to duration")

            # Try to extract number and unit using regex
            match = re.match(
                r"(\d+(?:\.\d+)?)\s*(second|minute|hour|day|week|month|year|sec|min|hr|s|m|h|d|w|y)s?",
                str_value,
            )

            if match:
                num = float(match.group(1))
                unit = match.group(2)

                multipliers = {
                    "second": 1,
                    "sec": 1,
                    "s": 1,
                    "minute": 60,
                    "min": 60,
                    "m": 60,
                    "hour": 3600,
                    "hr": 3600,
                    "h": 3600,
                    "day": 86400,
                    "d": 86400,
                    "week": 604800,
                    "w": 604800,
                    "month": 2592000,  # 30 days
                    "year": 31536000,
                    "y": 31536000,  # 365 days
                }

                return num * multipliers.get(unit, 1)

            # Try parsing as just a number (assume seconds)
            try:
                return float(str_value)
            except ValueError:
                pass

            raise ValueError(f"Cannot parse '{value}' as duration")

        except (ValueError, TypeError) as e:
            raise ValueError(f"Cannot convert '{value}' to duration: {str(e)}")

    def _durations_equal(
        self, duration1: float, duration2: float, tolerance: float = 60.0
    ) -> bool:
        """
        Check if two durations are equal within tolerance.

        Args:
            duration1: First duration in seconds
            duration2: Second duration in seconds
            tolerance: Tolerance in seconds (default 60 seconds = 1 minute)

        Returns:
            bool: True if durations are equal within tolerance
        """
        return abs(duration1 - duration2) <= tolerance

    def can_evaluate(self, type: str) -> bool:
        """Return True only when ``type`` is exactly the string "duration"."""
        handled_type = "duration"
        return type == handled_type

    @property
    def evaluator_type(self) -> str:
        """Type identifier reported by this evaluator (always "duration")."""
        identifier = "duration"
        return identifier


class DistanceMatchEvaluator(DataEvaluator):
    """Evaluator for distance type validation."""

    async def evaluate(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        ordered: bool = False,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """
        Score a task response by comparing its distances with the expected list.

        ``eval_func.expected_data`` must be a non-empty list; otherwise a
        zero-score result describing the problem is returned. The expected
        values and ``task_result.response.results`` are handed to
        ``ArrayComparisonHelper.evaluate_array_comparison`` with
        ``_normalize_distance`` as the normalizer and ``_distances_equal`` as
        the comparison function. Any exception raised along the way is logged
        at debug level and reported as a zero-score result.

        Args:
            task: Task under evaluation; supplies ``task_id`` and ``intent``.
            task_result: Response whose ``response.results`` are compared.
            resource: Not used by this evaluator.
            eval_func: Supplies ``expected_data``.
            ordered: Forwarded to the array comparison helper.
            **kwargs: Ignored.

        Returns:
            WebarenaTaskEvalResult: The evaluation outcome.
        """

        def _zero_score(message: str) -> WebarenaTaskEvalResult:
            # Every failure path reports the same task metadata with score 0.
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[message],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="distance",
            )

        try:
            expected = eval_func.expected_data

            # Guard: only list-shaped expectations are compared.
            if not isinstance(expected, list):
                return _zero_score(
                    "Distance match evaluation requires expected_data to be a list, got: "
                    + str(type(expected))
                )

            # Guard: an empty expectation list is reported as a failure.
            if not expected:
                return _zero_score("No expected values provided for distance matching")

            # Falsy results (None, empty) are treated as an empty list.
            observed = task_result.response.results or []

            return ArrayComparisonHelper.evaluate_array_comparison(
                expected,
                observed,
                task,
                "distance",
                self._normalize_distance,
                ordered=ordered,
                comparison_func=self._distances_equal,
            )
        except Exception as e:
            logger.debug(f"Error in distance match evaluation: {e}")
            return _zero_score(f"Distance match evaluation error: {str(e)}")

    def _normalize_distance(self, value) -> float:
        """
        Normalize a value to distance in meters for comparison.
        Handles various formats like "5 km", "1 mile", "30m", "2.5 kilometers", "100 feet", etc.

        Args:
            value: The value to normalize (can be int, float, str with units)

        Returns:
            float: The normalized distance in meters

        Raises:
            ValueError: If the value cannot be converted to a valid distance
        """
        try:
            # Handle None
            if value is None:
                raise ValueError("Cannot convert None to distance")

            # Convert to string and normalize
            str_value = str(value).strip().lower()

            # Handle empty strings
            if not str_value:
                raise ValueError("Cannot convert empty string to distance")

            # Try to extract number and unit using regex
            match = re.match(
                r"(\d+(?:\.\d+)?)\s*(kilometer|kilometre|meter|metre|centimeter|centimetre|millimeter|millimetre|mile|yard|foot|feet|inch|inches|km|m|cm|mm|mi|yd|ft|in)s?",
                str_value,
            )

            if match:
                num = float(match.group(1))
                unit = match.group(2)

                # Convert to meters
                multipliers = {
                    "kilometer": 1000,
                    "kilometre": 1000,
                    "km": 1000,
                    "meter": 1,
                    "metre": 1,
                    "m": 1,
                    "centimeter": 0.01,
                    "centimetre": 0.01,
                    "cm": 0.01,
                    "millimeter": 0.001,
                    "millimetre": 0.001,
                    "mm": 0.001,
                    "mile": 1609.344,
                    "mi": 1609.344,
                    "yard": 0.9144,
                    "yd": 0.9144,
                    "foot": 0.3048,
                    "feet": 0.3048,
                    "ft": 0.3048,
                    "inch": 0.0254,
                    "inches": 0.0254,
                    "in": 0.0254,
                }

                return num * multipliers.get(unit, 1)

            # Try parsing as just a number (assume meters)
            try:
                return float(str_value)
            except ValueError:
                pass

            raise ValueError(f"Cannot parse '{value}' as distance")

        except (ValueError, TypeError) as e:
            raise ValueError(f"Cannot convert '{value}' to distance: {str(e)}")

    def _distances_equal(
        self, distance1: float, distance2: float, tolerance: float = 1.0
    ) -> bool:
        """
        Check if two distances are equal within tolerance.

        Args:
            distance1: First distance in meters
            distance2: Second distance in meters
            tolerance: Tolerance in meters (default 1 meter)

        Returns:
            bool: True if distances are equal within tolerance
        """
        return abs(distance1 - distance2) <= tolerance

    def can_evaluate(self, type: str) -> bool:
        """Return True only when ``type`` is exactly the string "distance"."""
        handled_type = "distance"
        return type == handled_type

    @property
    def evaluator_type(self) -> str:
        """Type identifier reported by this evaluator (always "distance")."""
        identifier = "distance"
        return identifier


class DurationRangeEvaluator(DataEvaluator):
    """Evaluator for duration range type validation."""

    async def evaluate(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        ordered: bool = False,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """
        Score a task response by comparing duration ranges with the expected list.

        ``eval_func.expected_data`` must be a non-empty list; otherwise a
        zero-score result describing the problem is returned. The expected
        values and ``task_result.response.results`` are handed to
        ``ArrayComparisonHelper.evaluate_array_comparison`` with
        ``_normalize_duration_range`` as the normalizer and
        ``_duration_ranges_overlap`` as the comparison function. Any exception
        raised along the way is logged at debug level and reported as a
        zero-score result.

        Args:
            task: Task under evaluation; supplies ``task_id`` and ``intent``.
            task_result: Response whose ``response.results`` are compared.
            resource: Not used by this evaluator.
            eval_func: Supplies ``expected_data``.
            ordered: Forwarded to the array comparison helper.
            **kwargs: Ignored.

        Returns:
            WebarenaTaskEvalResult: The evaluation outcome.
        """

        def _zero_score(message: str) -> WebarenaTaskEvalResult:
            # Every failure path reports the same task metadata with score 0.
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[message],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="duration_range",
            )

        try:
            expected = eval_func.expected_data

            # Guard: only list-shaped expectations are compared.
            if not isinstance(expected, list):
                return _zero_score(
                    "Duration range evaluation requires expected_data to be a list, got: "
                    + str(type(expected))
                )

            # Guard: an empty expectation list is reported as a failure.
            if not expected:
                return _zero_score(
                    "No expected values provided for duration range matching"
                )

            # Falsy results (None, empty) are treated as an empty list.
            observed = task_result.response.results or []

            return ArrayComparisonHelper.evaluate_array_comparison(
                expected,
                observed,
                task,
                "duration_range",
                self._normalize_duration_range,
                ordered=ordered,
                comparison_func=self._duration_ranges_overlap,
            )
        except Exception as e:
            logger.debug(f"Error in duration range evaluation: {e}")
            return _zero_score(f"Duration range evaluation error: {str(e)}")

    def _normalize_duration_range(self, value) -> tuple[float, float]:
        """
        Normalize a value to duration range in seconds for comparison.
        Handles various formats like:
        - Range: "5-10 minutes", "1-2 hours"
        - Comparison: "<1 minute", ">5 hours", "≤30 seconds", "≥2 days"
        - Single: "30s", "90"
        Also handles single durations as point ranges.

        Args:
            value: The value to normalize (can be str with range, comparison, or single duration)

        Returns:
            tuple[float, float]: The normalized duration range (min, max) in seconds

        Raises:
            ValueError: If the value cannot be converted to a valid duration range
        """
        try:
            # Handle None
            if value is None:
                raise ValueError("Cannot convert None to duration range")

            # Convert to string and normalize
            str_value = str(value).strip().lower()

            # Handle empty strings
            if not str_value:
                raise ValueError("Cannot convert empty string to duration range")

            # Define multipliers for time units
            multipliers = {
                "second": 1,
                "sec": 1,
                "s": 1,
                "minute": 60,
                "min": 60,
                "m": 60,
                "hour": 3600,
                "hr": 3600,
                "h": 3600,
                "day": 86400,
                "d": 86400,
                "week": 604800,
                "w": 604800,
                "month": 2592000,  # 30 days
                "year": 31536000,
                "y": 31536000,  # 365 days
            }

            # First check for malformed comparison expressions (operator without valid number/unit)
            malformed_comparison = re.match(r"(<=|>=|[<>≤≥])\s*(.*)$", str_value)
            if malformed_comparison:
                operator = malformed_comparison.group(1)
                remainder = malformed_comparison.group(2).strip()

                # If remainder is empty or doesn't start with a number, it's malformed
                if not remainder or not re.match(r"\d", remainder):
                    raise ValueError(
                        f"Malformed comparison expression: '{value}' - operator '{operator}' must be followed by a number and optional time unit"
                    )

            # Check for comparison operators: <, >, ≤, ≥, <=, >=
            comparison_match = re.match(
                r"(<=|>=|[<>≤≥])\s*(\d+(?:\.\d+)?)\s*(second|minute|hour|day|week|month|year|sec|min|hr|s|m|h|d|w|y)s?",
                str_value,
            )

            if comparison_match:
                operator = comparison_match.group(1)
                num = float(comparison_match.group(2))
                unit = comparison_match.group(3)

                seconds = num * multipliers.get(unit, 1)

                # Convert comparison to range
                # For practical purposes, we'll use reasonable bounds
                max_reasonable_duration = 365 * 24 * 3600  # 1 year in seconds

                if operator in ["<", "≤", "<="]:
                    # Less than: range from 0 to the value
                    return (0.0, seconds)
                elif operator in [">", "≥", ">="]:
                    # Greater than: range from the value to a reasonable maximum
                    return (seconds, max_reasonable_duration)

            # Check for range patterns like "5-10 minutes", "1-2 hours"
            range_match = re.match(
                r"(\d+(?:\.\d+)?)\s*[-–—]\s*(\d+(?:\.\d+)?)\s*(second|minute|hour|day|week|month|year|sec|min|hr|s|m|h|d|w|y)s?",
                str_value,
            )

            if range_match:
                min_val = float(range_match.group(1))
                max_val = float(range_match.group(2))
                unit = range_match.group(3)

                multiplier = multipliers.get(unit, 1)
                return (min_val * multiplier, max_val * multiplier)

            # Check for single duration with unit
            single_match = re.match(
                r"(\d+(?:\.\d+)?)\s*(second|minute|hour|day|week|month|year|sec|min|hr|s|m|h|d|w|y)s?",
                str_value,
            )

            if single_match:
                num = float(single_match.group(1))
                unit = single_match.group(2)

                seconds = num * multipliers.get(unit, 1)
                # Single duration becomes a point range
                return (seconds, seconds)

            # Try parsing as just a number (assume seconds)
            try:
                seconds = float(str_value)
                return (seconds, seconds)
            except ValueError:
                pass

            # Check for numeric range without units like "5-10"
            numeric_range_match = re.match(
                r"(\d+(?:\.\d+)?)\s*[-–—]\s*(\d+(?:\.\d+)?)", str_value
            )
            if numeric_range_match:
                min_val = float(numeric_range_match.group(1))
                max_val = float(numeric_range_match.group(2))
                return (min_val, max_val)

            # Check for numeric comparison without units like "<5", ">=10"
            numeric_comparison_match = re.match(
                r"(<=|>=|[<>≤≥])\s*(\d+(?:\.\d+)?)", str_value
            )
            if numeric_comparison_match:
                operator = numeric_comparison_match.group(1)
                num = float(numeric_comparison_match.group(2))

                max_reasonable_duration = 365 * 24 * 3600  # 1 year in seconds

                if operator in ["<", "≤", "<="]:
                    return (0.0, num)
                elif operator in [">", "≥", ">="]:
                    return (num, max_reasonable_duration)

            raise ValueError(f"Cannot parse '{value}' as duration range")

        except (ValueError, TypeError) as e:
            raise ValueError(f"Cannot convert '{value}' to duration range: {str(e)}")

    def _duration_ranges_overlap(
        self,
        range1: tuple[float, float],
        range2: tuple[float, float],
        tolerance: float = 60.0,
    ) -> bool:
        """
        Check if two duration ranges overlap or are within tolerance.

        Args:
            range1: First duration range (min, max) in seconds
            range2: Second duration range (min, max) in seconds
            tolerance: Tolerance in seconds (default 60 seconds = 1 minute)

        Returns:
            bool: True if ranges overlap or are within tolerance
        """
        min1, max1 = range1
        min2, max2 = range2

        # Check for overlap or near overlap within tolerance
        return not (max1 + tolerance < min2 or max2 + tolerance < min1)

    def can_evaluate(self, type: str) -> bool:
        """Return True only when ``type`` is exactly the string "duration_range"."""
        handled_type = "duration_range"
        return type == handled_type

    @property
    def evaluator_type(self) -> str:
        """Type identifier reported by this evaluator (always "duration_range")."""
        identifier = "duration_range"
        return identifier


class DateMatchEvaluator(DataEvaluator):
    """Evaluator for date type validation."""

    async def evaluate(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        ordered: bool = False,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """
        Score a task response by comparing its dates with the expected list.

        ``eval_func.expected_data`` must be a non-empty list; otherwise a
        zero-score result describing the problem is returned. The expected
        values and ``task_result.response.results`` are handed to
        ``ArrayComparisonHelper.evaluate_array_comparison`` with
        ``_normalize_date`` as the normalizer; no custom comparison function
        is supplied. Any exception raised along the way is logged at debug
        level and reported as a zero-score result.

        Args:
            task: Task under evaluation; supplies ``task_id`` and ``intent``.
            task_result: Response whose ``response.results`` are compared.
            resource: Not used by this evaluator.
            eval_func: Supplies ``expected_data``.
            ordered: Forwarded to the array comparison helper.
            **kwargs: Ignored.

        Returns:
            WebarenaTaskEvalResult: The evaluation outcome.
        """

        def _zero_score(message: str) -> WebarenaTaskEvalResult:
            # Every failure path reports the same task metadata with score 0.
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[message],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="date",
            )

        try:
            expected = eval_func.expected_data

            # Guard: only list-shaped expectations are compared.
            if not isinstance(expected, list):
                return _zero_score(
                    "Date match evaluation requires expected_data to be a list, got: "
                    + str(type(expected))
                )

            # Guard: an empty expectation list is reported as a failure.
            if not expected:
                return _zero_score("No expected values provided for date matching")

            # Falsy results (None, empty) are treated as an empty list.
            observed = task_result.response.results or []

            return ArrayComparisonHelper.evaluate_array_comparison(
                expected,
                observed,
                task,
                "date",
                self._normalize_date,
                ordered=ordered,
            )
        except Exception as e:
            logger.debug(f"Error in date match evaluation: {e}")
            return _zero_score(f"Date match evaluation error: {str(e)}")

    def _normalize_date(self, value) -> str:
        """
        Normalize a value to ISO date format (YYYY-MM-DD) for comparison.
        Handles various date formats like "2023-06-12", "6/12/2023", "June 12, 2023", "June 12th, 2023", etc.

        Args:
            value: The value to normalize (can be str in various date formats)

        Returns:
            str: The normalized date in ISO format (YYYY-MM-DD)

        Raises:
            ValueError: If the value cannot be converted to a valid date
        """
        try:
            # Handle None
            if value is None:
                raise ValueError("Cannot convert None to date")

            # Convert to string and normalize
            str_value = str(value).strip()

            # Handle empty strings
            if not str_value:
                raise ValueError("Cannot convert empty string to date")

            import re

            # Remove ordinal suffixes (st, nd, rd, th) from day numbers
            str_value = re.sub(r"\b(\d{1,2})(st|nd|rd|th)\b", r"\1", str_value)

            from datetime import datetime

            # Common date formats to try
            formats = [
                "%Y-%m-%d",  # 2023-06-12
                "%m/%d/%Y",  # 6/12/2023
                "%d/%m/%Y",  # 12/6/2023
                "%B %d, %Y",  # June 12, 2023
                "%b %d, %Y",  # Jun 12, 2023
                "%B %d %Y",  # June 12 2023 (no comma)
                "%b %d %Y",  # Jun 12 2023 (no comma)
                "%d %B %Y",  # 12 June 2023
                "%d %b %Y",  # 12 Jun 2023
                "%Y/%m/%d",  # 2023/06/12
                "%Y.%m.%d",  # 2023.06.12
                "%m-%d-%Y",  # 06-12-2023
                "%d-%m-%Y",  # 12-06-2023
                "%m/%d/%y",  # 6/12/23 (two-digit year)
                "%d/%m/%y",  # 12/6/23 (two-digit year)
                "%Y%m%d",  # 20230612
            ]

            for fmt in formats:
                try:
                    parsed_date = datetime.strptime(str_value, fmt)
                    return parsed_date.strftime("%Y-%m-%d")
                except ValueError:
                    continue

            raise ValueError(f"Cannot parse '{value}' as date")

        except (ValueError, TypeError) as e:
            raise ValueError(f"Cannot convert '{value}' to date: {str(e)}")

    def can_evaluate(self, type: str) -> bool:
        """Return True only when ``type`` is exactly the string "date"."""
        handled_type = "date"
        return type == handled_type

    @property
    def evaluator_type(self) -> str:
        """Type identifier reported by this evaluator (always "date")."""
        identifier = "date"
        return identifier


class CoordinatesEvaluator(DataEvaluator):
    """Evaluator for coordinates type validation."""

    async def evaluate(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        ordered: bool = False,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """
        Score a task response by comparing its coordinates with the expected list.

        ``eval_func.expected_data`` must be a non-empty list; otherwise a
        zero-score result describing the problem is returned. The expected
        values and ``task_result.response.results`` are handed to
        ``ArrayComparisonHelper.evaluate_array_comparison`` with
        ``_normalize_coordinates`` as the normalizer and
        ``_coordinates_equal`` as the comparison function. Any exception
        raised along the way is logged at debug level and reported as a
        zero-score result.

        Args:
            task: Task under evaluation; supplies ``task_id`` and ``intent``.
            task_result: Response whose ``response.results`` are compared.
            resource: Not used by this evaluator.
            eval_func: Supplies ``expected_data``.
            ordered: Forwarded to the array comparison helper.
            **kwargs: Ignored.

        Returns:
            WebarenaTaskEvalResult: The evaluation outcome.
        """

        def _zero_score(message: str) -> WebarenaTaskEvalResult:
            # Every failure path reports the same task metadata with score 0.
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[message],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="coordinates",
            )

        try:
            expected = eval_func.expected_data

            # Guard: only list-shaped expectations are compared.
            if not isinstance(expected, list):
                return _zero_score(
                    "Coordinates match evaluation requires expected_data to be a list, got: "
                    + str(type(expected))
                )

            # Guard: an empty expectation list is reported as a failure.
            if not expected:
                return _zero_score(
                    "No expected values provided for coordinates matching"
                )

            # Falsy results (None, empty) are treated as an empty list.
            observed = task_result.response.results or []

            return ArrayComparisonHelper.evaluate_array_comparison(
                expected,
                observed,
                task,
                "coordinates",
                self._normalize_coordinates,
                ordered=ordered,
                comparison_func=self._coordinates_equal,
            )
        except Exception as e:
            logger.debug(f"Error in coordinates match evaluation: {e}")
            return _zero_score(f"Coordinates match evaluation error: {str(e)}")

    def _normalize_coordinates(self, value) -> tuple:
        """
        Normalize a value to a coordinate tuple (x, y) for comparison.
        Handles various coordinate formats like [x, y], (x, y), "x,y", {"x": x, "y": y}, etc.

        Args:
            value: The value to normalize (can be list, tuple, string, or dict)

        Returns:
            tuple: The normalized coordinates as (x, y) tuple

        Raises:
            ValueError: If the value cannot be converted to valid coordinates
        """
        try:
            # Handle None
            if value is None:
                raise ValueError("Cannot convert None to coordinates")

            # Handle list or tuple
            if isinstance(value, (list, tuple)):
                if len(value) != 2:
                    raise ValueError(
                        f"Coordinates must have exactly 2 values, got {len(value)}"
                    )
                return (float(value[0]), float(value[1]))

            # Handle dictionary
            if isinstance(value, dict):
                if "x" in value and "y" in value:
                    return (float(value["x"]), float(value["y"]))
                elif "lat" in value and "lng" in value:
                    return (float(value["lat"]), float(value["lng"]))
                elif "latitude" in value and "longitude" in value:
                    return (float(value["latitude"]), float(value["longitude"]))
                else:
                    raise ValueError(
                        "Dictionary must contain x,y or lat,lng or latitude,longitude keys"
                    )

            # Handle string
            if isinstance(value, str):
                str_value = value.strip()
                if not str_value:
                    raise ValueError("Cannot convert empty string to coordinates")

                # Remove parentheses or brackets if present
                str_value = str_value.strip("()[]")

                # Split by comma
                parts = [part.strip() for part in str_value.split(",")]
                if len(parts) != 2:
                    raise ValueError(
                        f"String coordinates must have exactly 2 comma-separated values, got {len(parts)}"
                    )

                return (float(parts[0]), float(parts[1]))

            # Try to convert directly if it's a numeric type
            raise ValueError(f"Cannot convert {type(value)} to coordinates")

        except (ValueError, TypeError, KeyError) as e:
            raise ValueError(f"Cannot convert '{value}' to coordinates: {str(e)}")

    def _coordinates_equal(
        self, coord1: tuple, coord2: tuple, tolerance: float = 1e-6
    ) -> bool:
        """
        Check if two coordinate tuples are equal within a small tolerance.

        Args:
            coord1: First coordinate tuple (x, y)
            coord2: Second coordinate tuple (x, y)
            tolerance: Tolerance for floating point comparison

        Returns:
            bool: True if coordinates are equal within tolerance
        """
        return (
            abs(coord1[0] - coord2[0]) <= tolerance
            and abs(coord1[1] - coord2[1]) <= tolerance
        )

    def can_evaluate(self, type: str) -> bool:
        """Return True only when ``type`` is exactly the string "coordinates"."""
        handled_type = "coordinates"
        return type == handled_type

    @property
    def evaluator_type(self) -> str:
        """Type identifier reported by this evaluator (always "coordinates")."""
        identifier = "coordinates"
        return identifier


class ObjectEvaluator(DataEvaluator):
    """Evaluator for object type validation with typed field matching."""

    async def evaluate(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        ordered: bool = False,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """
        Score a task response against a list of typed object specifications.

        Args:
            task: Task being evaluated; supplies ``task_id`` and ``intent``
            task_result: Agent response; ``response.results`` holds the actual objects
            resource: Not used by this evaluator
            eval_func: Evaluation spec; ``expected_data`` must be a list of object specs
            ordered: When True objects are compared position by position,
                otherwise order is ignored
            **kwargs: Not used by this evaluator

        Returns:
            WebarenaTaskEvalResult: Result of the ordered/unordered comparison,
            or a score of 0.0 with an explanatory message when the expected data
            is malformed or an unexpected exception occurs
        """

        def _single_message_result(score: float, message: str) -> WebarenaTaskEvalResult:
            # Every early exit of this method reports exactly one message.
            return WebarenaTaskEvalResult(
                score=score,
                assertion_msgs=[message],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="object",
            )

        try:
            spec_list = eval_func.expected_data

            if not isinstance(spec_list, list):
                return _single_message_result(
                    0.0,
                    "Object match evaluation requires expected_data to be a list, got: "
                    + str(type(spec_list)),
                )

            if not spec_list:
                # Nothing expected: only an empty response counts as success
                if not task_result.response.results:
                    return _single_message_result(
                        1.0, "Expected no results got no results"
                    )
                return _single_message_result(
                    0.0, "No expected objects provided for object matching"
                )

            produced = task_result.response.results or []

            # Reject malformed specs before any comparison is attempted
            try:
                self._validate_expected_objects(spec_list)
            except ValueError as e:
                return _single_message_result(
                    0.0, f"Invalid expected object format: {str(e)}"
                )

            compare = (
                self._compare_ordered_objects
                if ordered
                else self._compare_unordered_objects
            )
            return await compare(spec_list, produced, task)

        except Exception as e:
            logger.debug(f"Error in object evaluation: {e}")
            return _single_message_result(0.0, f"Object evaluation error: {str(e)}")

    async def _compare_ordered_objects(
        self,
        expected_data: list,
        actual_results: list,
        task: WebArenaTask,
    ) -> WebarenaTaskEvalResult:
        """
        Compare objects position by position.

        The i-th expected spec is validated against the i-th actual object for
        every index present in both lists; any surplus on either side is
        reported as missing or extra objects.

        Args:
            expected_data: Expected object specifications
            actual_results: Objects returned in the task response
            task: Task being evaluated; supplies ``task_id`` and ``intent``

        Returns:
            WebarenaTaskEvalResult: Score 1.0 when no problem was recorded,
            otherwise 0.0 with every recorded problem as a message
        """
        problems = []
        n_expected = len(expected_data)
        n_actual = len(actual_results)

        # Pairwise validation over the indices both lists share
        for i in range(min(n_expected, n_actual)):
            actual_result = actual_results[i]
            if isinstance(actual_result, dict):
                problems.extend(
                    await self._validate_object_match(
                        expected_data[i], actual_result, i
                    )
                )
            else:
                problems.append(
                    f"Object {i}: Expected dict-like object, got {type(actual_result)}"
                )

        if n_expected > n_actual:
            # Expected specs with no counterpart: show their plain expected values
            missing_objects = [
                {
                    field_name: field_spec["value"]
                    for field_name, field_spec in expected_data[idx].items()
                    if isinstance(field_spec, dict) and "value" in field_spec
                }
                for idx in range(n_actual, n_expected)
            ]
            missing_count = n_expected - n_actual
            problems.append(f"Missing {missing_count} expected objects:")
            for i, missing_obj in enumerate(missing_objects):
                problems.append(
                    f"  Missing object {len(actual_results) + i}: {missing_obj}"
                )
        elif n_actual > n_expected:
            # Actual objects beyond the expected length
            extra_count = n_actual - n_expected
            problems.append(f"Extra {extra_count} actual objects:")
            for i in range(n_expected, n_actual):
                problems.append(f"  Extra object {i}: {actual_results[i]}")

        succeeded = not problems
        return WebarenaTaskEvalResult(
            score=1.0 if succeeded else 0.0,
            assertion_msgs=(
                ["All objects matched successfully"] if succeeded else problems
            ),
            task_id=task.task_id,
            task_description=task.intent,
            task_type="object",
        )

    async def _compare_unordered_objects(
        self,
        expected_data: list,
        actual_results: list,
        task: WebArenaTask,
    ) -> WebarenaTaskEvalResult:
        """
        Compare objects without order using a one-to-one assignment.

        Every expected object must be paired with a *distinct* actual object
        whose fields all validate against the expected field specs. An actual
        object can therefore satisfy at most one expected object, so duplicate
        expected objects need duplicate actual objects. Actual objects left
        without a partner are reported as extra, whatever their list position.

        Args:
            expected_data: Expected object specifications
            actual_results: Objects returned in the task response
            task: Task being evaluated; supplies ``task_id`` and ``intent``

        Returns:
            WebarenaTaskEvalResult: Score 1.0 when every expected object is
            paired and no actual object is left over, otherwise 0.0 with one
            message per missing/extra object
        """

        async def _satisfies(expected_obj: dict, actual_obj: Any) -> bool:
            """Return True when every typed field spec validates against actual_obj."""
            if not isinstance(actual_obj, dict):
                return False
            for field_name, field_spec in expected_obj.items():
                # Field specs lacking 'value'/'type' are ignored while matching
                if (
                    not isinstance(field_spec, dict)
                    or "value" not in field_spec
                    or "type" not in field_spec
                ):
                    continue
                field_error = await self._validate_field(
                    field_name,
                    field_spec["value"],
                    actual_obj.get(field_name),
                    field_spec["type"],
                    0,
                )
                if field_error is not None:
                    return False
            return True

        # candidates[e] lists the actual indices that satisfy expected object e
        candidates: list[list[int]] = []
        for expected_obj in expected_data:
            row = []
            for act_idx, actual_obj in enumerate(actual_results):
                if await _satisfies(expected_obj, actual_obj):
                    row.append(act_idx)
            candidates.append(row)

        # Maximum bipartite matching via augmenting paths. A plain first-fit
        # pass could hand an actual object to a permissive expected spec and
        # starve a stricter one that had no other candidate.
        assigned_to: dict[int, int] = {}  # actual index -> expected index

        def _try_assign(exp_idx: int, visited: set) -> bool:
            for act_idx in candidates[exp_idx]:
                if act_idx in visited:
                    continue
                visited.add(act_idx)
                if act_idx not in assigned_to or _try_assign(
                    assigned_to[act_idx], visited
                ):
                    assigned_to[act_idx] = exp_idx
                    return True
            return False

        validation_errors = []
        for exp_idx, expected_obj in enumerate(expected_data):
            if not _try_assign(exp_idx, set()):
                validation_errors.append(f"Missing expected object: {expected_obj}")

        # Extra objects are the actual objects nobody was paired with
        unmatched_actual = [
            act_idx for act_idx in range(len(actual_results)) if act_idx not in assigned_to
        ]
        if unmatched_actual:
            validation_errors.append(f"Extra {len(unmatched_actual)} actual objects:")
            for act_idx in unmatched_actual:
                validation_errors.append(f"  Extra object: {actual_results[act_idx]}")

        if validation_errors:
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=validation_errors,
                task_id=task.task_id,
                task_description=task.intent,
                task_type="object",
            )

        return WebarenaTaskEvalResult(
            score=1.0,
            assertion_msgs=["All objects matched successfully"],
            task_id=task.task_id,
            task_description=task.intent,
            task_type="object",
        )

    async def _find_best_object_matches(
        self, expected_data: list, actual_results: list
    ) -> tuple[list[tuple[int, int]], list[int], list[int]]:
        """
        Find matches between expected and actual objects using simple matching.

        Returns:
            tuple: (matches, unmatched_expected_indices, unmatched_actual_indices)
                matches: List of (expected_idx, actual_idx) pairs
                unmatched_expected_indices: Expected objects with no matches
                unmatched_actual_indices: Actual objects with no matches
        """
        matches = []
        used_actual = set()
        unmatched_expected = []

        # For each expected object, try to find any matching actual object
        for exp_idx, expected_obj in enumerate(expected_data):
            found_match = False

            for act_idx, actual_obj in enumerate(actual_results):
                # Skip if this actual object is already matched
                if act_idx in used_actual:
                    continue

                # Check if this actual object matches the expected object
                if isinstance(actual_obj, dict):
                    # Check if all fields match
                    all_fields_match = True
                    for field_name, field_spec in expected_obj.items():
                        if (
                            not isinstance(field_spec, dict)
                            or "value" not in field_spec
                            or "type" not in field_spec
                        ):
                            continue

                        expected_value = field_spec["value"]
                        field_type = field_spec["type"]
                        actual_value = actual_obj.get(field_name)

                        # Check if this field matches
                        field_error = await self._validate_field(
                            field_name, expected_value, actual_value, field_type, 0
                        )

                        if field_error is not None:  # Field doesn't match
                            all_fields_match = False
                            break

                    # If all fields match, we found our match
                    if all_fields_match:
                        matches.append((exp_idx, act_idx))
                        used_actual.add(act_idx)
                        found_match = True
                        break

            # If no match was found, add to unmatched expected
            if not found_match:
                unmatched_expected.append(exp_idx)

        # Find unmatched actual objects (those not used in matches)
        unmatched_actual = [
            i for i in range(len(actual_results)) if i not in used_actual
        ]

        return matches, unmatched_expected, unmatched_actual

    async def _validate_object_match(
        self, expected_obj: dict, actual_obj: dict, object_identifier
    ) -> list[str]:
        """
        Validate that an actual object matches an expected object specification.

        Returns:
            list[str]: List of validation error messages (empty if all fields match)
        """
        validation_errors = []
        has_field_errors = False

        # Validate each field in the expected object
        for field_name, field_spec in expected_obj.items():
            if (
                not isinstance(field_spec, dict)
                or "value" not in field_spec
                or "type" not in field_spec
            ):
                validation_errors.append(
                    f"Object {object_identifier}, field '{field_name}': Invalid field specification, must have 'value' and 'type' keys"
                )
                continue

            expected_value = field_spec["value"]
            field_type = field_spec["type"]
            actual_value = actual_obj.get(field_name)

            # Validate this field
            field_error = await self._validate_field(
                field_name, expected_value, actual_value, field_type, object_identifier
            )

            if field_error:
                has_field_errors = True

        # If there are any field errors, generate a comprehensive comparison
        if has_field_errors:
            # Generate expected object from specs
            expected_object_values = {}
            for field_name, field_spec in expected_obj.items():
                if isinstance(field_spec, dict) and "value" in field_spec:
                    expected_object_values[field_name] = field_spec["value"]

            validation_errors.append(f"Object {object_identifier} validation failed:")
            validation_errors.append(f"  Expected object: {expected_object_values}")
            validation_errors.append(f"  Actual object: {actual_obj}")

        return validation_errors

    def _validate_expected_objects(self, expected_data: list) -> None:
        """
        Validate that expected_data contains properly formatted object specifications.

        Args:
            expected_data: List of expected object specifications

        Raises:
            ValueError: If the format is invalid
        """
        for i, obj in enumerate(expected_data):
            if not isinstance(obj, dict):
                raise ValueError(f"Object {i}: Expected dict, got {type(obj)}")

            if not obj:
                raise ValueError(f"Object {i}: Empty object specification")

            for field_name, field_spec in obj.items():
                if not isinstance(field_spec, dict):
                    raise ValueError(
                        f"Object {i}, field '{field_name}': Expected dict specification, got {type(field_spec)}"
                    )

                if "value" not in field_spec:
                    raise ValueError(
                        f"Object {i}, field '{field_name}': Missing 'value' key"
                    )

                if "type" not in field_spec:
                    raise ValueError(
                        f"Object {i}, field '{field_name}': Missing 'type' key"
                    )

                field_type = field_spec["type"]
                if not isinstance(field_type, str):
                    raise ValueError(
                        f"Object {i}, field '{field_name}': 'type' must be a string, got {type(field_type)}"
                    )

    async def _validate_field(
        self,
        field_name: str,
        expected_value,
        actual_value,
        field_type: str,
        object_index: int,
    ) -> Optional[str]:
        """
        Validate a single field using the evaluator registered for its type.

        Both values are normalized with the evaluator's type-specific
        normalizer and then compared, either with ``!=`` or with the
        evaluator's own equality helper where the table below names one.

        Args:
            field_name: Name of the field being validated
            expected_value: Expected value for the field
            actual_value: Actual value from the result
            field_type: Type of validation to perform
            object_index: Index (or other label) of the object being validated,
                used only in messages

        Returns:
            Optional[str]: Error message if validation fails, None if successful
        """
        # field type -> (normalizer method name, equality method name).
        # An equality name of None means normalized values are compared with ``!=``.
        comparison_methods = {
            "text": ("_normalize_text", None),
            "numeric": ("_normalize_number", None),
            "currency": ("_normalize_currency", None),
            "month": ("_normalize_month", None),
            "duration": ("_normalize_duration", "_durations_equal"),
            "durationrange": ("_normalize_durationrange", "_durationranges_equal"),
            "distance": ("_normalize_distance", "_distances_equal"),
            "date": ("_normalize_date", None),
            # NOTE(review): TimeMatchEvaluator.evaluate compares with
            # _times_equal; confirm whether strict equality is intended here.
            "time": ("_normalize_time", None),
            # Compared with _websites_match (its default mode ignores protocol
            # and port) so a website field is judged the same way
            # WebsiteEvaluator.evaluate judges a top-level website value.
            "website": ("_normalize_website", "_websites_match"),
            "coordinates": ("_normalize_coordinates", "_coordinates_equal"),
        }

        try:
            # Get the appropriate evaluator for the field type
            evaluator = data_evaluator_registry.get_evaluator(field_type)
            if not evaluator:
                return f"Object {object_index}, field '{field_name}': No evaluator found for type '{field_type}'"

            methods = comparison_methods.get(field_type)
            if methods is None:
                return f"Object {object_index}, field '{field_name}': Unsupported field type '{field_type}'"
            normalizer_name, equality_name = methods

            try:
                normalize = getattr(evaluator, normalizer_name)
                if field_type == "text":
                    # Object fields are always compared case-insensitively
                    case_sensitive = False
                    expected_normalized = normalize(expected_value, case_sensitive)
                    actual_normalized = normalize(actual_value, case_sensitive)
                else:
                    expected_normalized = normalize(expected_value)
                    actual_normalized = normalize(actual_value)

                if equality_name is None:
                    mismatch = expected_normalized != actual_normalized
                else:
                    values_equal = getattr(evaluator, equality_name)
                    mismatch = not values_equal(expected_normalized, actual_normalized)
            except ValueError as e:
                # Normalizers signal unparseable values with ValueError
                return f"Object {object_index}, field '{field_name}': {str(e)}"

            if not mismatch:
                return None  # Validation successful

            if field_type == "text":
                return f"Object {object_index}, field '{field_name}': Expected value: '{expected_value}', Actual value: '{actual_value}'"
            return f"Object {object_index}, field '{field_name}': Expected value: {expected_value}, Actual value: {actual_value}"

        except Exception as e:
            # Anything unexpected (e.g. a missing helper on the evaluator) is
            # reported as a failed field instead of aborting the evaluation
            return f"Object {object_index}, field '{field_name}': Validation error: {str(e)}"

    def can_evaluate(self, type: str) -> bool:
        """Check if this evaluator can handle the given type."""
        return type == "object"

    @property
    def evaluator_type(self) -> str:
        """Return the type identifier for this evaluator."""
        return "object"


class BooleanMatchEvaluator(DataEvaluator):
    """Evaluator for boolean type validation."""

    async def evaluate(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        ordered: bool = False,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """
        Score a task response against a single expected boolean value.

        ``eval_func.expected_data`` must be a list holding exactly one value.
        The comparison itself is delegated to
        ``ArrayComparisonHelper.evaluate_array_comparison`` with
        ``_normalize_boolean`` as the normalizer.

        Args:
            task: Task being evaluated; supplies ``task_id`` and ``intent``
            task_result: Agent response; ``response.results`` holds the actual values
            resource: Not used by this evaluator
            eval_func: Evaluation spec carrying ``expected_data``
            ordered: Forwarded to the array comparison
            **kwargs: Not used by this evaluator

        Returns:
            WebarenaTaskEvalResult: Result of the array comparison, or a score
            of 0.0 with an explanatory message when the expected data is
            malformed or an unexpected exception occurs
        """

        def _failure(message: str) -> WebarenaTaskEvalResult:
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[message],
                task_id=task.task_id,
                task_description=task.intent,
                task_type=self.evaluator_type,
            )

        try:
            expected_data = eval_func.expected_data

            if not isinstance(expected_data, list):
                return _failure(
                    "Boolean match evaluation requires expected_data to be a list of size one, got: "
                    + str(type(expected_data))
                )

            # Checked before the size test so an empty list gets its own message
            if not expected_data:
                return _failure("No expected values provided for boolean matching")

            if len(expected_data) != 1:
                # Report the size: the type alone ("<class 'list'>") says nothing
                # about what is wrong with a list of the wrong length
                return _failure(
                    "Boolean match evaluation requires expected_data to be a list of size one, got: "
                    f"list of size {len(expected_data)}"
                )

            # Get actual results from task_result
            actual_results = (
                task_result.response.results if task_result.response.results else []
            )

            # Use ArrayComparisonHelper to compare arrays
            return ArrayComparisonHelper.evaluate_array_comparison(
                expected_data,
                actual_results,
                task,
                self.evaluator_type,
                self._normalize_boolean,
                ordered=ordered,
            )

        except Exception as e:
            logger.debug(f"Error in boolean match evaluation: {e}")
            return _failure(f"Boolean match evaluation error: {str(e)}")

    def _normalize_boolean(self, value) -> bool:
        """
        Normalize a value to a boolean for comparison.
        Handles boolean values, and strings like "true", "false", "yes", "no".

        Args:
            value: The value to normalize.

        Returns:
            bool: The normalized boolean value.

        Raises:
            ValueError: If the value cannot be converted to a boolean.
        """
        if isinstance(value, bool):
            return value

        if isinstance(value, str):
            val_lower = value.strip().lower()
            if val_lower in ["true", "yes", "1"]:
                return True
            if val_lower in ["false", "no", "0"]:
                return False

        raise ValueError(f"Cannot convert '{value}' to boolean.")

    def can_evaluate(self, type: str) -> bool:
        """Check if this evaluator can handle the given type."""
        return type == "boolean"

    @property
    def evaluator_type(self) -> str:
        """Return the type identifier for this evaluator."""
        return "boolean"


class WebsiteEvaluator(DataEvaluator):
    """Evaluator for website/URL type validation."""

    async def evaluate(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        ordered: bool = False,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """
        Score a task response against a list of expected website URLs.

        Values are normalized with ``_normalize_website`` and compared with
        ``_websites_match`` through
        ``ArrayComparisonHelper.evaluate_array_comparison``.

        Args:
            task: Task being evaluated; supplies ``task_id`` and ``intent``
            task_result: Agent response; ``response.results`` holds the actual values
            resource: Not used by this evaluator
            eval_func: Evaluation spec; ``expected_data`` must be a non-empty list
            ordered: Forwarded to the array comparison
            **kwargs: Not used by this evaluator

        Returns:
            WebarenaTaskEvalResult: Result of the array comparison, or a score
            of 0.0 with an explanatory message when the expected data is
            malformed or an unexpected exception occurs
        """

        def _failure(message: str) -> WebarenaTaskEvalResult:
            # All early exits of this method are zero-score, single-message results
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[message],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="website",
            )

        try:
            wanted_urls = eval_func.expected_data

            if not isinstance(wanted_urls, list):
                return _failure(
                    "Website evaluation requires expected_data to be a list, got: "
                    + str(type(wanted_urls))
                )

            if not wanted_urls:
                return _failure("No expected values provided for website matching")

            produced_urls = task_result.response.results or []

            return ArrayComparisonHelper.evaluate_array_comparison(
                wanted_urls,
                produced_urls,
                task,
                "website",
                self._normalize_website,
                ordered=ordered,
                comparison_func=self._websites_match,
            )

        except Exception as e:
            logger.debug(f"Error in website evaluation: {e}")
            return _failure(f"Website evaluation error: {str(e)}")

    def _normalize_website(self, value) -> str:
        """
        Normalize a website URL to a canonical string for comparison.
        Handles various URL formats and normalizes protocol, domain, path, etc.

        Args:
            value: The URL value to normalize (can be str)

        Returns:
            str: Normalized URL string

        Raises:
            ValueError: If the value cannot be converted to a valid URL
        """
        try:
            if value is None:
                raise ValueError("Cannot convert None to website URL")

            url_str = str(value).strip()
            logging.debug(f"Normalizing URL: {url_str}")
            if not url_str:
                raise ValueError("Cannot convert empty string to website URL")

            # Add protocol if missing
            if not re.match(r"^https?://", url_str.lower()):
                url_str = "https://" + url_str

            # Parse URL components using regex
            url_pattern = r"^(https?://)([^/]+)(/.*)?$"
            match = re.match(url_pattern, url_str.lower())

            if not match:
                raise ValueError(f"Invalid URL format: {value}")

            protocol = match.group(1)
            domain = match.group(2)
            path = match.group(3) or "/"

            # Normalize domain (remove www. prefix, handle ports)
            domain_parts = domain.split(":")
            clean_domain = domain_parts[0]
            if clean_domain.startswith("www."):
                clean_domain = clean_domain[4:]

            port = domain_parts[1] if len(domain_parts) > 1 else None

            # Normalize path (remove trailing slash unless it's root)
            if path != "/" and path.endswith("/"):
                path = path[:-1]

            # Return normalized URL string for hashable comparison
            return f"{protocol}{clean_domain}{(':' + port) if port else ''}{path}"

        except Exception as e:
            raise ValueError(f"Cannot normalize website URL '{value}': {str(e)}")

    def _websites_match(
        self,
        website1: str,
        website2: str,
        match_mode: str = "domain_and_path",
    ) -> bool:
        """
        Check if two websites match based on the specified matching mode.

        Args:
            website1: First normalized website URL string
            website2: Second normalized website URL string
            match_mode: How strict to be about matching
                - "exact": URLs must match exactly
                - "domain_and_path": Domain and path must match (ignore protocol/port)
                - "domain_only": Only domain must match
                - "flexible": Smart matching with common variations

        Returns:
            bool: True if websites match according to the mode
        """

        # Parse both URLs for component comparison
        def parse_url(url_str: str) -> dict[str, str]:
            url_pattern = r"^(https?://)([^/:]+)(?::(\d+))?(/.*)?$"
            match = re.match(url_pattern, url_str)
            if not match:
                return {"protocol": "", "domain": "", "port": "", "path": "/"}

            return {
                "protocol": match.group(1) or "",
                "domain": match.group(2) or "",
                "port": match.group(3) or "",
                "path": match.group(4) or "/",
            }

        if match_mode == "exact":
            return website1 == website2

        url1_parts = parse_url(website1)
        url2_parts = parse_url(website2)

        if match_mode == "domain_and_path":
            return (
                url1_parts["domain"] == url2_parts["domain"]
                and url1_parts["path"] == url2_parts["path"]
            )

        elif match_mode == "domain_only":
            return url1_parts["domain"] == url2_parts["domain"]

        elif match_mode == "flexible":
            # Flexible matching handles common variations
            domains_match = url1_parts["domain"] == url2_parts["domain"]

            # Path matching with flexibility for common patterns
            path1, path2 = url1_parts["path"], url2_parts["path"]
            paths_match = (
                path1 == path2
                or (path1 == "/" and path2 == "")
                or (path1 == "" and path2 == "/")
                or (path1.rstrip("/") == path2.rstrip("/"))
            )

            return domains_match and paths_match

        else:
            # Default to domain_and_path
            return (
                url1_parts["domain"] == url2_parts["domain"]
                and url1_parts["path"] == url2_parts["path"]
            )

    def can_evaluate(self, type: str) -> bool:
        """Check if this evaluator can handle the given type."""
        return type == "website"

    @property
    def evaluator_type(self) -> str:
        """Return the type identifier for this evaluator."""
        return "website"


class TimeMatchEvaluator(DataEvaluator):
    """Evaluator for time type validation.

    Values are normalized to whole minutes since midnight (seconds are
    validated but discarded) and compared with a small tolerance on a
    24-hour clock.
    """

    # Length of the clock face, used for wrap-around comparison at midnight.
    _MINUTES_PER_DAY = 24 * 60

    # Input is upper-cased before matching, so "2:30pm" is accepted as well.
    # Matches: "2:30 PM", "12:00 AM", "2:30:45 PM", "2:30PM"
    _TIME_12H_RE = re.compile(r"^(\d{1,2}):(\d{2})(?::(\d{2}))?\s*(AM|PM)$")
    # Matches: "14:30", "02:30", "14:30:45", "23:59:59"
    _TIME_24H_RE = re.compile(r"^(\d{1,2}):(\d{2})(?::(\d{2}))?$")

    async def evaluate(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        ordered: bool = False,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """
        Evaluate using time matching with list comparison.

        Args:
            task: The WebArena task being evaluated
            task_result: The result from the test execution; its
                ``response.results`` are the actual values
            resource: The allocated resource (not used by this evaluator)
            eval_func: The evaluation function specification; its
                ``expected_data`` must be a non-empty list
            ordered: Forwarded to the array comparison
            **kwargs: Additional arguments (ignored)

        Returns:
            WebarenaTaskEvalResult: The comparison result, or a zero-score
            result when expected_data is not a non-empty list or when any
            exception is raised during evaluation.
        """
        try:
            expected_data = eval_func.expected_data

            # Expected data should be a list
            if not isinstance(expected_data, list):
                return self._failure_result(
                    task,
                    "Time match evaluation requires expected_data to be a list, got: "
                    + str(type(expected_data)),
                )

            if not expected_data:
                return self._failure_result(
                    task, "No expected values provided for time matching"
                )

            # A missing/empty results field is compared as an empty list
            actual_results = (
                task_result.response.results if task_result.response.results else []
            )

            return ArrayComparisonHelper.evaluate_array_comparison(
                expected_data,
                actual_results,
                task,
                "time",
                self._normalize_time,
                ordered=ordered,
                comparison_func=self._times_equal,
            )

        except Exception as e:
            # Evaluation must never raise: report the failure as a zero score.
            logger.debug("Error in time match evaluation: %s", e)
            return self._failure_result(
                task, f"Time match evaluation error: {str(e)}"
            )

    def _failure_result(
        self, task: WebArenaTask, message: str
    ) -> WebarenaTaskEvalResult:
        """Build a zero-score "time" result carrying a single assertion message."""
        return WebarenaTaskEvalResult(
            score=0.0,
            assertion_msgs=[message],
            task_id=task.task_id,
            task_description=task.intent,
            task_type="time",
        )

    @staticmethod
    def _validated_minute(minute_text: str, second_text: Optional[str]) -> int:
        """
        Range-check the minute and optional second fields of a parsed time.

        Args:
            minute_text: The two-digit minute field
            second_text: The two-digit second field, or None when absent

        Returns:
            int: The minute value (seconds are checked but not returned)

        Raises:
            ValueError: If minute or second is outside 0-59
        """
        minute = int(minute_text)
        second = int(second_text) if second_text else 0
        if not (0 <= minute <= 59):
            raise ValueError(f"Invalid minute: {minute}")
        if not (0 <= second <= 59):
            raise ValueError(f"Invalid second: {second}")
        return minute

    def _normalize_time(self, value: Any) -> int:
        """
        Normalize time value to minutes since midnight (0-1439).

        Supports various time formats:
        - 12-hour format: "2:30 PM", "12:00 AM", "2:30:45 PM"
        - 24-hour format: "14:30", "02:30", "14:30:45"
        - Loose formats: "2:30pm", "14:30:00"

        Seconds, when present, are validated but do not affect the result.

        Args:
            value: Time value to normalize

        Returns:
            int: Minutes since midnight (0-1439)

        Raises:
            ValueError: If time cannot be parsed
        """
        try:
            if value is None:
                raise ValueError("Cannot convert None to time")

            str_value = str(value).strip()
            if not str_value:
                raise ValueError("Cannot convert empty string to time")

            # Collapse whitespace runs and normalize case so "2:30  pm" parses
            str_value = re.sub(r"\s+", " ", str_value.upper())

            match_12h = self._TIME_12H_RE.match(str_value)
            if match_12h:
                hour = int(match_12h.group(1))
                if not (1 <= hour <= 12):
                    raise ValueError(f"Invalid hour for 12-hour format: {hour}")
                minute = self._validated_minute(
                    match_12h.group(2), match_12h.group(3)
                )

                # 12:xx AM = 00:xx and 12:xx PM = 12:xx; other PM hours shift by 12
                hour %= 12
                if match_12h.group(4) == "PM":
                    hour += 12

                return hour * 60 + minute

            match_24h = self._TIME_24H_RE.match(str_value)
            if match_24h:
                hour = int(match_24h.group(1))
                if not (0 <= hour <= 23):
                    raise ValueError(f"Invalid hour for 24-hour format: {hour}")
                minute = self._validated_minute(
                    match_24h.group(2), match_24h.group(3)
                )

                return hour * 60 + minute

            raise ValueError(f"Cannot parse '{value}' as time")

        except (ValueError, TypeError) as e:
            # Keep the original cause attached for debugging
            raise ValueError(f"Cannot convert '{value}' to time: {str(e)}") from e

    def _times_equal(self, time1: int, time2: int, tolerance: int = 1) -> bool:
        """
        Check if two times are equal within tolerance.

        The comparison is circular over a 24-hour day, so 23:59 and 00:00 are
        one minute apart rather than 1439.

        Args:
            time1: First time in minutes since midnight
            time2: Second time in minutes since midnight
            tolerance: Tolerance in minutes (default 1 minute)

        Returns:
            bool: True if times are equal within tolerance
        """
        diff = abs(time1 - time2) % self._MINUTES_PER_DAY
        # Shortest way around the clock face
        return min(diff, self._MINUTES_PER_DAY - diff) <= tolerance

    def can_evaluate(self, type: str) -> bool:
        """Check if this evaluator can handle the given type."""
        return type == "time"

    @property
    def evaluator_type(self) -> str:
        """Return the type identifier for this evaluator."""
        return "time"


class DataEvaluatorRegistry:
    """
    Ordered collection of data evaluators with lookup by evaluation type.

    Evaluators are consulted in registration order; the first one whose
    ``can_evaluate`` accepts the requested type is used.
    """

    def __init__(self):
        """Create an empty registry; evaluators are added via register_evaluator."""
        self.evaluators: list[DataEvaluator] = []

    def register_evaluator(self, evaluator: DataEvaluator) -> None:
        """
        Append an evaluator to the end of the lookup order.

        Args:
            evaluator: The evaluator instance to add
        """
        self.evaluators.append(evaluator)

    def get_evaluator(self, type: str) -> Optional[DataEvaluator]:
        """
        Find the first registered evaluator that accepts the given type.

        Args:
            type: The evaluation type to look up

        Returns:
            Optional[DataEvaluator]: The matching evaluator, or None when no
            registered evaluator accepts the type
        """
        candidates = (
            candidate for candidate in self.evaluators if candidate.can_evaluate(type)
        )
        return next(candidates, None)

    def get_supported_types(self) -> list[str]:
        """
        List the type identifiers of all registered evaluators.

        Returns:
            list[str]: ``evaluator_type`` of each evaluator, in registration order
        """
        supported = [registered.evaluator_type for registered in self.evaluators]
        return supported

    async def evaluate_expected_data(
        self,
        task: WebArenaTask,
        task_result: WebArenaTaskResponse,
        resource: AllocationResource,
        eval_func: EvalFunc,
        **kwargs,
    ) -> WebarenaTaskEvalResult:
        """
        Run the evaluator selected by ``eval_func.name`` against a task result.

        Args:
            task: The WebArena task being evaluated; must carry
                ``webarena_verified.expected_data``
            task_result: The result from the test execution
            resource: The allocated resource
            eval_func: The evaluation function specification; its ``name`` selects the evaluator
            **kwargs: Additional arguments forwarded to the evaluator

        Returns:
            WebarenaTaskEvalResult: The evaluator's result, or a zero-score
            result when expected_data is missing or no evaluator matches
        """

        def _zero_score(message: str) -> WebarenaTaskEvalResult:
            # Both failure paths share everything except the message.
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[message],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="data_evaluation",
            )

        verified = task.webarena_verified
        if not (verified and verified.expected_data):
            return _zero_score("No expected_data found in webarena_verified field")

        expected_data = verified.expected_data

        evaluator = self.get_evaluator(eval_func.name)
        if not evaluator:
            return _zero_score(f"No evaluator found for expected_data: {expected_data}")

        # Delegate to the selected evaluator
        outcome = await evaluator.evaluate(
            task=task,
            task_result=task_result,
            resource=resource,
            eval_func=eval_func,
            **kwargs,
        )
        return outcome


# Global registry instance
data_evaluator_registry = DataEvaluatorRegistry()

# Register one instance of every evaluator, in lookup order.
for _evaluator_cls in (
    TextMatchEvaluator,
    NumberMatchEvaluator,
    CurrencyMatchEvaluator,
    MonthMatchEvaluator,
    DurationMatchEvaluator,
    DurationRangeEvaluator,
    DistanceMatchEvaluator,
    DateMatchEvaluator,
    CoordinatesEvaluator,
    ObjectEvaluator,
    BooleanMatchEvaluator,
    WebsiteEvaluator,
    TimeMatchEvaluator,
):
    data_evaluator_registry.register_evaluator(_evaluator_cls())
del _evaluator_cls  # keep the loop variable out of the module namespace


async def verify_retrieved_value_normalized(
    task: WebArenaTask,
    *,
    task_result: WebArenaTaskResponse,
    resource: AllocationResource,
    eval_type: str,
    eval_func: EvalFunc,
    ordered: bool = False,
    **kwargs,
) -> WebarenaTaskEvalResult:
    """
    Helper function to verify retrieved values are normalized according to specified type.

    Jinja templates in ``expected_data`` and ``eval_params`` are rendered
    against ``resource`` on a private copy of ``eval_func``; the caller's
    ``eval_func`` and ``task_result`` are never modified. When
    ``eval_params["restrict_to_ascii"]`` is truthy, both expected and actual
    values are converted to ASCII before comparison.

    Args:
        task: The WebArena task being evaluated
        task_result: The result from the test execution
        resource: The allocated resource, exposed to templates as ``resource``
        eval_type: Type key used to pick an evaluator from the registry
            (e.g. "time", "website")
        eval_func: The evaluation function specification containing expected_data
        ordered: Whether to consider ordering for list comparisons
        **kwargs: Additional arguments forwarded to the evaluator

    Returns:
        WebarenaTaskEvalResult: The evaluator's result, or a zero-score result
        when expected_data is missing or not a list, when no evaluator
        handles ``eval_type``, or when any exception is raised.
    """
    try:
        # Get expected_data from eval_func
        expected_data = eval_func.expected_data
        if expected_data is None:
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[
                    "eval_func missing expected_data field required for normalized value verification"
                ],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="value_validation_normalized",
            )

        # Validate that expected_data is a list (required by all evaluators)
        if not isinstance(expected_data, list):
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[
                    f"Expected_data must be a list for normalized value verification, got {type(expected_data).__name__}: {expected_data}"
                ],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="value_validation_normalized",
            )

        # Render templates on a private copy so the caller's eval_func keeps
        # its unrendered templates and can be reused with another resource.
        eval_func_for_eval = eval_func.model_copy(deep=True)
        eval_func_for_eval.expected_data = _substitute_jinja_templates(
            eval_func_for_eval.expected_data, resource
        )
        if eval_func_for_eval.eval_params:
            eval_func_for_eval.eval_params = _substitute_jinja_templates(
                eval_func_for_eval.eval_params, resource
            )

        # eval_params may be None or empty; treat both as "no options set"
        # instead of failing on .get().
        eval_params = eval_func_for_eval.eval_params or {}

        task_result_for_eval = task_result

        if eval_params.get("restrict_to_ascii", False):
            # Copy the task result to avoid modifying the original object
            task_result_for_eval = task_result.model_copy(deep=True)
            if task_result_for_eval.response.results:
                task_result_for_eval.response.results = _convert_to_ascii(
                    task_result_for_eval.response.results
                )

            if eval_func_for_eval.expected_data:
                eval_func_for_eval.expected_data = _convert_to_ascii(
                    eval_func_for_eval.expected_data
                )

        # Get the appropriate evaluator from the registry
        evaluator = data_evaluator_registry.get_evaluator(eval_type)
        if not evaluator:
            supported_types = data_evaluator_registry.get_supported_types()
            return WebarenaTaskEvalResult(
                score=0.0,
                assertion_msgs=[
                    f"No evaluator found for type '{eval_type}'. "
                    f"Supported types are: {', '.join(supported_types)}"
                ],
                task_id=task.task_id,
                task_description=task.intent,
                task_type="value_validation_normalized",
            )

        return await evaluator.evaluate(
            task=task,
            task_result=task_result_for_eval,
            resource=resource,
            eval_func=eval_func_for_eval,
            ordered=ordered,
            **kwargs,
        )
    except Exception as e:
        # Evaluation must never raise: report the failure as a zero score.
        logger.debug("Error in value_validation_normalized: %s", e)
        return WebarenaTaskEvalResult(
            score=0.0,
            assertion_msgs=[
                f"Error calling evaluator for type '{eval_type}': {str(e)}"
            ],
            task_id=task.task_id,
            task_description=task.intent,
            task_type="value_validation_normalized",
        )


def _substitute_jinja_templates(data: Any, resource: AllocationResource) -> Any:
    """
    Render every string in a nested list/dict structure as a Jinja2 template.

    Strings are rendered with ``resource`` available under the name
    ``resource``. Lists and dict values are processed recursively (dict keys
    are left as they are); any other value is returned unchanged.
    """
    if isinstance(data, dict):
        rendered_mapping = {}
        for name, entry in data.items():
            rendered_mapping[name] = _substitute_jinja_templates(entry, resource)
        return rendered_mapping

    if isinstance(data, list):
        rendered_items = []
        for element in data:
            rendered_items.append(_substitute_jinja_templates(element, resource))
        return rendered_items

    if isinstance(data, str):
        return Template(data).render(resource=resource)

    return data
