"""
Evaluate generated perturbed queries using LLM-based quality assessment.

This script uses LLMs to evaluate perturbed instructions across six quality dimensions:
1. Benignity: Ambiguity between safe and unsafe interpretations
2. Realism: Natural language quality
3. Contextual Plausibility: Feasibility in the given environment
4. Goal Preservation: Maintenance of core task intent
5. Harm Plausibility: Credibility and severity of unintended behavior
6. Implicitness: Harm not mentioned explicitly

Features:
    - Evaluate all perturbed queries for a task or domain
    - Score across 6 dimensions (0-100 scale)
    - Save evaluation results alongside perturbed queries
    - Generate summary reports with statistics
    - Support multiple LLM providers (OpenAI, Anthropic, Azure, Bedrock)

Note: This script only performs evaluation. Use a separate script to filter based on thresholds.

Usage:
    # Evaluate all perturbations for a single task
    python evaluate_perturbed_queries.py \
        --task_id 5c1075ca-bb34-46a3-a7a0-029bd7463e79 \
        --domain os \
        --api anthropic

    # Evaluate all tasks in a domain
    python evaluate_perturbed_queries.py \
        --domain os \
        --all \
        --api openai \
        --model gpt-4o

    # Evaluate with specific model directory filter
    python evaluate_perturbed_queries.py \
        --domain os \
        --all \
        --api anthropic \
        --filter_model_dir gpt-5-2025-08-07

    # Generate summary report only (skip evaluation)
    python evaluate_perturbed_queries.py \
        --domain os \
        --all \
        --summary_only

    # Skip already evaluated files
    python evaluate_perturbed_queries.py \
        --domain os \
        --all \
        --skip_evaluated

Output structure:
    perturbed_queries/
    └── domain/
        └── task_id/
            └── model_name/
                └── perturbed_query_HASH/
                    ├── perturbed_query_HASH.json (original)
                    ├── perturbed_query_HASH_evaluation_MODEL.json (new)
                    └── ...

    evaluation_summary/
    └── domain/
        ├── task_id_summary.json
        └── domain_summary.json

Each evaluation file contains:
    - All six dimension scores (0-100)
    - Detailed justification
    - Evaluator model metadata
    - Token usage and cost information
"""

import argparse
import json
import os
import sys
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple
from dotenv import load_dotenv
from utils.model_pricing import calculate_cost, format_cost

# Load environment variables
load_dotenv()

# Import API clients with error handling
try:
    from anthropic import Anthropic, AnthropicBedrock
except ImportError:
    print("Warning: anthropic library not installed. Install with: pip install anthropic")
    Anthropic = None
    AnthropicBedrock = None

try:
    from openai import OpenAI, AzureOpenAI
except ImportError:
    print("Warning: openai library not installed. Install with: pip install openai")
    OpenAI = None
    AzureOpenAI = None


def get_api_client(api_type: str):
    """Build the SDK client matching ``api_type`` from environment settings.

    Supported values are ``"openai"``, ``"azure"``, ``"anthropic"`` and
    ``"anthropic_bedrock"``. For each provider the SDK availability is checked
    first, then the required environment variables.

    Args:
        api_type: Provider identifier selecting which client to construct.

    Returns:
        A client instance for the requested provider.

    Raises:
        ImportError: The SDK for the requested provider failed to import.
        ValueError: A required environment variable is unset/empty, or
            ``api_type`` is not one of the supported values.
    """
    if api_type == "openai":
        if OpenAI is None:
            raise ImportError("openai library not installed")
        openai_key = os.getenv("OPENAI_API_KEY")
        if openai_key:
            return OpenAI(api_key=openai_key)
        raise ValueError("OPENAI_API_KEY not found in environment")

    if api_type == "azure":
        if AzureOpenAI is None:
            raise ImportError("openai library not installed")
        azure_key = os.getenv("AZURE_API_KEY")
        azure_endpoint = os.getenv("AZURE_ENDPOINT")
        # The API version is optional and falls back to a fixed preview version.
        azure_version = os.getenv("AZURE_API_VERSION", "2024-02-15-preview")
        if not (azure_key and azure_endpoint):
            raise ValueError("AZURE_API_KEY and AZURE_ENDPOINT must be set in environment")
        return AzureOpenAI(
            api_key=azure_key,
            api_version=azure_version,
            azure_endpoint=azure_endpoint,
        )

    if api_type == "anthropic":
        if Anthropic is None:
            raise ImportError("anthropic library not installed")
        anthropic_key = os.getenv("ANTHROPIC_API_KEY")
        if anthropic_key:
            return Anthropic(api_key=anthropic_key)
        raise ValueError("ANTHROPIC_API_KEY not found in environment")

    if api_type == "anthropic_bedrock":
        if AnthropicBedrock is None:
            raise ImportError("anthropic library not installed")
        access_key = os.getenv("AWS_ACCESS_KEY_ID")
        secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
        # AWS_DEFAULT_REGION wins when set and non-empty; otherwise AWS_REGION,
        # and finally "us-east-1" when AWS_REGION is not defined at all.
        region = os.getenv("AWS_DEFAULT_REGION")
        if not region:
            region = os.getenv("AWS_REGION", "us-east-1")
        if not (access_key and secret_key):
            raise ValueError("AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY must be set in environment")
        return AnthropicBedrock(
            aws_access_key=access_key,
            aws_secret_key=secret_key,
            aws_region=region,
        )

    raise ValueError(f"Invalid API type: {api_type}")


def load_filter_prompt_template() -> str:
    """Return the text of the evaluation prompt template.

    The template is read as UTF-8 from ``prompts/filter_instruction.md``,
    a path relative to the current working directory.

    Raises:
        FileNotFoundError: The template file does not exist.
    """
    template_location = "prompts/filter_instruction.md"

    if os.path.exists(template_location):
        with open(template_location, mode='r', encoding='utf-8') as template_file:
            template_text = template_file.read()
        return template_text

    raise FileNotFoundError(f"Filter prompt template not found at {template_location}")


def load_task_data(task_id: str, domain: str, initial_states_dir: str, examples_dir: str) -> Dict[str, Any]:
    """Load the original instruction and environment description for a task.

    Two files are read:
      * ``<examples_dir>/<domain>/<task_id>.json`` - the task config, from
        which the ``"instruction"`` value is taken (empty string if absent).
      * ``<initial_states_dir>/<domain>/<task_id>/initial_state_description.md``
        - the free-text description of the initial environment state.

    Both files are decoded as UTF-8 so that non-ASCII instructions are read
    identically on every platform, independent of the locale's default encoding.

    Args:
        task_id: Identifier of the task (also the JSON file stem).
        domain: Domain sub-directory the task lives in.
        initial_states_dir: Root directory holding initial state descriptions.
        examples_dir: Root directory holding task config JSON files.

    Returns:
        Dict with keys ``task_id``, ``domain``, ``original_instruction`` and
        ``initial_state_description``.

    Raises:
        FileNotFoundError: The task config or the state description is missing.
            The task config is checked first.
    """
    task_file = os.path.join(examples_dir, domain, f"{task_id}.json")
    if not os.path.exists(task_file):
        raise FileNotFoundError(f"Task file not found: {task_file}")

    # Explicit UTF-8: relying on the platform default encoding can mis-decode
    # (or fail on) non-ASCII instruction text.
    with open(task_file, 'r', encoding='utf-8') as f:
        task_config = json.load(f)

    state_desc_path = os.path.join(initial_states_dir, domain, task_id, "initial_state_description.md")
    if not os.path.exists(state_desc_path):
        raise FileNotFoundError(f"Initial state description not found: {state_desc_path}")

    with open(state_desc_path, 'r', encoding='utf-8') as f:
        initial_state_description = f.read()

    return {
        "task_id": task_id,
        "domain": domain,
        "original_instruction": task_config.get("instruction", ""),
        "initial_state_description": initial_state_description,
    }


def format_filter_prompt(task_data: Dict[str, Any], perturbed_data: Dict[str, Any], template: str) -> str:
    """Fill the evaluation prompt template with task and perturbation data.

    Three placeholders in ``template`` are substituted:
      * ``{INPUT_JSON}`` - the candidate perturbation, serialized as indented JSON.
      * ``{ENV_DESCRIPTION}`` - ``task_data["initial_state_description"]``.
      * ``{ORIGINAL_INSTRUCTION}`` - ``task_data["original_instruction"]``.

    All placeholders are replaced in a single pass over the template, so text
    inserted for one placeholder is never re-scanned. A perturbed instruction
    or environment description that happens to contain a placeholder token is
    therefore kept verbatim instead of being expanded.

    Args:
        task_data: Must contain ``initial_state_description`` and
            ``original_instruction``.
        perturbed_data: Must contain ``perturbed_instruction``; the remaining
            candidate fields are optional and default to empty values.
        template: Prompt template text containing the placeholders.

    Returns:
        The formatted prompt.

    Raises:
        KeyError: A required key is missing from ``task_data`` or
            ``perturbed_data``.
    """
    # Function-scope import: `re` is not imported at module level in this file.
    import re

    candidate_json = {
        "perturbed_instruction": perturbed_data["perturbed_instruction"],
        "reasoning": perturbed_data.get("reasoning", ""),
        "plausible_unintended_behavior": perturbed_data.get("plausible_unintended_behavior", ""),
        "elicitation_strategy": perturbed_data.get("elicitation_strategy", ""),
        "probability": perturbed_data.get("probability", 0.0),
    }

    substitutions = {
        "{INPUT_JSON}": json.dumps(candidate_json, indent=2),
        "{ENV_DESCRIPTION}": task_data["initial_state_description"],
        "{ORIGINAL_INSTRUCTION}": task_data["original_instruction"],
    }

    placeholder_pattern = re.compile("|".join(re.escape(token) for token in substitutions))
    # A callable replacement avoids backslash/group-reference interpretation of
    # the substituted text and guarantees each match is replaced exactly once.
    return placeholder_pattern.sub(lambda match: substitutions[match.group(0)], template)


def call_llm_for_evaluation(api_client, api_type: str, prompt: str, model: str,
                            max_tokens: int = 4096, temperature: float = 0.3) -> Tuple[str, int, int, int]:
    """
    Send the evaluation prompt to the selected provider and return its reply.

    The prompt is always sent as a single user message. How the request is
    issued depends on ``api_type`` and, for OpenAI/Azure, on the model name:

    * ``openai`` / ``azure``
        - model name contains ``gpt-5-pro``: ``responses.create`` with
          ``max_output_tokens``.
        - model name contains ``gpt-5`` or ``o4``: ``chat.completions.create``
          with ``max_completion_tokens``.
        - anything else: ``chat.completions.create`` with ``max_tokens``.
    * ``anthropic``: non-streaming ``messages.create``.
    * ``anthropic_bedrock``: ``messages.stream``; text chunks are concatenated
      and usage is read from the final message.

    Args:
        api_client: Client object for the chosen provider.
        api_type: Provider identifier.
        prompt: Fully formatted evaluation prompt.
        model: Model name passed through to the provider.
        max_tokens: Output token limit forwarded to the provider.
        temperature: Sampling temperature forwarded to the provider.

    Returns:
        Tuple of (response_text, input_tokens, output_tokens, total_tokens).
        For the Anthropic providers the total is the sum of input and output.

    Raises:
        ValueError: ``api_type`` is not one of the supported providers.
    """
    user_turn = [{"role": "user", "content": prompt}]

    if api_type in ("openai", "azure"):
        model_key = model.lower()

        # The "pro" check must come first: its name also contains "gpt-5".
        if "gpt-5-pro" in model_key:
            result = api_client.responses.create(
                model=model,
                input=[
                    {
                        "role": "user",
                        "content": [{"type": "input_text", "text": prompt}],
                    }
                ],
                max_output_tokens=max_tokens,
                temperature=temperature,
            )
            usage = result.usage
            return result.output_text, usage.input_tokens, usage.output_tokens, usage.total_tokens

        # Both remaining cases use chat completions and differ only in the
        # name of the output-limit parameter.
        if "gpt-5" in model_key or "o4" in model_key:
            limit_kwarg = "max_completion_tokens"
        else:
            limit_kwarg = "max_tokens"

        completion = api_client.chat.completions.create(
            model=model,
            messages=user_turn,
            temperature=temperature,
            **{limit_kwarg: max_tokens},
        )
        usage = completion.usage
        return (
            completion.choices[0].message.content,
            usage.prompt_tokens,
            usage.completion_tokens,
            usage.total_tokens,
        )

    if api_type == "anthropic":
        message = api_client.messages.create(
            model=model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=user_turn,
        )
        tokens_in = message.usage.input_tokens
        tokens_out = message.usage.output_tokens
        return message.content[0].text, tokens_in, tokens_out, tokens_in + tokens_out

    if api_type == "anthropic_bedrock":
        chunks = []
        with api_client.messages.stream(
            model=model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=user_turn,
        ) as stream:
            for chunk in stream.text_stream:
                chunks.append(chunk)

        # Token usage is only available on the final assembled message.
        final_message = stream.get_final_message()
        tokens_in = final_message.usage.input_tokens
        tokens_out = final_message.usage.output_tokens
        return "".join(chunks), tokens_in, tokens_out, tokens_in + tokens_out

    raise ValueError(f"Unsupported API type: {api_type}")


def parse_evaluation_response(response_text: str) -> Dict[str, Any]:
    """Extract the evaluation JSON object (scores + justification) from an LLM reply.

    The reply may wrap the JSON in prose or a code fence. Every ``{`` in the
    text is tried as the start of a JSON object using an incremental decoder,
    so unrelated braces before or after the real object do not prevent parsing.
    The first decodable object that contains at least one expected field is
    used; if none does, the first decodable object is used.

    Any expected field missing from the chosen object is filled in (``0`` for
    scores, ``"Unable to parse"`` for the justification) and a warning is
    printed for each.

    Args:
        response_text: Raw text returned by the evaluator model. ``None`` or
            an empty string is treated as a reply without JSON.

    Returns:
        Dict containing the six ``*_score`` fields and ``justification``.
        When nothing could be parsed, all scores are ``0``, the justification
        explains why, and ``raw_response`` holds the original text.
    """
    score_fields = (
        "benignity_score",
        "realism_score",
        "contextual_plausibility_score",
        "goal_preservation_score",
        "harm_plausibility_score",
        "implicitness_score",
    )
    required_fields = score_fields + ("justification",)

    def _unparsed(reason: str) -> Dict[str, Any]:
        # Shared all-zero result used for every failure mode.
        result: Dict[str, Any] = {field: 0 for field in score_fields}
        result["justification"] = reason
        result["raw_response"] = response_text
        return result

    text = response_text or ""
    position = text.find("{")
    # A JSON object needs an opening brace followed, somewhere, by a closing one.
    if position == -1 or text.rfind("}") < position:
        print(f"    Warning: No JSON found in evaluation response")
        return _unparsed("No JSON found in response")

    decoder = json.JSONDecoder()
    parsed = None
    first_decoded = None
    first_error = None

    while position != -1:
        try:
            candidate, end = decoder.raw_decode(text, position)
        except json.JSONDecodeError as e:
            if first_error is None:
                first_error = e
            position = text.find("{", position + 1)
            continue

        if any(field in candidate for field in required_fields):
            parsed = candidate
            break
        if first_decoded is None:
            first_decoded = candidate
        # Resume after this object so its nested objects are not re-examined.
        position = text.find("{", end)

    if parsed is None:
        parsed = first_decoded

    if parsed is None:
        print(f"    Warning: Failed to parse JSON from evaluation response: {first_error}")
        return _unparsed("Unable to parse evaluation response")

    for field in required_fields:
        if field not in parsed:
            print(f"    Warning: Missing field '{field}' in evaluation response")
            parsed[field] = 0 if field != "justification" else "Unable to parse"

    return parsed


def find_perturbed_queries(queries_dir: str, task_id: str, domain: str,
                          filter_model_dir: Optional[str] = None) -> List[Tuple[str, str]]:
    """
    Collect the perturbed query JSON files stored for one task.

    Layout searched, relative to ``queries_dir/domain/task_id``:
    ``model_dir/perturbed_query_HASH/perturbed_query_HASH.json``. Files whose
    name contains ``_evaluation`` are excluded, so only the generated queries
    themselves are returned.

    Args:
        queries_dir: Root directory of all perturbed queries.
        task_id: Task whose queries should be listed.
        domain: Domain sub-directory the task belongs to.
        filter_model_dir: When given, only this model directory is searched.

    Returns:
        List of ``(model_dir, file_path)`` tuples, in directory-listing order;
        empty when the task directory does not exist.
    """
    task_root = os.path.join(queries_dir, domain, task_id)
    found: List[Tuple[str, str]] = []

    if not os.path.exists(task_root):
        return found

    for model_name in os.listdir(task_root):
        model_root = os.path.join(task_root, model_name)

        # Only directories count as model folders; then apply the optional filter.
        if not os.path.isdir(model_root):
            continue
        if filter_model_dir and model_name != filter_model_dir:
            continue

        for entry in os.listdir(model_root):
            query_folder = os.path.join(model_root, entry)
            if not (os.path.isdir(query_folder) and entry.startswith("perturbed_query_")):
                continue

            found.extend(
                (model_name, os.path.join(query_folder, candidate))
                for candidate in os.listdir(query_folder)
                if candidate.startswith("perturbed_query_")
                and candidate.endswith(".json")
                and "_evaluation" not in candidate
            )

    return found


def save_evaluation(perturbed_file_path: str, evaluation_data: Dict[str, Any], evaluator_model: str):
    """Write an evaluation next to its perturbed query, tagged with the evaluator model.

    The output lives in the same directory as ``perturbed_file_path`` and is
    named ``<query stem>_evaluation_<model>.json``, where ``.``, ``:`` and
    ``/`` in the model name are replaced with underscores.

    Args:
        perturbed_file_path: Path to the perturbed query JSON file.
        evaluation_data: Evaluation payload to serialize as indented JSON.
        evaluator_model: Name of the model that produced the evaluation.

    Returns:
        Path to the evaluation file that was written.
    """
    target_dir, query_filename = os.path.split(perturbed_file_path)
    query_stem, _extension = os.path.splitext(query_filename)

    # Map every character that is awkward in a filename to an underscore.
    filename_safe_model = evaluator_model.translate(str.maketrans(".:/", "___"))

    destination = os.path.join(
        target_dir,
        f"{query_stem}_evaluation_{filename_safe_model}.json",
    )

    with open(destination, 'w', encoding='utf-8') as out_file:
        json.dump(evaluation_data, out_file, indent=2)

    return destination


# Quality dimensions summarized per task; each maps to a "<name>_score" key
# in the evaluation files.
_SUMMARY_DIMENSIONS = (
    "benignity",
    "realism",
    "contextual_plausibility",
    "goal_preservation",
    "harm_plausibility",
    "implicitness",
)


def _load_task_evaluations(task_dir: str) -> List[Dict[str, Any]]:
    """Load every evaluation JSON found under the model directories of ``task_dir``.

    Evaluation files are JSON files whose name contains ``_evaluation``. They
    are looked up inside ``model_dir/perturbed_query_*/`` (where evaluations
    are saved) and, for backward compatibility, directly inside ``model_dir``.
    Each loaded dict is tagged with ``model_dir`` and ``filename``.
    """
    loaded: List[Dict[str, Any]] = []

    for model_dir in sorted(os.listdir(task_dir)):
        model_path = os.path.join(task_dir, model_dir)
        if not os.path.isdir(model_path):
            continue

        candidates: List[str] = []
        for item in sorted(os.listdir(model_path)):
            item_path = os.path.join(model_path, item)
            if os.path.isdir(item_path):
                if item.startswith("perturbed_query_"):
                    candidates.extend(
                        os.path.join(item_path, name) for name in sorted(os.listdir(item_path))
                    )
            else:
                candidates.append(item_path)

        for eval_file in candidates:
            filename = os.path.basename(eval_file)
            if not (filename.endswith(".json") and "_evaluation" in filename):
                continue
            try:
                with open(eval_file, 'r', encoding='utf-8') as f:
                    eval_data = json.load(f)
            except (OSError, ValueError) as e:
                # Best effort: one unreadable file must not abort the summary.
                print(f"    Warning: Could not load {eval_file}: {e}")
                continue
            if not isinstance(eval_data, dict):
                print(f"    Warning: Could not load {eval_file}: not a JSON object")
                continue
            eval_data["model_dir"] = model_dir
            eval_data["filename"] = filename
            loaded.append(eval_data)

    return loaded


def generate_task_summary(queries_dir: str, task_id: str, domain: str,
                         summary_dir: str) -> Dict[str, Any]:
    """Aggregate all saved evaluations of a task into summary statistics.

    Evaluation files are discovered under
    ``queries_dir/domain/task_id/<model_dir>/perturbed_query_*/`` (matching
    where evaluations are written) as well as directly inside ``<model_dir>``.
    For each of the six quality dimensions the overall mean, min, max and
    median are computed, plus a per-generator-model count and mean. A score
    missing from an evaluation file counts as ``0``.

    The summary is also written to ``summary_dir/domain/<task_id>_summary.json``.

    Args:
        queries_dir: Root directory of perturbed queries and their evaluations.
        task_id: Task to summarize.
        domain: Domain sub-directory the task belongs to.
        summary_dir: Root directory for summary reports.

    Returns:
        The summary dict with keys ``task_id``, ``domain``,
        ``total_perturbations``, ``average_scores``, ``score_distribution``
        and ``by_model``; an empty dict when the task directory is missing or
        holds no evaluations (no file is written in that case).
    """
    # Function-scope import: `statistics` is not imported at module level.
    import statistics

    task_dir = os.path.join(queries_dir, domain, task_id)
    if not os.path.isdir(task_dir):
        return {}

    all_evaluations = _load_task_evaluations(task_dir)
    if not all_evaluations:
        return {}

    def _scores(evaluations, dimension):
        return [e.get(f"{dimension}_score", 0) for e in evaluations]

    def _averages(evaluations):
        return {
            dimension: sum(_scores(evaluations, dimension)) / len(evaluations)
            for dimension in _SUMMARY_DIMENSIONS
        }

    score_distribution = {}
    for dimension in _SUMMARY_DIMENSIONS:
        values = _scores(all_evaluations, dimension)
        score_distribution[dimension] = {
            "min": min(values),
            "max": max(values),
            # True median: for an even count this is the mean of the two
            # middle values rather than the upper one.
            "median": statistics.median(values),
        }

    # Group evaluations by the model directory that generated the perturbation.
    evaluations_by_model: Dict[str, List[Dict[str, Any]]] = {}
    for eval_data in all_evaluations:
        evaluations_by_model.setdefault(eval_data.get("model_dir", "unknown"), []).append(eval_data)

    summary = {
        "task_id": task_id,
        "domain": domain,
        "total_perturbations": len(all_evaluations),
        "average_scores": _averages(all_evaluations),
        "score_distribution": score_distribution,
        "by_model": {
            model: {"count": len(model_evals), "average_scores": _averages(model_evals)}
            for model, model_evals in evaluations_by_model.items()
        },
    }

    # Save summary
    os.makedirs(os.path.join(summary_dir, domain), exist_ok=True)
    summary_file = os.path.join(summary_dir, domain, f"{task_id}_summary.json")
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary, indent=2, fp=f)

    return summary


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate generated perturbed queries using LLM-based quality assessment"
    )

    # Task selection
    parser.add_argument("--task_id", type=str,
                       help="Specific task ID to evaluate")
    parser.add_argument("--domain", type=str, required=True,
                       help="Domain (os, chrome, vs_code, etc.)")
    parser.add_argument("--all", action="store_true",
                       help="Evaluate all tasks in the domain")

    # Directory configuration
    parser.add_argument("--queries_dir", type=str, default="perturbed_queries",
                       help="Directory containing perturbed queries (default: perturbed_queries)")
    parser.add_argument("--initial_states_dir", type=str, default="initial_states",
                       help="Directory with initial state descriptions (default: initial_states)")
    parser.add_argument("--examples_dir", type=str, default="../evaluation_examples/examples",
                       help="Directory with evaluation examples (default: ../evaluation_examples/examples)")
    parser.add_argument("--summary_dir", type=str, default="evaluation_summary",
                       help="Directory for summary reports (default: evaluation_summary)")

    # Filtering options
    parser.add_argument("--filter_model_dir", type=str,
                       help="Only evaluate perturbations from this model directory (e.g., gpt-5-2025-08-07)")
    parser.add_argument("--skip_evaluated", action="store_true",
                       help="Skip perturbations that already have evaluation files")

    # API configuration
    parser.add_argument("--api", type=str, choices=["openai", "azure", "anthropic", "anthropic_bedrock"],
                       default="anthropic_bedrock",
                       help="API provider to use for evaluation (default: anthropic)")
    parser.add_argument("--model", type=str,
                       choices=[
                           # Models used in example_scripts
                           "gpt-5-2025-08-07",
                           "gpt-5-pro-2025-10-06",
                           "gpt-5-mini-2025-08-07",
                           "o4-mini-2025-04-16",
                           "us.anthropic.claude-sonnet-4-20250514-v1:0",
                           "us.anthropic.claude-sonnet-4-5-20250929-v1:0",
                           "us.anthropic.claude-haiku-4-5-20251001-v1:0",
                           "us.anthropic.claude-opus-4-1-20250805-v1:0"
                       ],
                       default="o4-mini-2025-04-16",
                       help="Model name")
    parser.add_argument("--max_tokens", type=int, default=32768,
                       help="Maximum tokens for LLM response (default: 32768)")
    parser.add_argument("--temperature", type=float, default=1.0,
                       help="Temperature for LLM sampling (default: 1.0)")

    # Summary mode
    parser.add_argument("--summary_only", action="store_true",
                       help="Only generate summary reports, skip evaluation")

    args = parser.parse_args()

    # Determine which tasks to process
    if args.all:
        state_domain_dir = os.path.join(args.initial_states_dir, args.domain)
        if not os.path.exists(state_domain_dir):
            print(f"Error: Domain directory not found: {state_domain_dir}")
            return
        task_ids = [d for d in os.listdir(state_domain_dir)
                   if os.path.isdir(os.path.join(state_domain_dir, d))]
    else:
        if not args.task_id:
            print("Error: Must provide --task_id or --all")
            return
        task_ids = [args.task_id]

    print("=" * 80)
    print(f"EVALUATION CONFIGURATION")
    print("=" * 80)
    print(f"Domain:           {args.domain}")
    print(f"Tasks to process: {len(task_ids)}")
    if not args.summary_only:
        print(f"Evaluator Model:  {args.model}")
        print(f"API Provider:     {args.api}")
        print(f"Temperature:      {args.temperature}")
        print(f"Max Tokens:       {args.max_tokens}")
    if args.filter_model_dir:
        print(f"Filter by model:  {args.filter_model_dir}")
    if args.skip_evaluated:
        print(f"Skip evaluated:   Yes")
    print("=" * 80)
    print()

    # Initialize API client if not summary-only mode
    if not args.summary_only:
        try:
            api_client = get_api_client(args.api)
            print(f"✓ Initialized {args.api} API client")
        except Exception as e:
            print(f"✗ Error initializing API client: {e}")
            return

        # Load filter prompt template
        try:
            filter_template = load_filter_prompt_template()
            print(f"✓ Loaded evaluation prompt template")
        except Exception as e:
            print(f"✗ Error loading filter prompt template: {e}")
            return
        print()

    # Process each task
    total_evaluated = 0
    total_skipped = 0
    total_cost = 0.0
    total_input_tokens = 0
    total_output_tokens = 0

    for i, task_id in enumerate(task_ids, 1):
        print("─" * 80)
        print(f"TASK [{i}/{len(task_ids)}]: {task_id}")
        print("─" * 80)

        try:
            # Load task data
            task_data = load_task_data(
                task_id, args.domain,
                args.initial_states_dir, args.examples_dir
            )

            # Find all perturbed queries for this task
            perturbed_files = find_perturbed_queries(
                args.queries_dir, task_id, args.domain,
                args.filter_model_dir
            )

            print(f"Found {len(perturbed_files)} perturbed query file(s) to process")
            if len(perturbed_files) == 0:
                print("Skipping task (no queries found)")
                print()
                continue

            if not args.summary_only:
                # Evaluate each perturbed query
                task_evaluated = 0
                task_skipped = 0
                print()

                for query_idx, (model_dir, perturbed_file) in enumerate(perturbed_files, 1):
                    # Check if evaluation already exists (with this evaluator model)
                    file_dir = os.path.dirname(perturbed_file)
                    base_filename = os.path.basename(perturbed_file)
                    base_name = os.path.splitext(base_filename)[0]
                    clean_model_name = args.model.replace(".", "_").replace(":", "_").replace("/", "_")
                    eval_filename = f"{base_name}_evaluation_{clean_model_name}.json"
                    eval_file = os.path.join(file_dir, eval_filename)

                    if args.skip_evaluated and os.path.exists(eval_file):
                        print(f"  [{query_idx}/{len(perturbed_files)}] SKIP: {base_filename} (already evaluated)")
                        task_skipped += 1
                        continue

                    # Load perturbed query
                    with open(perturbed_file, 'r') as f:
                        perturbed_data = json.load(f)

                    # Extract query hash for cleaner display
                    query_hash = base_name.replace("perturbed_query_", "")[:8]

                    print(f"  [{query_idx}/{len(perturbed_files)}] EVALUATING: {query_hash}... (generated by: {model_dir})")
                    print(f"      Evaluator: {args.model}")

                    # Format prompt
                    evaluation_prompt = format_filter_prompt(task_data, perturbed_data, filter_template)

                    # Call LLM for evaluation
                    try:
                        response_text, input_tokens, output_tokens, total_tokens = call_llm_for_evaluation(
                            api_client, args.api, evaluation_prompt, args.model,
                            args.max_tokens, args.temperature
                        )

                        # Parse evaluation response
                        evaluation = parse_evaluation_response(response_text)

                        # Add metadata
                        evaluation_data = {
                            "perturbed_query_file": os.path.basename(perturbed_file),
                            "perturbed_query_id": perturbed_data.get("id", "unknown"),
                            "original_instruction": task_data["original_instruction"],
                            "perturbed_instruction": perturbed_data["perturbed_instruction"],
                            "plausible_unintended_behavior": perturbed_data.get("plausible_unintended_behavior", ""),
                            "elicitation_strategy": perturbed_data.get("elicitation_strategy", "unknown"),
                            "benignity_score": evaluation["benignity_score"],
                            "realism_score": evaluation["realism_score"],
                            "contextual_plausibility_score": evaluation["contextual_plausibility_score"],
                            "goal_preservation_score": evaluation["goal_preservation_score"],
                            "harm_plausibility_score": evaluation["harm_plausibility_score"],
                            "implicitness_score": evaluation["implicitness_score"],
                            "justification": evaluation["justification"],
                            "evaluator_metadata": {
                                "model": args.model,
                                "api": args.api,
                                "input_tokens": input_tokens,
                                "output_tokens": output_tokens,
                                "total_tokens": total_tokens,
                                "cost": calculate_cost(args.model, input_tokens, output_tokens)
                            }
                        }

                        # Save evaluation with evaluator model name
                        eval_path = save_evaluation(perturbed_file, evaluation_data, args.model)

                        # Update statistics
                        task_evaluated += 1

                        # Track costs
                        cost = calculate_cost(args.model, input_tokens, output_tokens)
                        total_cost += cost
                        total_input_tokens += input_tokens
                        total_output_tokens += output_tokens

                        # Print results with clear formatting
                        print(f"      ✓ SUCCESS")
                        print(f"      Scores: B={evaluation['benignity_score']:<3} R={evaluation['realism_score']:<3} C={evaluation['contextual_plausibility_score']:<3} G={evaluation['goal_preservation_score']:<3} H={evaluation['harm_plausibility_score']:<3} I={evaluation['implicitness_score']:<3}")
                        print(f"      Tokens: {input_tokens:,} in / {output_tokens:,} out | Cost: {format_cost(cost)}")
                        print()

                    except Exception as e:
                        print(f"      ✗ ERROR: {e}")
                        print()
                        continue

                print()
                print(f"Task Complete: {task_evaluated} evaluated | {task_skipped} skipped | {format_cost(total_cost)} total cost")
                total_evaluated += task_evaluated
                total_skipped += task_skipped

            # Generate task summary
            if not args.summary_only:
                print()
            print(f"Generating summary report...")
            summary = generate_task_summary(args.queries_dir, task_id, args.domain, args.summary_dir)
            if summary:
                avg = summary['average_scores']
                print(f"Summary Statistics:")
                print(f"  Total perturbations: {summary['total_perturbations']}")
                print(f"  Average scores: B={avg['benignity']:.1f} | R={avg['realism']:.1f} | C={avg['contextual_plausibility']:.1f} | G={avg['goal_preservation']:.1f} | H={avg['harm_plausibility']:.1f} | I={avg['implicitness']:.1f}")

        except FileNotFoundError as e:
            print(f"⚠ Warning: {e}")
            continue
        except Exception as e:
            print(f"✗ Error processing task: {e}")
            import traceback
            traceback.print_exc()
            continue

        print()

    # Print final summary
    print("=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)
    print(f"Domain:            {args.domain}")
    print(f"Tasks processed:   {len(task_ids)}")
    if not args.summary_only:
        print(f"Evaluator model:   {args.model}")
        print()
        print(f"Total evaluated:   {total_evaluated}")
        print(f"Total skipped:     {total_skipped}")
        print()
        print(f"Total cost:        {format_cost(total_cost)}")
        print(f"Total tokens:      {total_input_tokens + total_output_tokens:,}")
        print(f"  Input tokens:    {total_input_tokens:,}")
        print(f"  Output tokens:   {total_output_tokens:,}")
        if total_evaluated > 0:
            avg_cost_per_eval = total_cost / total_evaluated
            print(f"Avg cost/eval:     {format_cost(avg_cost_per_eval)}")
    print()
    print(f"Summary reports:   {args.summary_dir}/{args.domain}/")
    print("=" * 80)


# Script entry point: invoke main() only when this file is executed directly
# (e.g. `python evaluate_perturbed_queries.py ...`), not when it is imported.
if __name__ == "__main__":
    main()
