#!/usr/bin/env python3
"""
Dummy test for SWE-bench with all combinations of agent_type x impossible_type x dummy_type.
Follows the exact pattern from swe_bench_tasks.py __main__ section.
"""

from swe_bench_tasks import swe_bench, eval_set
import json

# Absolute path of the JSON file whose content is passed to swe_bench() as
# `instance_ids` for every dummy task.
INSTANCE_IDS_PATH = '/mnt/data/benchmarking/inspect_evals/src/inspect_evals/impossiblebench/data/impossible_patches_sep8_full/sanity_check_goods.json'


def load_instance_ids(path):
    """Return the decoded JSON content of the file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, instead of being left to the garbage collector.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


def visibility_combinations():
    """Return the (hide_tests, reset_tests) pairs exercised by the dummy run.

    When tests are hidden only ``reset_tests=True`` is used; when they are
    visible both reset settings are used. Order: hidden first, then visible
    with reset, then visible without reset.
    """
    return [
        (hide_tests, reset_tests)
        for hide_tests in (True, False)
        for reset_tests in ((True,) if hide_tests else (True, False))
    ]


if __name__ == "__main__":
    sandbox_method = 'docker'  # Use docker for better isolation
    message_limit = 10  # Lower limit for dummy tests

    # Parse the instance-id file once instead of once per task.
    # NOTE(review): every task now receives the same parsed object; this
    # assumes swe_bench() does not mutate `instance_ids` -- confirm.
    instance_ids = load_instance_ids(INSTANCE_IDS_PATH)

    # Test with dummy modes
    tasks = []
    for dummy_type in ['oracle', 'nochange']:
        for hide_tests, reset_tests in visibility_combinations():
            for impossible_type in [None]:  # disabled: 'conflicting', 'oneoff'
                for agent_type in ['minimal']:  # disabled: 'tools'
                    tasks.append(swe_bench(
                        dataset="princeton-nlp/SWE-bench_Verified",
                        limit=50,
                        shuffle=True,
                        # Smaller alternative kept for quick local checks:
                        # dataset="MariusHobbhahn/swe-bench-verified-mini",
                        # limit=2,  # Small subset for testing
                        impossible_type=impossible_type,
                        agent_type=agent_type,
                        max_attempts=1,  # Single attempt for dummy
                        message_limit=message_limit,
                        sandbox_type=sandbox_method,
                        allow_internet=False,
                        reset_tests=reset_tests,
                        hide_tests=hide_tests,
                        dummy=dummy_type,  # Add dummy flag
                        instance_ids=instance_ids,
                    ))

    print(f"Created {len(tasks)} dummy test tasks")
    for i, task in enumerate(tasks):
        print(f"Task {i+1}: {task.name}")

    # Run evaluation
    eval_set(
        tasks,
        model=[
            "openai/gpt-4.1",
        ],
        max_connections=30,
        max_subprocesses=100,
        max_sandboxes=100,
        # Task cap equals the number of tasks built above.
        max_tasks=len(tasks),
        reasoning_tokens=4096,
        reasoning_effort='medium',
        log_dir='./logs/swebench_sep13dummytest-5',
        fail_on_error=False,
        log_dir_allow_dirty=True,
    )