from inspect_ai import eval
from llm_self_awareness_of_capability.experiment3.utils.swebench_task import swe_bench_task


def run_swebench_predict(model, log_dir, limit=None, *, reasoning_tokens=4096):
    """Build the SWE-bench task and evaluate ``model`` on it.

    Args:
        model: Model identifier string (e.g. ``"openai/gpt-4.1-nano"``),
            forwarded to ``eval`` as ``model``.
        log_dir: Forwarded to ``eval`` as ``log_dir``.
        limit: Forwarded to ``eval`` as ``limit``; defaults to ``None``.
        reasoning_tokens: Reasoning-token budget. It is forwarded to ``eval``
            only when ``model`` is ``anthropic/claude-3-7-sonnet-20250219``
            and the value is a positive number; otherwise it is ignored.
            Our paper uses both 0 and 4096.
    """
    # o-series models don't allow a tool called "python"
    python_tool_name = "python3" if model.startswith("openai/o") else "python"

    task_kwargs = {
        "python_tool_name": python_tool_name,
        "tool_call_limit": 70,
        "message_limit": 1000,
        # To run only specific tasks, add e.g.
        #   "instance_ids": ["sphinx-doc__sphinx-7889"]
        # The docker image for this sample doesn't work
        "exclude_ids": ["django__django-15278"],
    }

    eval_config = {
        # Number of failed samples after which the eval fails (the log file
        # will still save all successful samples).
        "fail_on_error": 10,
    }
    if (
        model == "anthropic/claude-3-7-sonnet-20250219"
        and reasoning_tokens is not None
        and reasoning_tokens > 0
    ):
        eval_config["reasoning_tokens"] = reasoning_tokens

    task = swe_bench_task(**task_kwargs)
    eval(task, model=model, log_dir=log_dir, limit=limit, **eval_config)

if __name__ == "__main__":
    # Smoke-run: a single sample with gpt-4.1-nano, logs under "gpt-41-nano/".
    run_swebench_predict(
        model="openai/gpt-4.1-nano",
        log_dir="gpt-41-nano/",
        limit=1,
    )