{
    "uuid": "ce95db65-95c3-55d5-8eda-3e80ef6d0775",
    "question": "Using only task-level prompts or using only example-specific prompts, which is better on the Multi-Domain test set?",
    "answer_format": "Your answer should be a single string, either \"task-level\" or \"example-specific\".",
    "tags": [
        "objective",
        "single",
        "table",
        "text"
    ],
    "conference": [],
    "reasoning_steps": [
        "Locate the experiment table that shows model performance on the Multi-Domain test set with different distributions of prompts.",
        "Identify the mathematical symbols that denote the numbers of the two types of prompts.",
        "Locate all the rows in the table related to the question.",
        "Compare the result data and determine the conclusion."
    ],
    "evaluator": {
        "eval_func": "eval_string_exact_match",
        "eval_kwargs": {
            "gold": "example-specific",
            "lowercase": true
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": true
    },
    "annotator": "human",
    "anchor_pdf": [
        "5a2b95c1-12d6-5b77-82a1-ee24180d27ae"
    ],
    "reference_pdf": []
}