{
    "uuid": "927ff9af-42f7-5216-a6f9-f106e8ff6759",
    "question": "On the HEML sentence level with AUC metric, which baseline outperforms MIND on specific conditons?Is it the best variant according to the paper that proposed that baseline?",
    "answer_format": "Your answer should be a Python list of two strings. The first string is the name of the baseline (with variant) that outperforms MIND, as proposed in the anchor PDF. The second string is either `true` or `false`.",
    "tags": [
        "multiple",
        "text",
        "table",
        "objective"
    ],
    "anchor_pdf": [
        "621d42a1-dbab-5003-b7c5-625335653001"
    ],
    "reference_pdf": [
        "ab661558-432d-5e5e-b49c-a3660a40986e",
        "1a21b653-3db0-55e8-9d34-8b6cd3dcbefa",
        "85111b8b-4df0-5a9a-8d11-a7ae12eebcf6",
        "0597ce2b-cd8c-5b5b-b692-e8042d8548de",
        "6df0f3f3-e2e1-5d7a-9d70-3114ceac5939",
        "02f7fff5-cec7-5ac8-a037-f5eb117b9547"
    ],
    "conference": [],
    "reasoning_steps": [
        "Loate the table that evaluates different baselines.",
        "Identify the variant that surpasses MIND.",
        "Read the paper that proposed the variant to determine if it is the best variant."
    ],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold": ["SCG-NLI", "false"],
            "ignore_order": false,
            "lowercase": true
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}