{
    "uuid": "b1ee7930-cebf-5b6d-8ebc-bbc0a6246aca",
    "question": "In which comparisons of models did the two papers reach similar conclusions?",
    "answer_format": "Your answer should be a python list of several strings. The string should be language model name.",
    "tags": [
        "multiple",
        "text",
        "table",
        "objective"
    ],
    "anchor_pdf": [
        "fc6daddf-131f-59b0-adc2-85b97b4ecd82",
        "2773b9a9-f232-5acb-a1dd-a0168e52cf0c"
    ],
    "reference_pdf": [],
    "conference": [],
    "reasoning_steps": [],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold": [
                "GPT-2",
                "BERT"
            ],
            "ignore_order": true
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}