{
    "uuid": "897a955d-9e8a-5836-a3b6-0ed48575f2b9",
    "question": "In terms of evaluation results on SuperGLUE using RoBERTaBASE, on subtask ReCoRD, which model(s) achieve the overall best results? Additionally, which model(s) perform the best among the two-stage MTL models?",
    "answer_format": "Your answer should be a python dict(without \\n) containing two keys \"overall best\" and \"two-stage MTL best\", each value of the two keys is a python string list. e.g.{\"overall best\":[\"modelname1\"],\"two-stage MTL best\":[\"modelname2\",\"modelname3\"]} ",
    "tags": [
        "objective",
        "single",
        "table",
        "text"
    ],
    "conference": [],
    "reasoning_steps": [
        "Find information about two-stage MTL models(maybe a section or table mentioning which models or baselines are two-stage MTL).",
        "Locate the table about the performance results on SuperGLUE using RoBERTaBASE(usually a main experiment table containing different kinds of models and subtasks).",
        "Compare the performance of models on the subtask ReCoRD to determine which model(s) achieves the overall best results and which model(s) perform the best among the two-stage MTL models."
    ],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold": {
                "overall best": [
                    "PROPETL"
                ],
                "two-stage MTL best": [
                    "SCALEARNUNIFORM",
                    "SCALEARN++"
                ]
            },
            "ignore_order": true,
            "ignore_blank": true
        }
    },
    "state": {
        "gpt-4o-2024-11-20": false
    },
    "annotator": "human",
    "anchor_pdf": [
        "9b06b24b-0afc-5ccb-95fc-c662395d291d"
    ],
    "reference_pdf": []
}