{
    "uuid": "25fd4dd0-a865-541f-bcdd-246a56ba36ed",
    "question": "Both the papers use the Model Performance on EditEval to test their models. What existing models' data do they use in common?",
    "answer_format": "Your answer should be a python list, each elemet is a string , which refers to a model name.",
    "tags": [
        "multiple",
        "objective",
        "text",
        "table"
    ],
    "anchor_pdf": [
        "cabc7bed-6a8b-5030-a199-716eac881799",
        "f9c34aba-31a0-5b67-83a6-3cde37f2aecb"
    ],
    "reference_pdf": [],
    "conference": [],
    "reasoning_steps": [
        "First, find Model Performance on EditEval in the two papers.",
        "Then, compare the modal they used and get the answer."
    ],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold":[
                "T0++",
                "T0",
                "PEER-3",
                "PEER-11",
                "Tk",
                "PaLM 2"
            ],
            "ignore_order": true,
            "lowercase": true
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}