{
    "uuid": "011dd1f5-52a8-5ab6-9eb1-d8432c4e614c",
    "question": "which term is mentionned in this paper (\"WINOPRON: Revisiting English Winogender Schemas for Consistency, Coverage, and Grammatical Case\")in terms of the result that Smaller FLAN-T5 models perform at chance level? What evaluation contrasts does the source paper of this term investigate?",
    "answer_format": "Your answer should be a single python list like [\"string1\", \"string2\"]. The first string should be the name of the term. The second string should be about the evaluation contrasts.",
    "tags": [
        "multiple",
        "text",
        "subjective"
    ],
    "anchor_pdf": [
        "52543c4f-6202-589d-b564-cf3421e3ce75"
    ],
    "reference_pdf": [
        "45d52c43-65df-5608-a6b6-75ea7beb27db"
    ],
    "conference": [],
    "reasoning_steps": [],
    "evaluator": {
        "eval_func": "eval_conjunction",
        "eval_kwargs": {
            "eval_func_list": [
                "eval_string_exact_match",
                "eval_reference_answer_with_llm"
            ],
            "eval_kwargs_list": [
                {
                    "gold": "demand gap",
                    "lowercase": true
                },
                {
                    "reference_answer": " production vs. forced choice, and metalinguistic judgment vs. probability measurement",
                    "question": "What evaluation contrasts does the source paper of this term investigate?"
                }
            ]
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}