{
    "uuid": "d575f608-c3fc-5a6a-97ea-01443d949f57",
    "question": "How many more examples each model are used in the experiment of \"LLM Evaluators Recognize and Favor Their Own  Generations\" than in \"Benchmarking Cognitive Biases in Large Language Models as Evaluators\"?",
    "answer_format": "Your answer should be a integer.",
    "tags": [
        "multiple",
        "text",
        "objective"
    ],
    "anchor_pdf": [
        "3d218d94-1aa0-5a70-b23e-accb254141bd",
        "2f1c8d90-3428-52b0-b7ec-da132f9178e6"
    ],
    "reference_pdf": [],
    "conference": [],
    "reasoning_steps": [],
    "evaluator": {
        "eval_func": "eval_int_exact_match",
        "eval_kwargs": {
            "gold": 1950
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}