{
    "uuid": "2f7da671-2337-5c7b-9a25-35c1b996fe80",
    "question": "In Figure 1 of the paper \"When Benchmarks are Targets: Revealing the Sensitivity of Large Language Model Leaderboards\", which model has the largest difference in ranking between Fixed Answer and Cloze Prompt? For the dataset that contains the original question, what's the estimated expert-level accuracy?",
    "answer_format": "Your answer should be a Python list of two strings, the first is the name of the model, as proposed in the figure, the second is the estimated expert-level accuracy, rounding to 1 decimal place, like \"12.3%\".",
    "tags": [
        "multiple",
        "text",
        "image",
        "objective"
    ],
    "anchor_pdf": [
        "5fe57755-14f1-5ee7-a4b4-ecaba6827045"
    ],
    "reference_pdf": [
        "c2c5bf1a-3d4a-508e-a217-b3e4b78ce7f7",
        "be088b19-03fb-584b-a62d-2ab4b5d7fdd8"
    ],
    "conference": [],
    "reasoning_steps": [
        "Watch figure 1 to answer the first question",
        "Read the caption of the figure or the other section to find the dataset.",
        "Read the corresponding paper to find the estimated expert-level accuracy."
    ],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold": ["Llama2-7b-chat", "89.8%"],
            "ignore_order": false,
            "lowercase": true
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}