{
    "uuid": "3bec1f83-7dfa-5650-81b5-f70d0aaf5232",
    "question": "Among AlpacaEval, MT-Bench and MMLU, which ones collect open-ended questions accross different domains without providing concrete reference answers?",
    "answer_format": "Your answer should be a python list of 1-3 strings, and the strings should be AlpacaEval, MT-Bench or MMLU.",
    "tags": [
        "multiple",
        "text",
        "objective"
    ],
    "anchor_pdf": [
        "9156c181-6be2-5e8a-ba2e-4658dce594e7",
        "9156c181-6be2-5e8a-ba2e-4658dce594e7",
        "c2c5bf1a-3d4a-508e-a217-b3e4b78ce7f7"
    ],
    "reference_pdf": [],
    "conference": [],
    "reasoning_steps": [
        "Find the paper about AlpacaEval, MT-Bench and MMLU.",
        "Read the paper to find the answer."
    ],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold": [
                "AlpacaEval",
                "MT-Bench"
            ],
            "ignore_order": true
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}