{
    "uuid": "170deef3-1b76-54ee-a27b-c1fe6bad1061",
    "question": "Which metric is used to evaluate emergent abilities on the BIG-Bench datasets, as indicated in the anchor PDF? And why is this metric not favorable according to this paper?",
    "answer_format": "Your answer should be a Python list of two elements, where the first element is the name of the metric, and the second is the reason why this metric is not favorable according to this paper.",
    "tags": [
        "multiple",
        "objective",
        "text"
    ],
    "anchor_pdf": ["a424d02c-da19-5e1d-94e8-0b47cf2ded9c"],
    "reference_pdf": ["c302a979-c9a6-509a-a555-5fc9e5bb7bf8"],
    "conference": [],
    "reasoning_steps": [
        "In the anchor pdf, find the section that discusses the evaluation of emergent abilities on BIG-Bench datasets.",
        "Identify the specific metric used for this evaluation.",
        "In the reference pdf, locate the section that discusses the evaluation of emergent abilities on BIG-Bench datasets.",
        "Identify the specific metric used for this evaluation.",
        "Compare the two metrics and determine why the chosen metric is not favorable according to the reference paper."
    ],

    "evaluator": {
        "eval_func": "eval_conjunction",
        "eval_kwargs": {
            "eval_func_list": [
                "eval_string_exact_match",
                "eval_reference_answer_with_llm"
            ],
            "eval_kwargs_list": [
                {
                    "gold": "ppl",
                    "lowercase": true
                },
                {
                    "question": "Why is PPL not favorable according to this paper?",
                    "reference_answer": "Because the PPL of correct options and incorrect options may decrease simultaneously."
                }
            ]
        }

    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}