{
    "uuid": "e87fa3e0-7d2f-5909-8e01-5c2d8de2e64c",
    "question": "Which dataset did not get improved performance after applying the proposed RECOST method to Alpaca-gpt4, compared to the Random baseline? Tell me this worst-performing dataset. And what's the remaining performance gap for our best-performing RECOST method compared to the reported human upper bound on the testset for that dataset?",
    "answer_format": "Your answer should be a Python list of two elements, where the first element is the name of the dataset, and the second element is a float number, calculated by subtracting the performance of the best-performing RECOST method from the reported human upper bound performance for that dataset.",
    "tags": [
        "metadata",
        "multiple",
        "objective",
        "table",
        "text"
    ],
    "conference": [],
    "reasoning_steps": [
        "Firstly, find the main experiment section in the paper RECOST.",
        "According to the table content, find the worst-performing dataset name after applying the proposed RECOST method to Alpaca-gpt4.",
        "Search for the relevant paper for the dataset in the reference list.",
        "In the linked paper, find the experiment section, and locate the row or column which reports the human upper bound performance on the testset for that dataset.",
        "Finally, calculate the remaining performance gap by subtracting the performance of the best-performing RECOST method from the reported human upper bound performance for that dataset.",
        "Report the combination of the dataset name and the performance gap as a Python list."
    ],
    "evaluator": {
        "eval_func": "eval_conjunction",
        "eval_kwargs": {
            "eval_func_list": [
                "eval_string_exact_match",
                "eval_float_exact_match"
            ],
            "eval_kwargs_list": [
                {
                    "gold": "Hellaswag",
                    "lowercase": true
                },
                {
                    "gold": 14.97,
                    "ndigits": 2,
                    "tolerance": 0.0001
                }
            ]
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human",
    "anchor_pdf": [
        "0e1f319d-ad46-5528-9559-9208708536e9"
    ],
    "reference_pdf": [
        "7d4754c9-e8ac-51de-aa10-0bb4df7c4ff0"
    ]
}