{
    "uuid": "fea63b48-8759-5b18-93d3-748ab9953c6c",
    "question": "The anchor PDF used two benchmark datasets for evaluation. Overall, on which dataset did the methods perform better?",
    "answer_format": "Your answer should be a python strings about the name of the dataset. YOU MUST USE THE EXACT NAME FROM THE PAPER.",
    "tags": [
        "single",
        "table",
        "objective"
    ],
    "anchor_pdf": [
        "08b7c0d2-b64c-5fb1-8b0c-8326bb3d220b"
    ],
    "reference_pdf": [],
    "conference": [],
    "reasoning_steps": [
        "Find the two benchmark datasets used for evaluation in the anchor PDF, which are mentioned in the experiments section.",
        "Find all the performances of the methods on the two datasets, which are shown as the form of tables.",
        "Compare the overall performance of the methods on the two datasets and choose the one with better values of the metrics."
    ],
    "evaluator": {
        "eval_func": "eval_string_exact_match",
        "eval_kwargs": {
            "gold": "TVSum",
            "lowercase": true
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}