{
    "uuid": "cd235027-4032-5403-964a-b2c7e7550966",
    "question": "How much percent does VerifiNER improve the F1 score of the three baseline models on average on GENIA?",
    "answer_format": "Your answer should be a Python float rounded to two decimal places WITHOUT ANY PUNCTUATION OR EXPLANATION. e.g. 21.30",
    "tags": [
        "objective",
        "single",
        "table"
    ],
    "anchor_pdf": [
        "220f0021-1bf8-599f-ab3d-5b46d56cb03e"
    ],
    "reference_pdf": [],
    "conference": [],
    "reasoning_steps": [
        "Firstly, locate the result of VerifiNER on GENIA.",
        "Fetch the improvement on three baseline models respectively.",
        "Finally, calculate the average improvement and round it to two decimal places."
    ],
    "evaluator": {
        "eval_func": "eval_float_exact_match",
        "eval_kwargs": {
            "gold": 7.05,
            "ndigits": 2,
            "tolerance": 1e-06
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}