{
    "uuid": "9945247a-acf3-5768-9e8c-3015d272434d",
    "question": "According to the RAV paper, up to 2019, which method performs the best overall on evidence retrieval? Additionally, what's that method's FEVER score with 1 sentence selected for the subtask of recognizing textual entailment?",
    "answer_format": "Your answer should be a Python list of 2 elements, the first is a string, the name of the method, and the second is a float, rounding to 2 decimal places.",
    "tags": [
        "multiple",
        "text",
        "table",
        "objective"
    ],
    "anchor_pdf": [
        "5a971f9d-71f9-5381-9877-05e68e18ad80"
    ],
    "reference_pdf": [
        "366e4b37-75a5-5baf-ad8f-634309a1a35e",
        "c50df058-1617-58f1-9b89-c397fcdceb6f",
        "d3cfe89d-e84d-51b6-bd10-33a106a8e12b"
    ],
    "conference": [],
    "reasoning_steps": [],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold": ["ESIM", 63.64],
            "ignore_order": false,
            "lowercase": true,
            "ndigits": 2
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}