{
    "uuid": "8f53d4bf-6a6f-59b8-90ed-c6b555240a59",
    "question": "Among the datasets proposed in the Introduction section of the paper \"Towards Robust Temporal Reasoning of Large Language Models via a Multi-Hop QA Dataset and Pseudo-Instruction Tuning\", which one has the least Q-A pairs?",
    "answer_format": "Your answer should be a single string, the name of the dataset as given in the Introduction section.",
    "tags": [
        "multiple",
        "text",
        "table",
        "objective"
    ],
    "anchor_pdf": [
        "2bf8095c-81d0-5988-858b-a5155e0cc985"
    ],
    "reference_pdf": [
        "b93e2dfe-7f58-5d96-8c64-39930d5c22ea",
        "e73a34ee-536b-5ea4-8de4-b2fb39b30042",
        "3e409d3a-1045-575f-b4ad-f4923916080a",
        "4d93d596-b0bd-54c6-bd9e-041037077bc7",
        "71cec673-84eb-579b-9419-2032699ac0e7"
    ],
    "conference": [],
    "reasoning_steps": [
        "Locate the Introduction section of the anchor PDF.",
        "Find the list of datasets proposed in the section.",
        "Read the PDF of each dataset, with special attention to tables.",
        "Identify the dataset with the least Q-A pairs."
    ],
    "evaluator": {
        "eval_func": "eval_string_exact_match",
        "eval_kwargs": {
            "gold": "SituatedQA",
            "lowercase": true
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}