{
    "uuid": "6ec30967-a7aa-5ecb-8819-15e738ad4b50",
    "question": "In the two papers that updated ToMi according to the SimTom paper, what are the other datasets used to evaluate the models, besides ToMi?",
    "answer_format": "Your answer should be a Python list of strings, containing the names of the datasets.",
    "tags": [
        "multiple",
        "text",
        "objective"
    ],
    "anchor_pdf": [
        "2f861e4d-1888-5355-a9c2-26411f14efe4"
    ],
    "reference_pdf": [
        "56bb5074-0a00-578b-ad44-e24096458b1e",
        "1ff930b7-3ecb-50ee-bfb4-777d7d8636ad"
    ],
    "conference": [],
    "reasoning_steps": [
        "Read the anchor PDF to find the two papers that updated ToMi.",
        "Locate the section that describes the datasets used to evaluate the models in the papers.",
        "Identify the datasets besides ToMi that are used to evaluate the models."
    ],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold": ["ToM", "SocialIQA"],
            "lowercase": true,
            "ignore_order": true
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}