{
    "uuid": "f987547b-e418-5424-8f8b-f8855bdf63cc",
    "question": "Which two datasets are combined to form the dataset that Alchemist used to evaluate the image modality?",
    "answer_format": "Your answer should be a Python list of two strings, the abbreviations of the datasets as given in the paper.",
    "tags": [
        "multiple",
        "text",
        "objective"
    ],
    "anchor_pdf": [
        "0e3f6c92-099b-5343-a589-7095452ddf16"
    ],
    "reference_pdf": [
        "b085c7de-cb5b-5a8d-a0a2-6e617182ff63"
    ],
    "conference": [],
    "reasoning_steps": [],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold": ["CUB", "Places"],
            "ignore_order": true,
            "lowercase": true
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}