{
    "uuid": "66c5fd15-e82b-5a02-bce6-bb0aab05184f",
    "question": "Among the datasets of the benchmark that collects CHIP-CDN, what are the evaluation metrics they applied?",
    "answer_format": "Your answer should be a Python list of strings, containing the names of the evaluation metrics.",
    "tags": [
        "multiple",
        "text",
        "table",
        "objective"
    ],
    "anchor_pdf": [
        "3eb8365d-eb44-5413-adeb-7380c9824e3d"
    ],
    "reference_pdf": [
        "09811e9e-5c35-5695-a106-df02aaff357c"
    ],
    "conference": [],
    "reasoning_steps": [
        "Read the anchor PDF to identify the benchmark that collects CHIP-CDN.",
        "Read the PDF that introduces the benchmark to identify the evaluation metrics applied."
    ],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold": [
                "Micro F1",
                "Macro F1",
                "Accuracy"
            ],
            "ignore_order": true,
            "ignore_blank": true,
            "lowercase": true
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}