{
    "uuid": "25c34c03-3d73-51df-bb4a-ba58f03bab41",
    "question": "On which benchmark does the author evaluate SnapKV against multiple baseline models? Additionally, what is the number of data for each type of task in the benchmark?",
    "answer_format": "Your answer should be a Python list containing two elements: the first element should be a string representing the benchmark name, and the second element should be a list of integers indicating the number of data points for each type of task in the benchmark. For example: [\"GLUE\", [100, 200, 300]].",
    "tags": [
        "multiple",
        "text",
        "image",
        "objective"
    ],
    "anchor_pdf": [
        "7359e988-66ba-5081-9831-29196c891581"
    ],
    "reference_pdf": [
        "d6b892b8-cf43-5b62-bde9-48c070c2e5dc"
    ],
    "conference": [],
    "reasoning_steps": [
        "Identify the benchmark used in the paper and examine the baseline models evaluated for each benchmark. Determine which benchmark includes multiple baseline models.",
        "Locate the reference benchmark paper.",
        "Determine the number of data for each type of task within the benchmark."    
    ],
    "evaluator": {
        "eval_func": "eval_conjunction",
        "eval_kwargs": {
            "eval_func_list": [
                "eval_string_exact_match",
                "eval_structured_object_exact_match"
            ],
            "eval_kwargs_list": [
                {
                    "gold": "LongBench",
                    "ignore_case": true
                },
                {
                    "gold": [750, 800, 600, 800, 600, 1000],
                    "ignore_order": true
                }
            ]
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}