{
    "uuid": "31c0e826-57b0-5445-a16d-0e3d4adc46ab",
    "question": "Of the following three combinations, which reaches the highest pass@1 accuracy on HumanEval and what's the exact accuracy value: Codestral+MGDebugger, Reflexion+LDB(GPT-4), MetaGPT.",
    "answer_format": "Your answer should be a Python List of 2 elements, the first is the combination and the second is the exact accuracy value, rounded to one decimal places. Note that you should use the same names as in the question.",
    "tags": [
        "multiple",
        "text",
        "table",
        "image",
        "objective"
    ],
    "anchor_pdf": [],
    "reference_pdf": [
        "bafa4ba3-f7e9-5bf2-960d-cb11f11ec138",
        "460c65d7-a298-5bd3-baa2-dd8683885308",
        "80a14542-96a0-5a15-a501-959b9007a8b6"
    ],
    "conference": [],
    "reasoning_steps": [
        "Read the tables, images to identify the pass@1 accuracy of the three combinations.",
        "Notice the specific conditions for different combinations."
    ],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold": ["Reflexion+LDB(GPT-4)", 96.9],
            "lowercase": true,
            "ignore_order": false,
            "ndigits": 1
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}