{
    "uuid": "56f3ff15-de1c-5769-ac00-6218e9d9a0a6",
    "question": "Among the previous methods applied in the FunCoder's experiments on open-source models, which one was proposed later? Additionally, which datasets was applied in the evaluation of that method, but not in FunCoder?",
    "answer_format": "Your answer should be a Python list of 2 elements, the first is the name of the method, the second is a python list of strings, the name of the datasets.",
    "tags": [
        "multiple",
        "text",
        "table",
        "objective"
    ],
    "anchor_pdf": [
        "0e060d12-30d9-5e34-b7e3-4874dd94be7b"
    ],
    "reference_pdf": [
        "98350979-1991-571f-bd8a-5f8624b832f3",
        "813a0918-58f6-57fa-aa79-e3065a5ff88a",
        "7b9d2080-37d8-593b-93e9-abfcb2aead4a",
        "bafa4ba3-f7e9-5bf2-960d-cb11f11ec138",
        "460c65d7-a298-5bd3-baa2-dd8683885308"
    ],
    "conference": [],
    "reasoning_steps": [
        "Find the table that compares different methods, notice the limitations.",
        "Identify the method.",
        "Read the corresponding paper to find the evaluation datasets.",
        "Compare the datasets with the ones used in FunCoder."
    ],
    "evaluator": {
        "eval_func": "eval_conjunction",
        "eval_kwargs": {
            "eval_func_list": [
                "eval_string_exact_match",
                "eval_structured_object_exact_match"
            ],
            "eval_kwargs_list": [
                {
                    "gold": "CodeT",
                    "lowercase": true
                },
                {
                    "gold": ["APPS", "CodeContests"],
                    "lowercase": true,
                    "ignore_order": true
                }
            ]
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}