{
    "uuid": "c4048cbf-71e6-55ec-a0e9-ba082c5a2954",
    "question": "In the PPTC benchmark paper, among the works that focus on LLMs' tool-use ability to generate APIs for solving user instructions, which one doesn't apply AST accuracy?",
    "answer_format": "Your answer should be a string, the name of the method or model.",
    "tags": [
        "multiple",
        "text",
        "objective"
    ],
    "anchor_pdf": [
        "3f195c86-04e5-5c9d-826b-63672b5ff9a3"
    ],
    "reference_pdf": [
        "4261dbce-3665-5261-9125-09c96905ca64",
        "1dffea3e-12d5-5a96-82db-480f1579040e",
        "8967d40f-af4b-5754-848a-0d84d923e39d"
    ],
    "conference": [],
    "reasoning_steps": [
        "Locate the three works.",
        "Read the sections that mention evaluation.",
        "Identify the work that doesn't apply AST accuracy."
    ],
    "evaluator": {
        "eval_func": "eval_string_exact_match",
        "eval_kwargs": {
            "gold": "Toolformer",
            "lowercase": true
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}