{
    "uuid": "21dffb03-dede-5282-93f4-b0bb6a16d10b",
    "question": "Are there any works that have experimented on the BIRD benchmark and were published as long papers in the main conference of ACL 2024? Tell me the paper titles and the reported text-to-SQL execution accuracy on BIRD for each paper.",
    "answer_format": "Your answer should be a Python list of tuples (List[Tuple[str, float]]). For each tuple in the list, the first element is the raw paper title and the second element is a Python float representing the execution accuracy rounded to 1 decimal place (e.g., 30.1). If the concrete end-to-end execution accuracy is unknown or not reported, please use 0.0 as the accuracy.",
    "tags": [
        "comprehensive",
        "text",
        "table",
        "objective"
    ],
    "anchor_pdf": [],
    "reference_pdf": [],
    "conference": [
        "acl2024"
    ],
    "reasoning_steps": [],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold": [
                [
                    "Is Table Retrieval a Solved Problem? Exploring Join-Aware Multi-Table Retrieval",
                    0.0
                ],
                [
                    "Integrate the Essence and Eliminate the Dross: Fine-Grained Self-Consistency for Free-Form Language Generation",
                    51.4
                ],
                [
                    "When is Tree Search Useful for LLM Planning? It Depends on the Discriminator",
                    0.0
                ],
                [
                    "Synthesizing Text-to-SQL Data from Weak and Strong LLMs",
                    63.4
                ]
            ],
            "ignore_order": true,
            "ndigits": 1,
            "threshold": 95,
            "fuzz_method": "ratio"
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}