{
    "uuid": "6aefdbec-8411-50e8-a9f3-b26afe188083",
    "question": "In the paper that proposes the only comparable interactive theorem prover applied as a baseline by AIPS, where are the evaluation samples chosen from?",
    "answer_format": "Your answer should be the a raw text from the papers.",
    "tags": [
        "multiple",
        "text",
        "table",
        "subjective"
    ],
    "anchor_pdf": [
        "0e91df50-6832-5615-baf8-af56e93ea272"
    ],
    "reference_pdf": [
        "f1c80ac8-4588-586b-bf00-1151edd91acd",
        "67f92ccd-ca66-5cd5-b6ac-3852a53255e2"
    ],
    "conference": [],
    "reasoning_steps": [],
    "evaluator": {
        "eval_func": "eval_reference_answer_with_llm",
        "eval_kwargs": {
            "reference_answer": "We perform experiments on theorems from \"Mathematics in Lean\" : a book for beginners to formalize and prove mathematical theorems in Lean. It has 233 theorem proving exercises, covering topics from sets and functions to topology, calculus, and measure theory. For evaluation, we randomly selected 50 theorems, and their proofs have 5.52 tactics on average.",
            "question":"In the paper that proposes LeanCopilot, where are the evaluation samples chosen from?"
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}