{
    "uuid": "b50d066a-9ed9-5aac-b79c-a32e3bef9734",
    "question": "Which dataset performs better on the LLaMA model, PRM800K or Math-Shepherd? In the source paper of PRM800K, which methods are compared with PRM?",
    "answer_format": "Your answer should be a python list of two items. The first item is a python string. The second item is a python list of strings.",
    "tags": ["multiple","objective","text","image"],
    "anchor_pdf": ["85b588f5-13e2-5aaa-9ce8-76c52426b40e","80b0a0f4-7247-5b9e-8782-0a4dd4a2ae4b"],
    "reference_pdf": [],
    "conference": [],
    "reasoning_steps": [
        "First, locate Section 5.1 in the original paper of Math-Shepherd to find the comparison between PRM800K and Math-Shepherd.",
        "Next, turn to the source paper of PRM800K.",
        "Locate Figure 3 and identify the various methods compared in the figure."
    ],
    "evaluator": {
        "eval_func": "eval_conjunction",
        "eval_kwargs": {
            "eval_func_list": [
                "eval_string_exact_match",
                "eval_element_list_included"
            ],
            "eval_kwargs_list": [
                {
                    "gold": "Math-Shepherd"
                },
                {
                    "gold": ["ORM", "Majority Voting"],
                    "ignore_order": true
                }
            ]

        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}