{
    "uuid": "921a63d8-1cb7-5162-bc06-b9546498e519",
    "question": "Among StudentEval, HumanEval and MBPP, which one has the most test cases per problem?",
    "answer_format": "Your answer should be a string, indicating the name of the dataset.",
    "tags": [
        "multiple",
        "text",
        "objective"
    ],
    "anchor_pdf": [
        "4b53feaf-4e33-590c-a8bb-9c8c7005bf6b",
        "a70723f6-7139-5165-a9c7-9dcdd34e3514",
        "0e57b18a-c261-582f-8527-2337f0aeda90"
    ],
    "reference_pdf": [],
    "conference": [],
    "reasoning_steps": [
        "Read the description of the datasets to identify the number of test cases per problem."
    ],
    "evaluator": {
        "eval_func": "eval_string_exact_match",
        "eval_kwargs": {
            "gold": "HumanEval",
            "lowercase": true
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}