{
    "uuid": "b8034b03-c46a-5b8c-8bdd-09e67ad45f9f",
    "question": "What is the composition of the training dataset in the paper?",
    "answer_format": "Your answer should be a python strings about the dataset.",
    "tags": [
        "multiple",
        "subjective",
        "text"
    ],
    "anchor_pdf": [
        "9c7c7762-0132-583c-987a-0fbc89847c55"
    ],
    "reference_pdf": [
        "8c267034-d2a4-53d9-a4e0-0fdc761cde75"
    ],
    "conference": [],
    "reasoning_steps": [
        "Retrieve the name of the training dataset, which is mentioned in the experiment section of the paper.",
        "Locate the related pdf in the reference PDFs.",
        "Find the description of the training dataset in the reference PDF."
    ],
    "evaluator": {
        "eval_func": "eval_reference_answer_with_llm",
        "eval_kwargs": {
            "reference_answer": "WebVid-2M consists of 2.5M video-text pairs. The data was scraped from the web following a similar procedure to Google Conceptual Caption. The dataset consists of manually generated captions, that are for the most part well formed sentences. And the captions are aligned with the video and describe visual content.",
            "question": "What is the composition of the training dataset in the paper?"
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}