{
    "uuid": "a803c5e9-ad61-5580-8819-66875022e19b",
    "question": "In order to improve Parrot's abilities, which method proposed in the paper \"Direct Preference Optimization: Your Language Model is Secretly a Reward Model\" is used to train the model? Which method is compared with it under distribution shifts?",
    "answer_format": "Your answer should be a Python list of two strings, answering the two questions respectively. You must use abbreviations as given in the papers.",
    "tags": [
        "multiple",
        "text",
        "objective"
    ],
    "anchor_pdf": [
        "a8508753-17fc-5f40-8abf-245ecbfe151e",
        "d35109d5-9f0a-5d99-ae90-dcaabf4bb74e"
    ],
    "reference_pdf": [],
    "conference": [],
    "reasoning_steps": [
        "First of all, locate the paper related to Parrot.",
        "Secondly, read the paper\"Direct Preference Optimization: Your Language Model is Secretly a Reward Model\" and find the methods used to train Parrot.",
        "Finally, find the two methods in the paper."
    ],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold":[
                "DPO",
                "PPO"
            ],
            "lowercase": false,
            "ignore_order": false
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}