{
    "uuid": "d170af87-1580-52a1-b6d1-814f2ddbfac4",
    "question": "What's the evaluation baseline used in the paper titled \"Generative Adversarial Training with Perturbed Token Detection for Model Robustness\"? What's the contributions that makes this baseline different from existing adversarial datasets?",
    "answer_format": "Your answer should be a python list of two strings, the first element is the baseline name(the abbrievation format is enough), and the second element is the contributions.",
    "tags": [
        "text",
        "multiple",
        "subjective"
    ],
    "conference": [],
    "reasoning_steps": [
        "First, locate the section about evaluation in the anchor paper.",
        "Second, find the evaluation baseline used in the anchor paper and its source paper.",
        "Finally, turn to the source paper to find the contributions that makes this baseline different from existing adversarial datasets."
    ],
    "evaluator": {
        "eval_func": "eval_conjunction",
        "eval_kwargs": {
            "eval_func_list": [
                "eval_string_exact_match",
                "eval_partial_scoring_points_with_llm"
            ],
            "eval_kwargs_list": [
                {
                    "gold": ["AdvGLUE"],
                    "lowercase": true
                },
                {
                    "scoring_points":["Comprehensive Coverage:AdvGLUE is able to cover as many adversarial linguistic phenomena as possible.","Systematic Annotations: this is the first work that performs systematic and comprehensive evaluation and annotation over 14 different textual adversarial examples. ","General Compatibility:  AdvGLUE covers the widely-used GLUE tasks and creates an adversarial version of the GLUE benchmark to evaluate the robustness of language models","High Transferability and Effectiveness: AdvGLUE has high adversarial transferability and can effectively attack a wide range of state-of-the-art models."],
                    "question": "What's the contributions that makes this baseline different from existing adversarial datasets?",
                    "count": 3
                }
            ]
        }
    },
    "state": {
        "gpt-4o-2024-05-13": false
    },
    "annotator": "human",
    "anchor_pdf": [
        "8c5db97f-f499-5641-8d74-d0d64d980f53"
    ],
    "reference_pdf": ["32a1dee2-310a-5ead-8d2f-c957cc59e3dc"]
}