{
    "uuid": "546b830f-aca5-56e1-8ebc-cffda2bd6ad6",
    "question": "Apart from WKM, which method performs best on WebShop? Do the papers of WKM and that method use the same evaluation datasets?",
    "answer_format": "Your answer should be a Python list of two strings: the first is the abbreviation of the method, and the second is either `true` or `false`.",
    "tags": [
        "multiple",
        "text",
        "table",
        "objective"
    ],
    "anchor_pdf": [
        "0d7108da-de5c-5e4c-9865-7c4141672767"
    ],
    "reference_pdf": [
        "722fbe98-4e3d-5d07-aea6-e4261418a8c8"
    ],
    "conference": [],
    "reasoning_steps": [
        "Find the table that compares different methods and models.",
        "Identify the second best method on WebShop.",
        "Read the corresponding papers to find the evaluation datasets."
    ],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold": ["ETO", "true"],
            "ignore_order": false,
            "lowercase": true
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}