{
    "uuid": "c63f7ad7-00de-56f0-8b74-2fdf420ceaa2",
    "question": "How many models are evaluated in WebArena? What's the success rate of the most powerful model?",
    "answer_format": "Your answer should be a Python list of 2 elements. The first one is an integer. The second one is a float.",
    "tags": ["single", "text", "table", "objective"],
    "anchor_pdf": ["5a2b0d5c-6b51-5bbd-a001-a15f19f65a98"],
    "reference_pdf": [],
    "conference": [],
    "reasoning_steps": [
		"Find the experiment settings section, the baseline introduction, or the experiment table",
		"Count the number of evaluated models",
		"Determine which model is the most powerful one",
		"Extract the performance of the most powerful model"
	],
    "evaluator": {
        "eval_func": "eval_conjunction",
        "eval_kwargs": {
			"eval_func_list": [
				"eval_int_exact_match",
				"eval_float_exact_match"
			],
			"eval_kwargs_list": [
				{"gold": 3},
				{"gold": 14.41}
			]
		}
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}
