{
    "uuid": "198666fc-a067-52c2-b80f-fb804bc80034",
    "question": "In the all-language finetuning setting, what is the ranking, from highest to lowest, of the models compared in the experiment by their average performance across all languages for which every model reports a value?",
    "answer_format": "Your answer should be a Python list in which each element is a model name string, e.g., [\"model_name 1\", \"model_name 2\", ...].",
    "tags": [
        "objective",
        "single",
        "table"
    ],
    "conference": [],
    "reasoning_steps": [
        "First, find all section titles in the paper to locate the experiment section.",
        "Then, identify the specific requirements concerning all-language finetuning.",
        "Next, find the relevant table, figure, or text that contains the average performance of the models across all languages.",
        "Finally, calculate or retrieve the exact values to determine the ranking of the models."
    ],
    "evaluator": {
        "eval_func": "eval_structured_object_exact_match",
        "eval_kwargs": {
            "gold": [
                "mCLIP",
                "mCLIP+",
                "UC2",
                "M3P"
            ]
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human",
    "anchor_pdf": [
        "ff651d37-e725-5752-9c38-3361bc54723d"
    ],
    "reference_pdf": []
}