{
    "uuid": "541435a6-878e-540f-8b6a-86bf7920dc82",
    "question": "What is the main design of Auto-GUI framework from the aspects of the encoder, interaction, and decoder?",
    "answer_format": "Your answer should be a Python list of text strings, with each element being one core stage of this framework, you\"d better use the origin text, e.g., [\"stage 1\", \"stage 2\", ...].",
    "tags": [
        "single",
        "subjective",
        "text"
    ],
    "conference": [],
    "reasoning_steps": [
        "Usually, the framework of a model is mentioned in the methodology or related work section. Search the corresponding paragraphs of these two parts.",
        "If the core stages are organized as evident bullet points, directly use them as the answer.",
        "Otherwise, try to summarize the core stages from the main text."
    ],
    "evaluator": {
        "eval_func": "eval_scoring_points_with_llm",
        "eval_kwargs": {
            "scoring_points": [
                "Encoding: Acquire encoded features from both vision and language inputs. The vision input is encoded by a frozen vision encoder, the language input is encoded by a language encoder.",
                "Interaction: The encoded vision and language representations are integrated by a single-head self-attention network and a gated fusion",
                "Decoding: The fused representation is fed to the decoder to generate a chain of future action plans. The target predictions consist of a chain of future action plans and the current action prediction separated by specific prompts"
            ],
            "question": "What is the main design of Auto-GUI framework?"
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human",
    "anchor_pdf": [
        "648e3d50-375b-5189-b6b0-e0520626716e"
    ],
    "reference_pdf": []
}