{
    "uuid": "c1027cf8-184a-5c77-8c53-6247abe0160d",
    "question": "On which model does the paper(titled \"Making Large Language Models Better Reasoners with Orchestrated Streaming Experiences\") conduct the most analysis experiments?Is there any other size of parameters for this model? ",
    "answer_format": "Your answer should be a single python list formatted like [\"model_name\", [\"10B\",\"20B\",...]].The first element of the list is a string representing the name of the model, the second element of the list is a list representing other size of params(Note that you shouldnot include the size already employed in the paper).",
    "tags": [
        "multiple",
        "text",
        "table",
        "objective"
    ],
    "anchor_pdf": [
        "f47e106f-c4ef-5814-85fe-a895c754fe40"
    ],
    "reference_pdf": [
        "6b887e82-ca3f-59e1-ae8a-f528919c1334"
    ],
    "conference": [],
    "reasoning_steps": [],
    "evaluator": {
        "eval_func": "eval_conjunction",
        "eval_kwargs": {
            "eval_func_list": [
                "eval_string_exact_match",
                "eval_structured_object_exact_match"
            ],
            "eval_kwargs_list": [
                {
                    "gold": "LLaMA2-13B-Chat",
                    "lowercase": true
                },
                {
                    "gold": [
                        "7B",
                        "34B",
                        "70B"
                    ],
                    "ignore_order": true,
                    "lowercase": true
                }
            ]
        }
    },
    "state": {
        "gui-gpt-4o-2024-11-20": false
    },
    "annotator": "human"
}