{
  "metadata": {
    "experiment_type": "natural_length",
    "model": "qwen2.5-7b",
    "dataset": "mixed",
    "task_type": "reading_comprehension",
    "num_samples": 1000,
    "timestamp": "2025-12-24T20:52:49.987692"
  },
  "results": [
    {
      "sample_id": "narrativeqa_narrativeqa_18672",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 109051,
      "natural_ratio": 0.8319931030273438,
      "max_context_tokens": 131072,
      "context_length": 217986,
      "question_length": 77,
      "prediction": "Nicky \"Fingers\" Bonnano",
      "reference": "Nickey Fingers",
      "metrics": {
        "f1": 0.846153846153846
      },
      "elapsed_time": 3.086653232574463,
      "timestamp": "2025-12-24T20:36:03.404320"
    },
    {
      "sample_id": "squad_572bc0c3111d821400f38f70",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 669,
      "natural_ratio": 0.00510406494140625,
      "max_context_tokens": 131072,
      "context_length": 1235,
      "question_length": 64,
      "prediction": "哲学教育可以是关于教育过程的哲学。",
      "reference": "philosophy of the process of education or the philosophy of the discipline of education",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.13576340675354004,
      "timestamp": "2025-12-24T20:36:03.540649"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24993",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 81153,
      "natural_ratio": 0.6191482543945312,
      "max_context_tokens": 131072,
      "context_length": 162192,
      "question_length": 75,
      "prediction": "三艘船，三百九十个男人。",
      "reference": "17 ships and 1500 men",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.5014662742614746,
      "timestamp": "2025-12-24T20:36:07.042438"
    },
    {
      "sample_id": "squad_57270b64708984140094d905",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 505,
      "natural_ratio": 0.00385284423828125,
      "max_context_tokens": 131072,
      "context_length": 881,
      "question_length": 89,
      "prediction": "营养 literacy",
      "reference": "nutrition literacy",
      "metrics": {
        "f1": 0.7826086956521738
      },
      "elapsed_time": 0.05119800567626953,
      "timestamp": "2025-12-24T20:36:07.093813"
    },
    {
      "sample_id": "squad_572b3656be1ee31400cb82a5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 394,
      "natural_ratio": 0.0030059814453125,
      "max_context_tokens": 131072,
      "context_length": 704,
      "question_length": 44,
      "prediction": "Hume said that people's ideas come from their impressions.",
      "reference": "their \"impressions\"",
      "metrics": {
        "f1": 0.7096774193548387
      },
      "elapsed_time": 0.1456623077392578,
      "timestamp": "2025-12-24T20:36:07.239618"
    },
    {
      "sample_id": "squad_5728d223ff5b5019007da750",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 346,
      "natural_ratio": 0.0026397705078125,
      "max_context_tokens": 131072,
      "context_length": 587,
      "question_length": 65,
      "prediction": "12",
      "reference": "12",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04441189765930176,
      "timestamp": "2025-12-24T20:36:07.284157"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8685",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 99028,
      "natural_ratio": 0.755523681640625,
      "max_context_tokens": 131072,
      "context_length": 197983,
      "question_length": 34,
      "prediction": "在勃艮第公爵的营地，伊莎贝尔女王出现时，乔anne被指控为巫师。",
      "reference": "Reims",
      "metrics": {
        "f1": 0.05714285714285715
      },
      "elapsed_time": 3.6608099937438965,
      "timestamp": "2025-12-24T20:36:10.945338"
    },
    {
      "sample_id": "squad_56d12cc017492d1400aabb58",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 241,
      "natural_ratio": 0.00183868408203125,
      "max_context_tokens": 131072,
      "context_length": 388,
      "question_length": 55,
      "prediction": "2007年",
      "reference": "2007",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.08173632621765137,
      "timestamp": "2025-12-24T20:36:11.027325"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_27810",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96960,
      "natural_ratio": 0.73974609375,
      "max_context_tokens": 131072,
      "context_length": 193837,
      "question_length": 43,
      "prediction": "一个装有手术刀的医生包",
      "reference": "A scalpel.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.7303924560546875,
      "timestamp": "2025-12-24T20:36:15.758144"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20952",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 109513,
      "natural_ratio": 0.8355178833007812,
      "max_context_tokens": 131072,
      "context_length": 218948,
      "question_length": 38,
      "prediction": "Nina Sayers is a ballerina.",
      "reference": "She is a professional ballet dancer.",
      "metrics": {
        "f1": 0.7142857142857143
      },
      "elapsed_time": 2.78861665725708,
      "timestamp": "2025-12-24T20:36:18.547190"
    },
    {
      "sample_id": "squad_572ea4b0c246551400ce445a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 385,
      "natural_ratio": 0.00293731689453125,
      "max_context_tokens": 131072,
      "context_length": 671,
      "question_length": 59,
      "prediction": "甲烷在南极逃逸到中层大气（暖斑附近）。",
      "reference": "the stratosphere near the pole.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.19515013694763184,
      "timestamp": "2025-12-24T20:36:18.743841"
    },
    {
      "sample_id": "squad_573041b7947a6a140053d356",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 331,
      "natural_ratio": 0.00252532958984375,
      "max_context_tokens": 131072,
      "context_length": 558,
      "question_length": 65,
      "prediction": "goalkeeper",
      "reference": "goalkeeper",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04729294776916504,
      "timestamp": "2025-12-24T20:36:18.791355"
    },
    {
      "sample_id": "squad_5726da61f1498d1400e8ed3b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 340,
      "natural_ratio": 0.002593994140625,
      "max_context_tokens": 131072,
      "context_length": 567,
      "question_length": 73,
      "prediction": "Tha Feelstyle",
      "reference": "Tha Feelstyle",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06488895416259766,
      "timestamp": "2025-12-24T20:36:18.856405"
    },
    {
      "sample_id": "squad_57108ff2a58dae1900cd6a2f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 538,
      "natural_ratio": 0.0041046142578125,
      "max_context_tokens": 131072,
      "context_length": 944,
      "question_length": 93,
      "prediction": "French 和 Latin",
      "reference": "French and Latin",
      "metrics": {
        "f1": 0.9166666666666666
      },
      "elapsed_time": 0.05657649040222168,
      "timestamp": "2025-12-24T20:36:18.913131"
    },
    {
      "sample_id": "squad_56df20e5c65bf219000b3f7b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 233,
      "natural_ratio": 0.00177764892578125,
      "max_context_tokens": 131072,
      "context_length": 395,
      "question_length": 31,
      "prediction": "今天的以色列北部。",
      "reference": "Nazareth",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06500649452209473,
      "timestamp": "2025-12-24T20:36:18.978298"
    },
    {
      "sample_id": "squad_56f8cee09e9bad19000a0554",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 402,
      "natural_ratio": 0.0030670166015625,
      "max_context_tokens": 131072,
      "context_length": 701,
      "question_length": 64,
      "prediction": "prefrontal cortex",
      "reference": "prefrontal cortex",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0652625560760498,
      "timestamp": "2025-12-24T20:36:19.043735"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32717",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 77887,
      "natural_ratio": 0.5942306518554688,
      "max_context_tokens": 131072,
      "context_length": 155695,
      "question_length": 39,
      "prediction": "senior",
      "reference": "senior",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.2385010719299316,
      "timestamp": "2025-12-24T20:36:22.282615"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26567",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122894,
      "natural_ratio": 0.9376068115234375,
      "max_context_tokens": 131072,
      "context_length": 245670,
      "question_length": 78,
      "prediction": "一个巨大的、可怕的、变异的、黏液覆盖的、吸盘脸的生物头部从地板上的裂缝中升起。",
      "reference": "Volcanoes",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.710171699523926,
      "timestamp": "2025-12-24T20:36:25.993225"
    },
    {
      "sample_id": "squad_5731cbc10fdd8d15006c653b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 543,
      "natural_ratio": 0.00414276123046875,
      "max_context_tokens": 131072,
      "context_length": 1010,
      "question_length": 36,
      "prediction": "最早的 priesthoods 是 flamines（祭司），通常归功于国王努马。",
      "reference": "flamines",
      "metrics": {
        "f1": 0.326530612244898
      },
      "elapsed_time": 0.21299052238464355,
      "timestamp": "2025-12-24T20:36:26.206781"
    },
    {
      "sample_id": "squad_56d5fb031c85041400946e8e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 396,
      "natural_ratio": 0.003021240234375,
      "max_context_tokens": 131072,
      "context_length": 712,
      "question_length": 41,
      "prediction": "cats are known as obligate carnivores",
      "reference": "obligate",
      "metrics": {
        "f1": 0.35555555555555557
      },
      "elapsed_time": 0.10478878021240234,
      "timestamp": "2025-12-24T20:36:26.311704"
    },
    {
      "sample_id": "squad_572f7d20947a6a140053c9b8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 625,
      "natural_ratio": 0.00476837158203125,
      "max_context_tokens": 131072,
      "context_length": 1171,
      "question_length": 39,
      "prediction": "1991年",
      "reference": "1991",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07894563674926758,
      "timestamp": "2025-12-24T20:36:26.390785"
    },
    {
      "sample_id": "squad_5727a5812ca10214002d92c0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 353,
      "natural_ratio": 0.00269317626953125,
      "max_context_tokens": 131072,
      "context_length": 614,
      "question_length": 52,
      "prediction": "传统上，政党资金的主要来源是金钱和实物捐赠给政党或其领导成员。",
      "reference": "Money and gifts",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.21325111389160156,
      "timestamp": "2025-12-24T20:36:26.604177"
    },
    {
      "sample_id": "squad_5729117aaf94a219006aa01e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 577,
      "natural_ratio": 0.00440216064453125,
      "max_context_tokens": 131072,
      "context_length": 1090,
      "question_length": 24,
      "prediction": "William C. Boyd was a mid-20th-century anthropologist.",
      "reference": "mid-20th-century anthropologist",
      "metrics": {
        "f1": 0.7294117647058824
      },
      "elapsed_time": 0.17438697814941406,
      "timestamp": "2025-12-24T20:36:26.778696"
    },
    {
      "sample_id": "squad_5730ed42497a881900248a69",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 302,
      "natural_ratio": 0.0023040771484375,
      "max_context_tokens": 131072,
      "context_length": 540,
      "question_length": 25,
      "prediction": " ostrich",
      "reference": "ostrich",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04378056526184082,
      "timestamp": "2025-12-24T20:36:26.822614"
    },
    {
      "sample_id": "squad_57277fdcf1498d1400e8f9fa",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 326,
      "natural_ratio": 0.0024871826171875,
      "max_context_tokens": 131072,
      "context_length": 582,
      "question_length": 30,
      "prediction": "Roche moutonnée range in length from less than a meter to several hundred meters long。",
      "reference": "less than a meter to several hundred meters long",
      "metrics": {
        "f1": 0.7164179104477612
      },
      "elapsed_time": 0.2239856719970703,
      "timestamp": "2025-12-24T20:36:27.046751"
    },
    {
      "sample_id": "squad_572a65ae7a1753140016af0d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 314,
      "natural_ratio": 0.0023956298828125,
      "max_context_tokens": 131072,
      "context_length": 545,
      "question_length": 43,
      "prediction": "可能会导致恢复延迟。",
      "reference": "there might be a delay in recovery.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.07416009902954102,
      "timestamp": "2025-12-24T20:36:27.121041"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23918",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 103137,
      "natural_ratio": 0.7868728637695312,
      "max_context_tokens": 131072,
      "context_length": 206196,
      "question_length": 38,
      "prediction": "Mr. Reinhart设计了《桂儿·多夫利》的舞台布景。",
      "reference": "Alfred Parsons",
      "metrics": {
        "f1": 0.2631578947368421
      },
      "elapsed_time": 3.463350772857666,
      "timestamp": "2025-12-24T20:36:30.584777"
    },
    {
      "sample_id": "squad_57336f6e4776f41900660ace",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 620,
      "natural_ratio": 0.004730224609375,
      "max_context_tokens": 131072,
      "context_length": 1084,
      "question_length": 116,
      "prediction": "Bear Stearns",
      "reference": "Bear Stearns",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07436752319335938,
      "timestamp": "2025-12-24T20:36:30.659300"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_638",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 78933,
      "natural_ratio": 0.6022109985351562,
      "max_context_tokens": 131072,
      "context_length": 157786,
      "question_length": 40,
      "prediction": "Lord George Jennett",
      "reference": "Doricourt",
      "metrics": {
        "f1": 0.47058823529411764
      },
      "elapsed_time": 2.7222096920013428,
      "timestamp": "2025-12-24T20:36:33.381899"
    },
    {
      "sample_id": "squad_5726efa6dd62a815002e95bf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 459,
      "natural_ratio": 0.00350189208984375,
      "max_context_tokens": 131072,
      "context_length": 820,
      "question_length": 59,
      "prediction": "No。",
      "reference": "do not contain programs of study or academic departments",
      "metrics": {
        "f1": 0.2
      },
      "elapsed_time": 0.04977273941040039,
      "timestamp": "2025-12-24T20:36:33.431849"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9482",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 94745,
      "natural_ratio": 0.7228469848632812,
      "max_context_tokens": 131072,
      "context_length": 189380,
      "question_length": 71,
      "prediction": "Felix Bush wants to plan a funeral party before he dies with the help of the town's minister.",
      "reference": "A funeral party",
      "metrics": {
        "f1": 0.3
      },
      "elapsed_time": 3.052260160446167,
      "timestamp": "2025-12-24T20:36:36.484493"
    },
    {
      "sample_id": "squad_56f8f1b49e9bad19000a074d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 431,
      "natural_ratio": 0.00328826904296875,
      "max_context_tokens": 131072,
      "context_length": 782,
      "question_length": 40,
      "prediction": "William Miller signed the death warrant, so to speak, of the Age of Empires.",
      "reference": "signed the death warrant, so to speak, of the Age of Empires",
      "metrics": {
        "f1": 0.8823529411764706
      },
      "elapsed_time": 0.2088148593902588,
      "timestamp": "2025-12-24T20:36:36.693466"
    },
    {
      "sample_id": "squad_57275ec9dd62a815002e9bba",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 433,
      "natural_ratio": 0.00330352783203125,
      "max_context_tokens": 131072,
      "context_length": 764,
      "question_length": 63,
      "prediction": "80小时",
      "reference": "80",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.054862022399902344,
      "timestamp": "2025-12-24T20:36:36.748456"
    },
    {
      "sample_id": "squad_572409ef0ba9f01400d97b42",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 366,
      "natural_ratio": 0.0027923583984375,
      "max_context_tokens": 131072,
      "context_length": 627,
      "question_length": 65,
      "prediction": "Isle of Wight",
      "reference": "on the Isle of Wight",
      "metrics": {
        "f1": 0.9565217391304348
      },
      "elapsed_time": 0.07374167442321777,
      "timestamp": "2025-12-24T20:36:36.822324"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18667",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 109031,
      "natural_ratio": 0.8318405151367188,
      "max_context_tokens": 131072,
      "context_length": 217986,
      "question_length": 37,
      "prediction": "Shelly",
      "reference": "Shelly Kaplow",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.15186333656311035,
      "timestamp": "2025-12-24T20:36:36.974520"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23194",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 57538,
      "natural_ratio": 0.4389801025390625,
      "max_context_tokens": 131072,
      "context_length": 115003,
      "question_length": 33,
      "prediction": "The party is to discuss Mr. Boddy's long-standing confidential and painful financial liability.",
      "reference": "To enact revenge upon Mr Boddy after revealing him as the blackmailer.",
      "metrics": {
        "f1": 0.9130434782608695
      },
      "elapsed_time": 2.0065629482269287,
      "timestamp": "2025-12-24T20:36:38.981769"
    },
    {
      "sample_id": "squad_57278aac708984140094e05b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 433,
      "natural_ratio": 0.00330352783203125,
      "max_context_tokens": 131072,
      "context_length": 780,
      "question_length": 47,
      "prediction": "在纽约，Supreme Courts是作为审判法院（Trial Courts of General Jurisdiction）服务的。",
      "reference": "trial courts of general jurisdiction",
      "metrics": {
        "f1": 0.6990291262135923
      },
      "elapsed_time": 0.2398362159729004,
      "timestamp": "2025-12-24T20:36:39.221786"
    },
    {
      "sample_id": "squad_57282ec23acd2414000df67b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 567,
      "natural_ratio": 0.00432586669921875,
      "max_context_tokens": 131072,
      "context_length": 1055,
      "question_length": 40,
      "prediction": "300mile(s)",
      "reference": "300",
      "metrics": {
        "f1": 0.4615384615384615
      },
      "elapsed_time": 0.08969831466674805,
      "timestamp": "2025-12-24T20:36:39.311612"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25059",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 83271,
      "natural_ratio": 0.6353073120117188,
      "max_context_tokens": 131072,
      "context_length": 166442,
      "question_length": 60,
      "prediction": "故事结尾处，观众被警告要保持警惕，直到天亮。具体来说，当比利和特蕾西被困时，他们被告知必须待在一起并对抗这些生物，至少直到可以得到帮助。",
      "reference": "Check for gremlins if things are going wrong",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.228428602218628,
      "timestamp": "2025-12-24T20:36:42.540428"
    },
    {
      "sample_id": "squad_572820842ca10214002d9e7e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 366,
      "natural_ratio": 0.0027923583984375,
      "max_context_tokens": 131072,
      "context_length": 625,
      "question_length": 67,
      "prediction": "无固定时间或规模限制",
      "reference": "no ex-ante time or size limit",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.08940935134887695,
      "timestamp": "2025-12-24T20:36:42.630002"
    },
    {
      "sample_id": "squad_572948b7af94a219006aa25d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 410,
      "natural_ratio": 0.0031280517578125,
      "max_context_tokens": 131072,
      "context_length": 733,
      "question_length": 47,
      "prediction": "Bermudians began to turn to maritime trades.",
      "reference": "turn to maritime trades",
      "metrics": {
        "f1": 0.6865671641791045
      },
      "elapsed_time": 0.13491559028625488,
      "timestamp": "2025-12-24T20:36:42.765050"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9572",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 103424,
      "natural_ratio": 0.7890625,
      "max_context_tokens": 131072,
      "context_length": 206786,
      "question_length": 22,
      "prediction": "Peter Sullivan",
      "reference": "Sullivan.",
      "metrics": {
        "f1": 0.7000000000000001
      },
      "elapsed_time": 2.826122760772705,
      "timestamp": "2025-12-24T20:36:45.591576"
    },
    {
      "sample_id": "squad_572eaf07dfa6aa1500f8d2ba",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 304,
      "natural_ratio": 0.0023193359375,
      "max_context_tokens": 131072,
      "context_length": 513,
      "question_length": 56,
      "prediction": "约50万",
      "reference": "United States, which was already home to over 10,000 Salvadorans, making Salvadorans Americans the fourth-largest Hispanic and Latino American group",
      "metrics": {
        "f1": 0.06666666666666668
      },
      "elapsed_time": 0.06961607933044434,
      "timestamp": "2025-12-24T20:36:45.661365"
    },
    {
      "sample_id": "squad_56fa08a8f34c681400b0bf7b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 565,
      "natural_ratio": 0.00431060791015625,
      "max_context_tokens": 131072,
      "context_length": 1029,
      "question_length": 62,
      "prediction": "Trust Territory of Somalia",
      "reference": "the Trust Territory of Somalia",
      "metrics": {
        "f1": 0.962962962962963
      },
      "elapsed_time": 0.0697321891784668,
      "timestamp": "2025-12-24T20:36:45.731236"
    },
    {
      "sample_id": "squad_5735e8d3012e2f140011a0d5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 337,
      "natural_ratio": 0.00257110595703125,
      "max_context_tokens": 131072,
      "context_length": 592,
      "question_length": 43,
      "prediction": "一些考古学家和古人类学家",
      "reference": "archaeologists and paleoanthropologists",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.09354019165039062,
      "timestamp": "2025-12-24T20:36:45.824905"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4207",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 112441,
      "natural_ratio": 0.8578567504882812,
      "max_context_tokens": 131072,
      "context_length": 224790,
      "question_length": 52,
      "prediction": "The German government is working on building a World War I submarine.",
      "reference": "Mycroft",
      "metrics": {
        "f1": 0.29629629629629634
      },
      "elapsed_time": 3.9697999954223633,
      "timestamp": "2025-12-24T20:36:49.795076"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21951",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 54154,
      "natural_ratio": 0.4131622314453125,
      "max_context_tokens": 131072,
      "context_length": 108198,
      "question_length": 70,
      "prediction": "Thomas Tyler",
      "reference": "The lady Queen Elisabeth",
      "metrics": {
        "f1": 0.64
      },
      "elapsed_time": 1.479222297668457,
      "timestamp": "2025-12-24T20:36:51.274971"
    },
    {
      "sample_id": "squad_57280e204b864d1900164335",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 406,
      "natural_ratio": 0.0030975341796875,
      "max_context_tokens": 131072,
      "context_length": 667,
      "question_length": 105,
      "prediction": "34",
      "reference": "34",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0482945442199707,
      "timestamp": "2025-12-24T20:36:51.323454"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_27221",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 62382,
      "natural_ratio": 0.4759368896484375,
      "max_context_tokens": 131072,
      "context_length": 124676,
      "question_length": 48,
      "prediction": "Jessica does not bleed and the incident is related to voodoo beliefs about turning someone into a zombie.",
      "reference": "She doesn't bleed.",
      "metrics": {
        "f1": 0.6470588235294118
      },
      "elapsed_time": 2.3570189476013184,
      "timestamp": "2025-12-24T20:36:53.680815"
    },
    {
      "sample_id": "squad_56de3a30cffd8e1900b4b69c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 439,
      "natural_ratio": 0.00334930419921875,
      "max_context_tokens": 131072,
      "context_length": 749,
      "question_length": 90,
      "prediction": "The Crusades",
      "reference": "the Crusades",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06097555160522461,
      "timestamp": "2025-12-24T20:36:53.742012"
    },
    {
      "sample_id": "squad_570716129e06ca38007e93df",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 373,
      "natural_ratio": 0.00284576416015625,
      "max_context_tokens": 131072,
      "context_length": 656,
      "question_length": 50,
      "prediction": "Durango",
      "reference": "Durango",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04833722114562988,
      "timestamp": "2025-12-24T20:36:53.790552"
    },
    {
      "sample_id": "squad_572827feff5b5019007d9e3e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 357,
      "natural_ratio": 0.00272369384765625,
      "max_context_tokens": 131072,
      "context_length": 636,
      "question_length": 39,
      "prediction": "torrential rain",
      "reference": "torrential rain",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.058771610260009766,
      "timestamp": "2025-12-24T20:36:53.849532"
    },
    {
      "sample_id": "squad_5726b0b6708984140094cdea",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 455,
      "natural_ratio": 0.00347137451171875,
      "max_context_tokens": 131072,
      "context_length": 808,
      "question_length": 63,
      "prediction": "Milton Winternitz was driven by his dedication to new scientific medicine and \"social medicine.\"",
      "reference": "social medicine",
      "metrics": {
        "f1": 0.2702702702702703
      },
      "elapsed_time": 0.2248835563659668,
      "timestamp": "2025-12-24T20:36:54.074595"
    },
    {
      "sample_id": "squad_570d3c44b3d812140066d5bd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 319,
      "natural_ratio": 0.00243377685546875,
      "max_context_tokens": 131072,
      "context_length": 559,
      "question_length": 40,
      "prediction": "Range and height of the target",
      "reference": "Range and height of the target",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0844125747680664,
      "timestamp": "2025-12-24T20:36:54.159164"
    },
    {
      "sample_id": "squad_5727beae2ca10214002d9548",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 303,
      "natural_ratio": 0.00231170654296875,
      "max_context_tokens": 131072,
      "context_length": 534,
      "question_length": 33,
      "prediction": "高速公路和快速路系统",
      "reference": "highways",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.07431602478027344,
      "timestamp": "2025-12-24T20:36:54.233621"
    },
    {
      "sample_id": "squad_570e6b5f0b85d914000d7ec1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 316,
      "natural_ratio": 0.002410888671875,
      "max_context_tokens": 131072,
      "context_length": 543,
      "question_length": 50,
      "prediction": "Cfb",
      "reference": "Cfb",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04354286193847656,
      "timestamp": "2025-12-24T20:36:54.277295"
    },
    {
      "sample_id": "squad_570fa65280d9841400ab360b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 343,
      "natural_ratio": 0.00261688232421875,
      "max_context_tokens": 131072,
      "context_length": 598,
      "question_length": 49,
      "prediction": "女性对人类和非人类刺激物的性唤起，包括电影中展示的人类两性及同性性行为的场景和视频中展示的倭黑猩猩性行为的场景。",
      "reference": "both human and nonhuman stimuli",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.44219064712524414,
      "timestamp": "2025-12-24T20:36:54.719622"
    },
    {
      "sample_id": "squad_5726783af1498d1400e8e0ca",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 318,
      "natural_ratio": 0.0024261474609375,
      "max_context_tokens": 131072,
      "context_length": 534,
      "question_length": 63,
      "prediction": "seven",
      "reference": "seven",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.034913063049316406,
      "timestamp": "2025-12-24T20:36:54.754663"
    },
    {
      "sample_id": "squad_572e9f64cb0c0d14000f1389",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 456,
      "natural_ratio": 0.00347900390625,
      "max_context_tokens": 131072,
      "context_length": 756,
      "question_length": 116,
      "prediction": "普鲁士和弗里德里希",
      "reference": "Frederick and Prussia",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1143648624420166,
      "timestamp": "2025-12-24T20:36:54.869155"
    },
    {
      "sample_id": "squad_56d4fa2e2ccc5a1400d833cc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 920,
      "natural_ratio": 0.00701904296875,
      "max_context_tokens": 131072,
      "context_length": 1728,
      "question_length": 73,
      "prediction": "\"I Was Here\"",
      "reference": "I Was Here",
      "metrics": {
        "f1": 0.9090909090909091
      },
      "elapsed_time": 0.07131481170654297,
      "timestamp": "2025-12-24T20:36:54.940602"
    },
    {
      "sample_id": "squad_572808403acd2414000df2cb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 379,
      "natural_ratio": 0.00289154052734375,
      "max_context_tokens": 131072,
      "context_length": 692,
      "question_length": 26,
      "prediction": "Good Pope John",
      "reference": "Good Pope John",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05393528938293457,
      "timestamp": "2025-12-24T20:36:54.994665"
    },
    {
      "sample_id": "squad_56d66c621c8504140094710d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 232,
      "natural_ratio": 0.00177001953125,
      "max_context_tokens": 131072,
      "context_length": 349,
      "question_length": 76,
      "prediction": "政府威胁要逮捕抗议的人。",
      "reference": "threat of arrest",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.09105634689331055,
      "timestamp": "2025-12-24T20:36:55.085851"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2978",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 64161,
      "natural_ratio": 0.48950958251953125,
      "max_context_tokens": 131072,
      "context_length": 128239,
      "question_length": 44,
      "prediction": "因为刘家龙提到要杀他来报复，所以李决定去洛杉矶调查。",
      "reference": "James Carter convinced him to come to L.A. with him when Lee took Carter to the airport.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.7828996181488037,
      "timestamp": "2025-12-24T20:36:57.869131"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14037",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93034,
      "natural_ratio": 0.7097930908203125,
      "max_context_tokens": 131072,
      "context_length": 185986,
      "question_length": 42,
      "prediction": "因为公主Rosalba戴了黑魔女赐予的魔法戒指，使她看起来非常美丽，而狮子被她的美貌所吸引，所以没有伤害她。",
      "reference": "They are the same lions she grew up with as a child.",
      "metrics": {
        "f1": 0.15384615384615383
      },
      "elapsed_time": 3.518528699874878,
      "timestamp": "2025-12-24T20:37:01.388033"
    },
    {
      "sample_id": "squad_570dec460dc6ce1900204d0f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 303,
      "natural_ratio": 0.00231170654296875,
      "max_context_tokens": 131072,
      "context_length": 518,
      "question_length": 48,
      "prediction": "pole antarctike",
      "reference": "pole antartike",
      "metrics": {
        "f1": 0.9565217391304348
      },
      "elapsed_time": 0.08060288429260254,
      "timestamp": "2025-12-24T20:37:01.468856"
    },
    {
      "sample_id": "squad_57276f39f1498d1400e8f809",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 304,
      "natural_ratio": 0.0023193359375,
      "max_context_tokens": 131072,
      "context_length": 514,
      "question_length": 54,
      "prediction": "West River",
      "reference": "West River",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.044911861419677734,
      "timestamp": "2025-12-24T20:37:01.513923"
    },
    {
      "sample_id": "squad_5733f8f0d058e614000b66d0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 347,
      "natural_ratio": 0.00264739990234375,
      "max_context_tokens": 131072,
      "context_length": 608,
      "question_length": 47,
      "prediction": "葡萄牙 Escudo",
      "reference": "Portuguese Escudo",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.054296016693115234,
      "timestamp": "2025-12-24T20:37:01.568369"
    },
    {
      "sample_id": "squad_57280ac93acd2414000df2ff",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 450,
      "natural_ratio": 0.0034332275390625,
      "max_context_tokens": 131072,
      "context_length": 799,
      "question_length": 62,
      "prediction": "80 GB",
      "reference": "80 GB",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.055188655853271484,
      "timestamp": "2025-12-24T20:37:01.623698"
    },
    {
      "sample_id": "squad_5726240738643c19005ad078",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 392,
      "natural_ratio": 0.00299072265625,
      "max_context_tokens": 131072,
      "context_length": 693,
      "question_length": 51,
      "prediction": "Sir James Lancaster",
      "reference": "Sir James Lancaster",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05484819412231445,
      "timestamp": "2025-12-24T20:37:01.678676"
    },
    {
      "sample_id": "squad_5727b1c13acd2414000de9eb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 418,
      "natural_ratio": 0.0031890869140625,
      "max_context_tokens": 131072,
      "context_length": 786,
      "question_length": 11,
      "prediction": "Advaita",
      "reference": "Advaita",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.054143428802490234,
      "timestamp": "2025-12-24T20:37:01.732950"
    },
    {
      "sample_id": "squad_572fabd004bcaa1900d76baa",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 327,
      "natural_ratio": 0.00249481201171875,
      "max_context_tokens": 131072,
      "context_length": 554,
      "question_length": 61,
      "prediction": "4%",
      "reference": "4%",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04437732696533203,
      "timestamp": "2025-12-24T20:37:01.777459"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18864",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 89995,
      "natural_ratio": 0.6866073608398438,
      "max_context_tokens": 131072,
      "context_length": 179911,
      "question_length": 40,
      "prediction": "他们给了教授一个难题。",
      "reference": "A problem.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.2995898723602295,
      "timestamp": "2025-12-24T20:37:05.077423"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9362",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 38305,
      "natural_ratio": 0.29224395751953125,
      "max_context_tokens": 131072,
      "context_length": 76509,
      "question_length": 62,
      "prediction": "Jarvis推测那个晶体具有类似硬X射线或伽马辐射的特性，能够破坏病变组织同时保留健康组织。",
      "reference": "The crystal emits a form of radiation that destroys unhealty tissue, but does not harm what is healthy.",
      "metrics": {
        "f1": 0.125
      },
      "elapsed_time": 1.20481538772583,
      "timestamp": "2025-12-24T20:37:06.282551"
    },
    {
      "sample_id": "squad_570b0e40ec8fbc190045b813",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 365,
      "natural_ratio": 0.00278472900390625,
      "max_context_tokens": 131072,
      "context_length": 640,
      "question_length": 50,
      "prediction": "改革了儒略历（作为格里高利十三世时代之前的使用）。",
      "reference": "Julian calendar",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.19760799407958984,
      "timestamp": "2025-12-24T20:37:06.480340"
    },
    {
      "sample_id": "squad_56d38ac959d6e414001466cf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 319,
      "natural_ratio": 0.00243377685546875,
      "max_context_tokens": 131072,
      "context_length": 540,
      "question_length": 58,
      "prediction": "tuberculosis",
      "reference": "tuberculosis",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05401325225830078,
      "timestamp": "2025-12-24T20:37:06.534487"
    },
    {
      "sample_id": "squad_570ce8b4fed7b91900d45ac6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 562,
      "natural_ratio": 0.0042877197265625,
      "max_context_tokens": 131072,
      "context_length": 1052,
      "question_length": 32,
      "prediction": "这些bars是由玻璃纤维覆盖木质层压板制成的。",
      "reference": "fiberglass covered in wood laminate",
      "metrics": {
        "f1": 0.1951219512195122
      },
      "elapsed_time": 0.15556120872497559,
      "timestamp": "2025-12-24T20:37:06.690183"
    },
    {
      "sample_id": "squad_56df7f755ca0a614008f9b62",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 139,
      "natural_ratio": 0.00106048583984375,
      "max_context_tokens": 131072,
      "context_length": 168,
      "question_length": 70,
      "prediction": "北方",
      "reference": "north",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.03006291389465332,
      "timestamp": "2025-12-24T20:37:06.720372"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11868",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115074,
      "natural_ratio": 0.8779449462890625,
      "max_context_tokens": 131072,
      "context_length": 230075,
      "question_length": 33,
      "prediction": "Jane怀孕了。",
      "reference": "She is pregnant",
      "metrics": {
        "f1": 0.3157894736842105
      },
      "elapsed_time": 4.020688056945801,
      "timestamp": "2025-12-24T20:37:10.741433"
    },
    {
      "sample_id": "squad_5726ca635951b619008f7e35",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 687,
      "natural_ratio": 0.00524139404296875,
      "max_context_tokens": 131072,
      "context_length": 1274,
      "question_length": 60,
      "prediction": "HIV/AIDS",
      "reference": "HIV/AIDS",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0659933090209961,
      "timestamp": "2025-12-24T20:37:10.807938"
    },
    {
      "sample_id": "squad_57279e8c4b864d19001638ae",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 443,
      "natural_ratio": 0.00337982177734375,
      "max_context_tokens": 131072,
      "context_length": 769,
      "question_length": 78,
      "prediction": "New Haven Black Panther trials",
      "reference": "New Haven Black Panther trials",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0758671760559082,
      "timestamp": "2025-12-24T20:37:10.883943"
    },
    {
      "sample_id": "squad_572b9405be1ee31400cb8450",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 460,
      "natural_ratio": 0.003509521484375,
      "max_context_tokens": 131072,
      "context_length": 841,
      "question_length": 39,
      "prediction": "Thomas Alva Edison",
      "reference": "Thomas Alva Edison",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06486153602600098,
      "timestamp": "2025-12-24T20:37:10.948933"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24943",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 82462,
      "natural_ratio": 0.6291351318359375,
      "max_context_tokens": 131072,
      "context_length": 164852,
      "question_length": 32,
      "prediction": "Beth is Peggy Sue Bodell's daughter.",
      "reference": "She is her daughter.",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 3.955263137817383,
      "timestamp": "2025-12-24T20:37:14.904586"
    },
    {
      "sample_id": "squad_57332ba74776f41900660730",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 378,
      "natural_ratio": 0.0028839111328125,
      "max_context_tokens": 131072,
      "context_length": 601,
      "question_length": 116,
      "prediction": "Collateralized Debt Obligation (CDO)",
      "reference": "collateralized debt obligation",
      "metrics": {
        "f1": 0.9090909090909091
      },
      "elapsed_time": 0.1297013759613037,
      "timestamp": "2025-12-24T20:37:15.034500"
    },
    {
      "sample_id": "squad_56dc7c9d14d3a41400c268ff",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 248,
      "natural_ratio": 0.00189208984375,
      "max_context_tokens": 131072,
      "context_length": 403,
      "question_length": 54,
      "prediction": "Margaret Thatcher",
      "reference": "Margaret Thatcher",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05452227592468262,
      "timestamp": "2025-12-24T20:37:15.089225"
    },
    {
      "sample_id": "squad_57071f039e06ca38007e9475",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 329,
      "natural_ratio": 0.00251007080078125,
      "max_context_tokens": 131072,
      "context_length": 583,
      "question_length": 35,
      "prediction": "1910",
      "reference": "1910",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06505203247070312,
      "timestamp": "2025-12-24T20:37:15.154423"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17766",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 86074,
      "natural_ratio": 0.6566925048828125,
      "max_context_tokens": 131072,
      "context_length": 172028,
      "question_length": 80,
      "prediction": "Dunwittie does not appreciate series that depict black people in a positive light, as evidenced by his desire to replace Mantan with Honeycutt when Mantan refuses to participate in the stereotypical show.",
      "reference": "He rejects them.",
      "metrics": {
        "f1": 0.5142857142857143
      },
      "elapsed_time": 4.74648118019104,
      "timestamp": "2025-12-24T20:37:19.901269"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21589",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 61500,
      "natural_ratio": 0.469207763671875,
      "max_context_tokens": 131072,
      "context_length": 122908,
      "question_length": 52,
      "prediction": "Tumnus",
      "reference": "A faun.",
      "metrics": {
        "f1": 0.3636363636363636
      },
      "elapsed_time": 1.3759818077087402,
      "timestamp": "2025-12-24T20:37:21.277553"
    },
    {
      "sample_id": "squad_5731ab21b9d445190005e44f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 775,
      "natural_ratio": 0.00591278076171875,
      "max_context_tokens": 131072,
      "context_length": 1447,
      "question_length": 64,
      "prediction": "Ovid",
      "reference": "Ovid",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.051698923110961914,
      "timestamp": "2025-12-24T20:37:21.329419"
    },
    {
      "sample_id": "squad_571a98164faf5e1900b8aafc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 465,
      "natural_ratio": 0.00354766845703125,
      "max_context_tokens": 131072,
      "context_length": 817,
      "question_length": 73,
      "prediction": "许多多种族的美国人通过与白人结婚并融入白人社会来获得社会和经济上的优势。",
      "reference": "Many of majority European ancestry and appearance \"married white\" and assimilated into white society",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.25412607192993164,
      "timestamp": "2025-12-24T20:37:21.583695"
    },
    {
      "sample_id": "squad_5727b8684b864d1900163b64",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 360,
      "natural_ratio": 0.00274658203125,
      "max_context_tokens": 131072,
      "context_length": 610,
      "question_length": 70,
      "prediction": "Anglo-American settlers",
      "reference": "Anglo-American settlers",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06727337837219238,
      "timestamp": "2025-12-24T20:37:21.651100"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19087",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 43014,
      "natural_ratio": 0.3281707763671875,
      "max_context_tokens": 131072,
      "context_length": 85941,
      "question_length": 47,
      "prediction": "存储罐泄露，爆炸导致大量人员死亡，包括主人公Guy Burckhardt。",
      "reference": "It exploded and everyone died. ",
      "metrics": {
        "f1": 0.25000000000000006
      },
      "elapsed_time": 1.2290260791778564,
      "timestamp": "2025-12-24T20:37:22.880502"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30131",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 88047,
      "natural_ratio": 0.6717453002929688,
      "max_context_tokens": 131072,
      "context_length": 175985,
      "question_length": 69,
      "prediction": "Phil lost his job and had a fight with his wife Julie, which led to him moving out.",
      "reference": "He had two explosive fights with Julie.",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 2.2793729305267334,
      "timestamp": "2025-12-24T20:37:25.160249"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21875",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 17273,
      "natural_ratio": 0.13178253173828125,
      "max_context_tokens": 131072,
      "context_length": 34471,
      "question_length": 36,
      "prediction": "他自己",
      "reference": "Himself.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.395235538482666,
      "timestamp": "2025-12-24T20:37:25.555750"
    },
    {
      "sample_id": "squad_5728294a2ca10214002d9f99",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 388,
      "natural_ratio": 0.002960205078125,
      "max_context_tokens": 131072,
      "context_length": 678,
      "question_length": 59,
      "prediction": "2",
      "reference": "two",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.03557538986206055,
      "timestamp": "2025-12-24T20:37:25.591499"
    },
    {
      "sample_id": "squad_572efd2003f9891900756b24",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 611,
      "natural_ratio": 0.00466156005859375,
      "max_context_tokens": 131072,
      "context_length": 1133,
      "question_length": 50,
      "prediction": "社会在德川时期基于严格的阶级等级制度。",
      "reference": "strict class hierarchy",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.13875389099121094,
      "timestamp": "2025-12-24T20:37:25.730391"
    },
    {
      "sample_id": "squad_5726712af1498d1400e8dfc2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 538,
      "natural_ratio": 0.0041046142578125,
      "max_context_tokens": 131072,
      "context_length": 972,
      "question_length": 64,
      "prediction": "Jeffrey R. Holland认为 Brigham Young University（BYU）最强的资产是其宗教性质。",
      "reference": "its religious nature",
      "metrics": {
        "f1": 0.489795918367347
      },
      "elapsed_time": 0.23437261581420898,
      "timestamp": "2025-12-24T20:37:25.964919"
    },
    {
      "sample_id": "squad_570c2e35ec8fbc190045bd3f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 326,
      "natural_ratio": 0.0024871826171875,
      "max_context_tokens": 131072,
      "context_length": 569,
      "question_length": 44,
      "prediction": "60",
      "reference": "60",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.043578147888183594,
      "timestamp": "2025-12-24T20:37:26.008632"
    },
    {
      "sample_id": "squad_5726be91dd62a815002e8f2d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 333,
      "natural_ratio": 0.00254058837890625,
      "max_context_tokens": 131072,
      "context_length": 565,
      "question_length": 62,
      "prediction": "Faleiva",
      "reference": "Faleiva",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05423855781555176,
      "timestamp": "2025-12-24T20:37:26.062999"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20349",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115683,
      "natural_ratio": 0.8825912475585938,
      "max_context_tokens": 131072,
      "context_length": 231257,
      "question_length": 69,
      "prediction": "Alex将项链扔开。",
      "reference": "He throws it in the ocean",
      "metrics": {
        "f1": 0.1818181818181818
      },
      "elapsed_time": 3.947566032409668,
      "timestamp": "2025-12-24T20:37:30.010913"
    },
    {
      "sample_id": "squad_56ddb03066d3e219004dac9b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 330,
      "natural_ratio": 0.0025177001953125,
      "max_context_tokens": 131072,
      "context_length": 543,
      "question_length": 78,
      "prediction": "University of Technology",
      "reference": "University of Technology",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05960512161254883,
      "timestamp": "2025-12-24T20:37:30.071030"
    },
    {
      "sample_id": "squad_5727bc74ff5b5019007d93ca",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 363,
      "natural_ratio": 0.00276947021484375,
      "max_context_tokens": 131072,
      "context_length": 649,
      "question_length": 37,
      "prediction": "Han Chinese",
      "reference": "Han Chinese",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04511141777038574,
      "timestamp": "2025-12-24T20:37:30.116273"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22566",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93492,
      "natural_ratio": 0.713287353515625,
      "max_context_tokens": 131072,
      "context_length": 186884,
      "question_length": 60,
      "prediction": "Margot和Pauline去了一个小型的、剥落的绿色墙到墙地毯的小酒店。",
      "reference": "Margot and Pauline flee to a motel.",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 3.0503766536712646,
      "timestamp": "2025-12-24T20:37:33.167016"
    },
    {
      "sample_id": "squad_57267897dd62a815002e8638",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 624,
      "natural_ratio": 0.0047607421875,
      "max_context_tokens": 131072,
      "context_length": 1175,
      "question_length": 33,
      "prediction": "1902年",
      "reference": "1902",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.0848240852355957,
      "timestamp": "2025-12-24T20:37:33.252003"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14014",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 46560,
      "natural_ratio": 0.355224609375,
      "max_context_tokens": 131072,
      "context_length": 93035,
      "question_length": 46,
      "prediction": "Andrew Moreton called for the establishment of an university in London, an hospital for foundlings, and the formation of an academy of sciences at Christ's Hospital.",
      "reference": "A hospital",
      "metrics": {
        "f1": 0.5294117647058824
      },
      "elapsed_time": 1.4207689762115479,
      "timestamp": "2025-12-24T20:37:34.673067"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20740",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93039,
      "natural_ratio": 0.7098312377929688,
      "max_context_tokens": 131072,
      "context_length": 185985,
      "question_length": 54,
      "prediction": "因为Lee被认为与绑架事件有关，可能是被雇佣来制造假象，所以FBI希望他离开调查现场。",
      "reference": "To avoid negative attention if something happens Lee.",
      "metrics": {
        "f1": 0.14285714285714282
      },
      "elapsed_time": 4.306864500045776,
      "timestamp": "2025-12-24T20:37:38.980289"
    },
    {
      "sample_id": "squad_570a8df96d058f1900182f93",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 446,
      "natural_ratio": 0.0034027099609375,
      "max_context_tokens": 131072,
      "context_length": 744,
      "question_length": 109,
      "prediction": "1",
      "reference": "1",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04095888137817383,
      "timestamp": "2025-12-24T20:37:39.021409"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8666",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93900,
      "natural_ratio": 0.716400146484375,
      "max_context_tokens": 131072,
      "context_length": 187723,
      "question_length": 38,
      "prediction": "Honorine",
      "reference": "Maurice",
      "metrics": {
        "f1": 0.4615384615384615
      },
      "elapsed_time": 3.0114336013793945,
      "timestamp": "2025-12-24T20:37:42.033221"
    },
    {
      "sample_id": "squad_570c30996b8089140040fbf4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 297,
      "natural_ratio": 0.00226593017578125,
      "max_context_tokens": 131072,
      "context_length": 515,
      "question_length": 40,
      "prediction": "Controlled Substances Act of 1970",
      "reference": "Controlled Substances Act of 1970",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.13683462142944336,
      "timestamp": "2025-12-24T20:37:42.170228"
    },
    {
      "sample_id": "squad_5728ec472ca10214002daa98",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 554,
      "natural_ratio": 0.0042266845703125,
      "max_context_tokens": 131072,
      "context_length": 1007,
      "question_length": 62,
      "prediction": "shipping costs",
      "reference": "shipping costs",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0466763973236084,
      "timestamp": "2025-12-24T20:37:42.217045"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24411",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 117540,
      "natural_ratio": 0.896759033203125,
      "max_context_tokens": 131072,
      "context_length": 234996,
      "question_length": 44,
      "prediction": "Prideaux",
      "reference": "Prideaux.",
      "metrics": {
        "f1": 0.9411764705882353
      },
      "elapsed_time": 4.052598714828491,
      "timestamp": "2025-12-24T20:37:46.270030"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17573",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 71296,
      "natural_ratio": 0.5439453125,
      "max_context_tokens": 131072,
      "context_length": 142518,
      "question_length": 34,
      "prediction": "Septimius decapitated Pompey the Great.",
      "reference": "Lucius Septimius",
      "metrics": {
        "f1": 0.6666666666666667
      },
      "elapsed_time": 2.718738555908203,
      "timestamp": "2025-12-24T20:37:48.989464"
    },
    {
      "sample_id": "squad_5730f90c497a881900248ab5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 701,
      "natural_ratio": 0.00534820556640625,
      "max_context_tokens": 131072,
      "context_length": 1264,
      "question_length": 99,
      "prediction": "根据文档，在玛丽出现之前，先辈们已经了解了诺亚方舟、雅各的梯子、摩西的不焚之灌木、不可攻破的堡垒、围栏花园、上帝之城、上帝的圣殿以及其他许多类似的圣经象征意义。",
      "reference": "the Fathers saw foreshadowings of Mary's \"wondrous abundance of divine gifts and original innocence",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.6098392009735107,
      "timestamp": "2025-12-24T20:37:49.599556"
    },
    {
      "sample_id": "squad_570aa9644103511400d598f0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 553,
      "natural_ratio": 0.00421905517578125,
      "max_context_tokens": 131072,
      "context_length": 995,
      "question_length": 72,
      "prediction": "所有飞行阶段",
      "reference": "all phases",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.05780625343322754,
      "timestamp": "2025-12-24T20:37:49.657523"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18848",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 90017,
      "natural_ratio": 0.6867752075195312,
      "max_context_tokens": 131072,
      "context_length": 179911,
      "question_length": 84,
      "prediction": "含有四部分的数字",
      "reference": "They are easily memorized",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.20512652397155762,
      "timestamp": "2025-12-24T20:37:49.862925"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10079",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 94822,
      "natural_ratio": 0.7234344482421875,
      "max_context_tokens": 131072,
      "context_length": 189560,
      "question_length": 45,
      "prediction": "Isabella Hudson is in labor and has been in a hospital, where she nearly died due to a nurse named Kalarjian choking her, but was saved by a defibrillator.",
      "reference": "She is pregnant.",
      "metrics": {
        "f1": 0.631578947368421
      },
      "elapsed_time": 3.4694786071777344,
      "timestamp": "2025-12-24T20:37:53.332746"
    },
    {
      "sample_id": "squad_5732696fe17f3d140042295f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 518,
      "natural_ratio": 0.0039520263671875,
      "max_context_tokens": 131072,
      "context_length": 931,
      "question_length": 65,
      "prediction": "由于俄罗斯拒绝允许任何形式的检查。",
      "reference": "refusal of the Russians to permit any sort of inspections",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1105506420135498,
      "timestamp": "2025-12-24T20:37:53.443473"
    },
    {
      "sample_id": "squad_56e16a26e3433e1400422ed7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 422,
      "natural_ratio": 0.0032196044921875,
      "max_context_tokens": 131072,
      "context_length": 776,
      "question_length": 28,
      "prediction": "Henry Cavendish和Antoine Lavoisier共同发现了氢气，但通常认为Henry Cavendish首先认出了氢气作为一种独立的物质。",
      "reference": "Robert Boyle",
      "metrics": {
        "f1": 0.28571428571428575
      },
      "elapsed_time": 0.3530714511871338,
      "timestamp": "2025-12-24T20:37:53.796693"
    },
    {
      "sample_id": "squad_572b5afd34ae481900deadc0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 543,
      "natural_ratio": 0.00414276123046875,
      "max_context_tokens": 131072,
      "context_length": 979,
      "question_length": 68,
      "prediction": "20:1",
      "reference": "20 to 1",
      "metrics": {
        "f1": 0.6
      },
      "elapsed_time": 0.06836462020874023,
      "timestamp": "2025-12-24T20:37:53.865199"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22194",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 116897,
      "natural_ratio": 0.8918533325195312,
      "max_context_tokens": 131072,
      "context_length": 233700,
      "question_length": 55,
      "prediction": "Hammond is a stranger to Lex and Tim Murphy.",
      "reference": "He is their grandfather",
      "metrics": {
        "f1": 0.7096774193548387
      },
      "elapsed_time": 4.414763927459717,
      "timestamp": "2025-12-24T20:37:58.280322"
    },
    {
      "sample_id": "squad_573247070fdd8d15006c68c7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 650,
      "natural_ratio": 0.0049591064453125,
      "max_context_tokens": 131072,
      "context_length": 1202,
      "question_length": 58,
      "prediction": "Jehovah Witnesses' leadership is said to provide \"divine guidance\" through its teachings.",
      "reference": "divine guidance",
      "metrics": {
        "f1": 0.28846153846153844
      },
      "elapsed_time": 0.22229266166687012,
      "timestamp": "2025-12-24T20:37:58.503131"
    },
    {
      "sample_id": "squad_56df7edd56340a1900b29c2f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 424,
      "natural_ratio": 0.00323486328125,
      "max_context_tokens": 131072,
      "context_length": 751,
      "question_length": 57,
      "prediction": "English Civil War",
      "reference": "English Civil War",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.055706024169921875,
      "timestamp": "2025-12-24T20:37:58.558970"
    },
    {
      "sample_id": "squad_5726b450708984140094ce60",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 658,
      "natural_ratio": 0.0050201416015625,
      "max_context_tokens": 131072,
      "context_length": 1229,
      "question_length": 47,
      "prediction": "2500 BC",
      "reference": "2500 BC",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0787811279296875,
      "timestamp": "2025-12-24T20:37:58.637883"
    },
    {
      "sample_id": "squad_572844f92ca10214002da213",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 668,
      "natural_ratio": 0.005096435546875,
      "max_context_tokens": 131072,
      "context_length": 1256,
      "question_length": 41,
      "prediction": "南部阿富汗",
      "reference": "southern Afghanistan",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04875802993774414,
      "timestamp": "2025-12-24T20:37:58.686769"
    },
    {
      "sample_id": "squad_572c9ab62babe914003c29a7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 610,
      "natural_ratio": 0.0046539306640625,
      "max_context_tokens": 131072,
      "context_length": 1107,
      "question_length": 73,
      "prediction": "23.8%",
      "reference": "23.8",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07534599304199219,
      "timestamp": "2025-12-24T20:37:58.762243"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5294",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 102512,
      "natural_ratio": 0.7821044921875,
      "max_context_tokens": 131072,
      "context_length": 204946,
      "question_length": 39,
      "prediction": "Julian Grendel",
      "reference": "Julian Grendlel. ",
      "metrics": {
        "f1": 0.9565217391304348
      },
      "elapsed_time": 2.666710376739502,
      "timestamp": "2025-12-24T20:38:01.429309"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24864",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 84560,
      "natural_ratio": 0.6451416015625,
      "max_context_tokens": 131072,
      "context_length": 169053,
      "question_length": 27,
      "prediction": "Rene is European.",
      "reference": "French",
      "metrics": {
        "f1": 0.3529411764705882
      },
      "elapsed_time": 2.565673828125,
      "timestamp": "2025-12-24T20:38:03.995353"
    },
    {
      "sample_id": "squad_571a076110f8ca1400304edd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 548,
      "natural_ratio": 0.004180908203125,
      "max_context_tokens": 131072,
      "context_length": 1015,
      "question_length": 41,
      "prediction": "$231 billion",
      "reference": "$231 billion",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08043026924133301,
      "timestamp": "2025-12-24T20:38:04.075957"
    },
    {
      "sample_id": "squad_572825df2ca10214002d9f08",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 485,
      "natural_ratio": 0.00370025634765625,
      "max_context_tokens": 131072,
      "context_length": 887,
      "question_length": 43,
      "prediction": "Jordan River",
      "reference": "Jordan River",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04565906524658203,
      "timestamp": "2025-12-24T20:38:04.121752"
    },
    {
      "sample_id": "squad_57261df0ec44d21400f3d909",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 477,
      "natural_ratio": 0.00363922119140625,
      "max_context_tokens": 131072,
      "context_length": 869,
      "question_length": 46,
      "prediction": "Prince of Wales",
      "reference": "Prince of Wales",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0552525520324707,
      "timestamp": "2025-12-24T20:38:04.177132"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26939",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 67445,
      "natural_ratio": 0.5145645141601562,
      "max_context_tokens": 131072,
      "context_length": 134786,
      "question_length": 65,
      "prediction": "Amber wins the title of Minnesota American Teen Princess by impressing the judges with her talent performance, despite facing initial setbacks and challenges.",
      "reference": "all of the other girls get food poisoning from seafood",
      "metrics": {
        "f1": 0.7804878048780487
      },
      "elapsed_time": 3.2475733757019043,
      "timestamp": "2025-12-24T20:38:07.425027"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21550",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115289,
      "natural_ratio": 0.8795852661132812,
      "max_context_tokens": 131072,
      "context_length": 230452,
      "question_length": 87,
      "prediction": "根据提供的剧本内容，没有提到年轻埃里克（Eric）试图到达他的父母时发生的事情涉及集中营的门。剧本中没有这部分情节描述。",
      "reference": "It bends",
      "metrics": {
        "f1": 0.07142857142857142
      },
      "elapsed_time": 4.217660188674927,
      "timestamp": "2025-12-24T20:38:11.643041"
    },
    {
      "sample_id": "squad_570abc016d058f19001830b8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 331,
      "natural_ratio": 0.00252532958984375,
      "max_context_tokens": 131072,
      "context_length": 595,
      "question_length": 28,
      "prediction": "CATOBAR允许部署携带满载武器的重型喷气机，尤其是用于地面攻击任务。",
      "reference": "deployment of heavy jets with full loadouts",
      "metrics": {
        "f1": 0.11111111111111112
      },
      "elapsed_time": 0.23905682563781738,
      "timestamp": "2025-12-24T20:38:11.882655"
    },
    {
      "sample_id": "squad_572952a31d046914007792a2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 460,
      "natural_ratio": 0.003509521484375,
      "max_context_tokens": 131072,
      "context_length": 849,
      "question_length": 32,
      "prediction": "位于下咽之后。",
      "reference": "behind the hypopharynx",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.07629919052124023,
      "timestamp": "2025-12-24T20:38:11.959116"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17005",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93371,
      "natural_ratio": 0.7123641967773438,
      "max_context_tokens": 131072,
      "context_length": 186676,
      "question_length": 26,
      "prediction": "Osip",
      "reference": "Osip",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.396318197250366,
      "timestamp": "2025-12-24T20:38:15.355775"
    },
    {
      "sample_id": "squad_5726c4905951b619008f7d9c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 503,
      "natural_ratio": 0.00383758544921875,
      "max_context_tokens": 131072,
      "context_length": 908,
      "question_length": 58,
      "prediction": "Queen + Wyclef Jean",
      "reference": "Wyclef Jean",
      "metrics": {
        "f1": 0.7333333333333334
      },
      "elapsed_time": 0.09500765800476074,
      "timestamp": "2025-12-24T20:38:15.450969"
    },
    {
      "sample_id": "squad_56cec79caab44d1400b88a36",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 546,
      "natural_ratio": 0.0041656494140625,
      "max_context_tokens": 131072,
      "context_length": 1023,
      "question_length": 30,
      "prediction": "3,000",
      "reference": "around 3,000",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.07966184616088867,
      "timestamp": "2025-12-24T20:38:15.530789"
    },
    {
      "sample_id": "squad_5726dad6708984140094d3ad",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 592,
      "natural_ratio": 0.0045166015625,
      "max_context_tokens": 131072,
      "context_length": 1068,
      "question_length": 76,
      "prediction": "两个月零二十五天",
      "reference": "two and a half months",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06896758079528809,
      "timestamp": "2025-12-24T20:38:15.599908"
    },
    {
      "sample_id": "squad_57318751e6313a140071d03f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 340,
      "natural_ratio": 0.002593994140625,
      "max_context_tokens": 131072,
      "context_length": 554,
      "question_length": 87,
      "prediction": "urban town life and medieval guilds",
      "reference": "urban town life and medieval guilds",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0940861701965332,
      "timestamp": "2025-12-24T20:38:15.694139"
    },
    {
      "sample_id": "squad_57269e385951b619008f77ee",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 468,
      "natural_ratio": 0.003570556640625,
      "max_context_tokens": 131072,
      "context_length": 867,
      "question_length": 30,
      "prediction": "钢",
      "reference": "steel",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.034899234771728516,
      "timestamp": "2025-12-24T20:38:15.729182"
    },
    {
      "sample_id": "squad_5728c8083acd2414000dfe3f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 381,
      "natural_ratio": 0.00290679931640625,
      "max_context_tokens": 131072,
      "context_length": 650,
      "question_length": 72,
      "prediction": "对应论truth as correspondence",
      "reference": "truth as correspondence",
      "metrics": {
        "f1": 0.9387755102040816
      },
      "elapsed_time": 0.07395195960998535,
      "timestamp": "2025-12-24T20:38:15.803280"
    },
    {
      "sample_id": "squad_57279338708984140094e153",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 469,
      "natural_ratio": 0.00357818603515625,
      "max_context_tokens": 131072,
      "context_length": 878,
      "question_length": 21,
      "prediction": "pedobaptism是指婴儿洗礼或浸礼派的教义，即认为应该给婴儿施洗。",
      "reference": "infant baptism",
      "metrics": {
        "f1": 0.3414634146341463
      },
      "elapsed_time": 0.2555708885192871,
      "timestamp": "2025-12-24T20:38:16.059003"
    },
    {
      "sample_id": "squad_5719ca564faf5e1900b8a7f7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 466,
      "natural_ratio": 0.0035552978515625,
      "max_context_tokens": 131072,
      "context_length": 836,
      "question_length": 57,
      "prediction": "反华 Riot",
      "reference": "anti-Chinese riots",
      "metrics": {
        "f1": 0.5263157894736842
      },
      "elapsed_time": 0.05534982681274414,
      "timestamp": "2025-12-24T20:38:16.114499"
    },
    {
      "sample_id": "squad_57278d02f1498d1400e8fbc9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 330,
      "natural_ratio": 0.0025177001953125,
      "max_context_tokens": 131072,
      "context_length": 568,
      "question_length": 53,
      "prediction": " elegance",
      "reference": "elegance",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03608989715576172,
      "timestamp": "2025-12-24T20:38:16.150730"
    },
    {
      "sample_id": "squad_570d2ad5fed7b91900d45c8c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 732,
      "natural_ratio": 0.005584716796875,
      "max_context_tokens": 131072,
      "context_length": 1379,
      "question_length": 46,
      "prediction": "相对笨重且较为过时",
      "reference": "relatively bulky and somewhat outdated",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.09940338134765625,
      "timestamp": "2025-12-24T20:38:16.250280"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9641",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 118759,
      "natural_ratio": 0.9060592651367188,
      "max_context_tokens": 131072,
      "context_length": 237440,
      "question_length": 38,
      "prediction": "Tom发现妻子已经去世，然后抱住她。",
      "reference": "Embraces his wife.",
      "metrics": {
        "f1": 0.06451612903225808
      },
      "elapsed_time": 4.111124515533447,
      "timestamp": "2025-12-24T20:38:20.361769"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29036",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 35751,
      "natural_ratio": 0.27275848388671875,
      "max_context_tokens": 131072,
      "context_length": 71424,
      "question_length": 38,
      "prediction": "文本中并未提到Forerunners被抹除的情节，因此无法回答此问题。",
      "reference": "Interstellar war",
      "metrics": {
        "f1": 0.2
      },
      "elapsed_time": 1.0270328521728516,
      "timestamp": "2025-12-24T20:38:21.389447"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22138",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 20390,
      "natural_ratio": 0.1555633544921875,
      "max_context_tokens": 131072,
      "context_length": 40681,
      "question_length": 60,
      "prediction": "他听到一种尖锐、清晰、金属般的敲击声，像是铁匠在砧子上敲打的声音；看到森林中的树木排列有序，闻到花朵的香气，听到风在树枝间吹拂发出如奥尔良风笛般的声音。",
      "reference": "Whispered voices and strange constellations",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.9687941074371338,
      "timestamp": "2025-12-24T20:38:22.358547"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11589",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101503,
      "natural_ratio": 0.7744064331054688,
      "max_context_tokens": 131072,
      "context_length": 202913,
      "question_length": 53,
      "prediction": "Ali去斯特拉斯堡时失去了与Stephanie的联系。",
      "reference": "A combat sports training facility near Strasbourg. ",
      "metrics": {
        "f1": 0.3902439024390244
      },
      "elapsed_time": 3.2041256427764893,
      "timestamp": "2025-12-24T20:38:25.563066"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9438",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 61424,
      "natural_ratio": 0.4686279296875,
      "max_context_tokens": 131072,
      "context_length": 122773,
      "question_length": 36,
      "prediction": "Apollo carries a drawn sword.",
      "reference": "A golden bow.",
      "metrics": {
        "f1": 0.7200000000000001
      },
      "elapsed_time": 1.8997128009796143,
      "timestamp": "2025-12-24T20:38:27.463076"
    },
    {
      "sample_id": "squad_5727131c5951b619008f8585",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 548,
      "natural_ratio": 0.004180908203125,
      "max_context_tokens": 131072,
      "context_length": 941,
      "question_length": 115,
      "prediction": "老鼠（或 mice）",
      "reference": "mice",
      "metrics": {
        "f1": 0.5714285714285715
      },
      "elapsed_time": 0.07966256141662598,
      "timestamp": "2025-12-24T20:38:27.542932"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24689",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 121052,
      "natural_ratio": 0.923553466796875,
      "max_context_tokens": 131072,
      "context_length": 242002,
      "question_length": 63,
      "prediction": "H for Hoaxer",
      "reference": "Hoaxer",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 4.375284194946289,
      "timestamp": "2025-12-24T20:38:31.918632"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14502",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 62041,
      "natural_ratio": 0.47333526611328125,
      "max_context_tokens": 131072,
      "context_length": 123995,
      "question_length": 47,
      "prediction": "因为原文中并没有提到Ploni Almoni这个人，也没有说明他为什么没有娶路得。根据提供的背景信息，是路得的婆婆 Naomi 建议她回去找自己的家人，而路得选择留下。所以无法回答这个问题。",
      "reference": "Because he was unwilling to jeopardize the inheritance of his own estate. ",
      "metrics": {
        "f1": 0.16470588235294117
      },
      "elapsed_time": 2.209099531173706,
      "timestamp": "2025-12-24T20:38:34.128402"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26523",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 123245,
      "natural_ratio": 0.9402847290039062,
      "max_context_tokens": 131072,
      "context_length": 246399,
      "question_length": 51,
      "prediction": "Joe Komako died in Italy, saving Macreedy's life.",
      "reference": "The wildflowers.",
      "metrics": {
        "f1": 0.6285714285714286
      },
      "elapsed_time": 3.1959495544433594,
      "timestamp": "2025-12-24T20:38:37.324737"
    },
    {
      "sample_id": "squad_572683c7dd62a815002e87e2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 561,
      "natural_ratio": 0.00428009033203125,
      "max_context_tokens": 131072,
      "context_length": 1003,
      "question_length": 80,
      "prediction": "蒙古入侵",
      "reference": "The Pagan Kingdom fell due to the Mongol invasions and several warring states emerged",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.05453300476074219,
      "timestamp": "2025-12-24T20:38:37.379804"
    },
    {
      "sample_id": "squad_56f8245fa6d7ea1400e17405",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 367,
      "natural_ratio": 0.00279998779296875,
      "max_context_tokens": 131072,
      "context_length": 655,
      "question_length": 40,
      "prediction": "1454年",
      "reference": "1454",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07634949684143066,
      "timestamp": "2025-12-24T20:38:37.456310"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22979",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101280,
      "natural_ratio": 0.772705078125,
      "max_context_tokens": 131072,
      "context_length": 202501,
      "question_length": 20,
      "prediction": "Mental被自己的毒胶囊毒死。",
      "reference": "Harry and Lloyd give him rat poison pills believing that the pills are medication.",
      "metrics": {
        "f1": 0.34285714285714286
      },
      "elapsed_time": 3.5540108680725098,
      "timestamp": "2025-12-24T20:38:41.010703"
    },
    {
      "sample_id": "squad_5734580c879d6814001ca545",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 402,
      "natural_ratio": 0.0030670166015625,
      "max_context_tokens": 131072,
      "context_length": 689,
      "question_length": 76,
      "prediction": "4年",
      "reference": "four",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.050183773040771484,
      "timestamp": "2025-12-24T20:38:41.061064"
    },
    {
      "sample_id": "squad_5728d8be4b864d1900164f6d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 380,
      "natural_ratio": 0.002899169921875,
      "max_context_tokens": 131072,
      "context_length": 640,
      "question_length": 81,
      "prediction": "狩猎和钓鱼",
      "reference": "Hunting and fishing",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06490087509155273,
      "timestamp": "2025-12-24T20:38:41.126117"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30972",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 57903,
      "natural_ratio": 0.44176483154296875,
      "max_context_tokens": 131072,
      "context_length": 115683,
      "question_length": 84,
      "prediction": "Makovkina plans to spend the night with Prince Stepan Kasatsky, despite the initial reluctance of the hermit.",
      "reference": "To seduce him.",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 1.7729287147521973,
      "timestamp": "2025-12-24T20:38:42.899418"
    },
    {
      "sample_id": "squad_56f7366c711bf01900a44a66",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 234,
      "natural_ratio": 0.0017852783203125,
      "max_context_tokens": 131072,
      "context_length": 333,
      "question_length": 95,
      "prediction": "1945年",
      "reference": "1945",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07454061508178711,
      "timestamp": "2025-12-24T20:38:42.974172"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20599",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 104586,
      "natural_ratio": 0.7979278564453125,
      "max_context_tokens": 131072,
      "context_length": 209098,
      "question_length": 35,
      "prediction": "Dr. Evil是Scott的生物学父亲。",
      "reference": "Dr. Evil and Frau Farbissina",
      "metrics": {
        "f1": 0.5294117647058824
      },
      "elapsed_time": 2.595869541168213,
      "timestamp": "2025-12-24T20:38:45.570417"
    },
    {
      "sample_id": "squad_56e16caee3433e1400422f06",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 214,
      "natural_ratio": 0.0016326904296875,
      "max_context_tokens": 131072,
      "context_length": 338,
      "question_length": 50,
      "prediction": "The Germanic superstrate",
      "reference": "Germanic superstrate",
      "metrics": {
        "f1": 0.9090909090909091
      },
      "elapsed_time": 0.07578492164611816,
      "timestamp": "2025-12-24T20:38:45.646381"
    },
    {
      "sample_id": "squad_5728060d4b864d1900164266",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 573,
      "natural_ratio": 0.00437164306640625,
      "max_context_tokens": 131072,
      "context_length": 1015,
      "question_length": 92,
      "prediction": "文本中没有提到纳赛尔对美国和英国阻挠阿斯旺大坝建设的消息的反应。",
      "reference": "took great offense",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.2598249912261963,
      "timestamp": "2025-12-24T20:38:45.906378"
    },
    {
      "sample_id": "squad_57268e59708984140094c9f8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 311,
      "natural_ratio": 0.00237274169921875,
      "max_context_tokens": 131072,
      "context_length": 501,
      "question_length": 82,
      "prediction": "是的，Homo erectus 和 Homo sapiens 都在缅甸留下了存在证据。",
      "reference": "neolithic age domestication of plants and animals and the use of polished stone tools dating to sometime between 10,000 and 6,000 BC has been discovered",
      "metrics": {
        "f1": 0.5185185185185185
      },
      "elapsed_time": 0.21671223640441895,
      "timestamp": "2025-12-24T20:38:46.123302"
    },
    {
      "sample_id": "squad_56f7d6d8aef2371900625c2c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 418,
      "natural_ratio": 0.0031890869140625,
      "max_context_tokens": 131072,
      "context_length": 743,
      "question_length": 53,
      "prediction": "odwieczna（ perennial）",
      "reference": "odwieczna",
      "metrics": {
        "f1": 0.6
      },
      "elapsed_time": 0.09687495231628418,
      "timestamp": "2025-12-24T20:38:46.220344"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8668",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93901,
      "natural_ratio": 0.7164077758789062,
      "max_context_tokens": 131072,
      "context_length": 187723,
      "question_length": 40,
      "prediction": "通过书商介绍，Count Octave的秘书试图打开他的契约箱时得知了Honorine的秘密，这导致了后续的一系列事件，最终使他们得以相见。具体来说，是通过Count Octave的秘书了解到Honorine的情况后，间接促成了他们的见面。",
      "reference": "They grew up together because Honorine was adopted into the family. ",
      "metrics": {
        "f1": 0.25
      },
      "elapsed_time": 0.7636549472808838,
      "timestamp": "2025-12-24T20:38:46.984367"
    },
    {
      "sample_id": "squad_57285ba53acd2414000df94a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 349,
      "natural_ratio": 0.00266265869140625,
      "max_context_tokens": 131072,
      "context_length": 594,
      "question_length": 65,
      "prediction": "波斯阿契美尼德帝国",
      "reference": "Persian Achaemenid Empire",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1102895736694336,
      "timestamp": "2025-12-24T20:38:47.094842"
    },
    {
      "sample_id": "squad_57324d56b9d445190005e9ff",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 427,
      "natural_ratio": 0.00325775146484375,
      "max_context_tokens": 131072,
      "context_length": 745,
      "question_length": 69,
      "prediction": "Winston Churchill",
      "reference": "Churchill",
      "metrics": {
        "f1": 0.6923076923076924
      },
      "elapsed_time": 0.055947303771972656,
      "timestamp": "2025-12-24T20:38:47.150948"
    },
    {
      "sample_id": "squad_573031bb04bcaa1900d7733f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 458,
      "natural_ratio": 0.0034942626953125,
      "max_context_tokens": 131072,
      "context_length": 858,
      "question_length": 19,
      "prediction": "contracts是一组钩子，用于提供常见功能，可以与其他应用集成，包括搜索和分享。",
      "reference": "collection of hooks to provide common functionality that can integrate with other apps",
      "metrics": {
        "f1": 0.23728813559322035
      },
      "elapsed_time": 0.2367410659790039,
      "timestamp": "2025-12-24T20:38:47.387859"
    },
    {
      "sample_id": "squad_572ee029cb0c0d14000f1656",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 543,
      "natural_ratio": 0.00414276123046875,
      "max_context_tokens": 131072,
      "context_length": 1025,
      "question_length": 21,
      "prediction": "Liang Ji was forced to commit suicide.",
      "reference": "forced to commit suicide",
      "metrics": {
        "f1": 0.7741935483870968
      },
      "elapsed_time": 0.12422537803649902,
      "timestamp": "2025-12-24T20:38:47.512280"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20553",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122701,
      "natural_ratio": 0.9361343383789062,
      "max_context_tokens": 131072,
      "context_length": 245309,
      "question_length": 53,
      "prediction": "题目中并未提到Bart Gregory或他从死里复生帮助任何人的情节，因此无法作答。",
      "reference": "Joey Leubner",
      "metrics": {
        "f1": 0.2448979591836735
      },
      "elapsed_time": 3.6354074478149414,
      "timestamp": "2025-12-24T20:38:51.148042"
    },
    {
      "sample_id": "squad_572749b4708984140094dbc6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 663,
      "natural_ratio": 0.00505828857421875,
      "max_context_tokens": 131072,
      "context_length": 1195,
      "question_length": 92,
      "prediction": "四倍",
      "reference": "four times",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.051653146743774414,
      "timestamp": "2025-12-24T20:38:51.200225"
    },
    {
      "sample_id": "squad_57265236708984140094c24d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 338,
      "natural_ratio": 0.0025787353515625,
      "max_context_tokens": 131072,
      "context_length": 546,
      "question_length": 90,
      "prediction": "之前应用于防止欠曝的安全系数被放弃。",
      "reference": "safety factors against under-exposure",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.12563133239746094,
      "timestamp": "2025-12-24T20:38:51.326018"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10561",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 114735,
      "natural_ratio": 0.8753585815429688,
      "max_context_tokens": 131072,
      "context_length": 229340,
      "question_length": 91,
      "prediction": "Mademoiselle La Rue is a teacher to Charlotte Temple.",
      "reference": "She is her teacher",
      "metrics": {
        "f1": 0.72
      },
      "elapsed_time": 3.984031915664673,
      "timestamp": "2025-12-24T20:38:55.310435"
    },
    {
      "sample_id": "squad_5727f67d3acd2414000df0f0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 418,
      "natural_ratio": 0.0031890869140625,
      "max_context_tokens": 131072,
      "context_length": 754,
      "question_length": 42,
      "prediction": "Hussein Sirri Amer",
      "reference": "Hussein Sirri Amer",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09233713150024414,
      "timestamp": "2025-12-24T20:38:55.403307"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30134",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 88036,
      "natural_ratio": 0.671661376953125,
      "max_context_tokens": 131072,
      "context_length": 175985,
      "question_length": 47,
      "prediction": "Sykes可能误认为Julie背叛了Phil，或者他有其他心理原因导致他想要杀害Julie。但从给定的信息来看，具体原因未明确说明。根据剧情发展，最有可能的是Sykes误以为Julie背叛了Phil，从而产生了极端的反应。",
      "reference": "She rejected his advances",
      "metrics": {
        "f1": 0.12658227848101267
      },
      "elapsed_time": 0.7247765064239502,
      "timestamp": "2025-12-24T20:38:56.128453"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11014",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 32917,
      "natural_ratio": 0.25113677978515625,
      "max_context_tokens": 131072,
      "context_length": 65737,
      "question_length": 58,
      "prediction": "These are written to Aurora.",
      "reference": " How He Lied to Her Husband",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 0.8120391368865967,
      "timestamp": "2025-12-24T20:38:56.940792"
    },
    {
      "sample_id": "squad_56dfe23b7aa994140058e217",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 206,
      "natural_ratio": 0.0015716552734375,
      "max_context_tokens": 131072,
      "context_length": 266,
      "question_length": 107,
      "prediction": "O'Neill's",
      "reference": "O'Neill's",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05275273323059082,
      "timestamp": "2025-12-24T20:38:56.993732"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_256",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 32304,
      "natural_ratio": 0.2464599609375,
      "max_context_tokens": 131072,
      "context_length": 64510,
      "question_length": 59,
      "prediction": "Peace must be founded on the understanding that mankind is one united race.",
      "reference": "One human family",
      "metrics": {
        "f1": 0.625
      },
      "elapsed_time": 0.78605055809021,
      "timestamp": "2025-12-24T20:38:57.780029"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32129",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 110483,
      "natural_ratio": 0.8429183959960938,
      "max_context_tokens": 131072,
      "context_length": 220905,
      "question_length": 22,
      "prediction": "Baron Friedrich von Schoenvorts and his German crew",
      "reference": "The Germans",
      "metrics": {
        "f1": 0.7142857142857143
      },
      "elapsed_time": 3.9135208129882812,
      "timestamp": "2025-12-24T20:39:01.693926"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13624",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 112107,
      "natural_ratio": 0.8553085327148438,
      "max_context_tokens": 131072,
      "context_length": 224154,
      "question_length": 21,
      "prediction": "PAUL",
      "reference": "Zoil's female supervisor",
      "metrics": {
        "f1": 0.4210526315789474
      },
      "elapsed_time": 3.6815385818481445,
      "timestamp": "2025-12-24T20:39:05.376242"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26427",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 97591,
      "natural_ratio": 0.7445602416992188,
      "max_context_tokens": 131072,
      "context_length": 195071,
      "question_length": 72,
      "prediction": "Phelps试图使用吸盘和绳索从列车顶部逃生。",
      "reference": "A helicopter.",
      "metrics": {
        "f1": 0.23529411764705885
      },
      "elapsed_time": 3.333800792694092,
      "timestamp": "2025-12-24T20:39:08.710777"
    },
    {
      "sample_id": "squad_5709880eed30961900e842a7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 393,
      "natural_ratio": 0.00299835205078125,
      "max_context_tokens": 131072,
      "context_length": 638,
      "question_length": 109,
      "prediction": "reddish copper(I) oxide",
      "reference": "reddish",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 0.0898137092590332,
      "timestamp": "2025-12-24T20:39:08.800775"
    },
    {
      "sample_id": "squad_5726e30ddd62a815002e93fc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 438,
      "natural_ratio": 0.0033416748046875,
      "max_context_tokens": 131072,
      "context_length": 783,
      "question_length": 54,
      "prediction": "撒哈拉沙漠",
      "reference": "Sahara",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06770920753479004,
      "timestamp": "2025-12-24T20:39:08.868641"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11528",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 112953,
      "natural_ratio": 0.8617630004882812,
      "max_context_tokens": 131072,
      "context_length": 225819,
      "question_length": 47,
      "prediction": "Raffles Haw views McIntyre Senior with suspicion and concern, believing his obsession with wealth and his attempts to exploit him are signs of a dangerous and potentially harmful influence.",
      "reference": "Raffles Haw thinks McIntyre Senior is a greedy beggar",
      "metrics": {
        "f1": 0.8444444444444444
      },
      "elapsed_time": 4.18000864982605,
      "timestamp": "2025-12-24T20:39:13.048991"
    },
    {
      "sample_id": "squad_571a9be84faf5e1900b8ab2e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 354,
      "natural_ratio": 0.0027008056640625,
      "max_context_tokens": 131072,
      "context_length": 586,
      "question_length": 82,
      "prediction": "Fertile Crescent",
      "reference": "Fertile Crescent",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07178282737731934,
      "timestamp": "2025-12-24T20:39:13.121304"
    },
    {
      "sample_id": "squad_570e22860dc6ce1900204de7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 380,
      "natural_ratio": 0.002899169921875,
      "max_context_tokens": 131072,
      "context_length": 679,
      "question_length": 41,
      "prediction": "Eritrea has no official language.",
      "reference": "The nation has no official language",
      "metrics": {
        "f1": 0.9333333333333333
      },
      "elapsed_time": 0.11837029457092285,
      "timestamp": "2025-12-24T20:39:13.239845"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9707",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 22689,
      "natural_ratio": 0.17310333251953125,
      "max_context_tokens": 131072,
      "context_length": 45323,
      "question_length": 16,
      "prediction": "文本中并没有提到名为Clemenza的人物。",
      "reference": "A Detective that investigates Frye",
      "metrics": {
        "f1": 0.2222222222222222
      },
      "elapsed_time": 0.571014404296875,
      "timestamp": "2025-12-24T20:39:13.811165"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28839",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 100823,
      "natural_ratio": 0.7692184448242188,
      "max_context_tokens": 131072,
      "context_length": 201565,
      "question_length": 42,
      "prediction": "因为Lina认为独立和自给自足比婚姻更重要，她不愿意接受约翰尼的求婚，而是选择从事自己的职业。",
      "reference": "She wants to remain independent in every way.",
      "metrics": {
        "f1": 0.10344827586206896
      },
      "elapsed_time": 4.34594464302063,
      "timestamp": "2025-12-24T20:39:18.157476"
    },
    {
      "sample_id": "squad_56fad9cef34c681400b0c165",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 246,
      "natural_ratio": 0.0018768310546875,
      "max_context_tokens": 131072,
      "context_length": 386,
      "question_length": 66,
      "prediction": "20世纪",
      "reference": "20th",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 0.05768156051635742,
      "timestamp": "2025-12-24T20:39:18.215338"
    },
    {
      "sample_id": "squad_5731b5b50fdd8d15006c6472",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 683,
      "natural_ratio": 0.00521087646484375,
      "max_context_tokens": 131072,
      "context_length": 1258,
      "question_length": 69,
      "prediction": "Quakers",
      "reference": "Quakers",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05015707015991211,
      "timestamp": "2025-12-24T20:39:18.265646"
    },
    {
      "sample_id": "squad_572c9bb7f182dd1900d7c7f9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 453,
      "natural_ratio": 0.00345611572265625,
      "max_context_tokens": 131072,
      "context_length": 797,
      "question_length": 69,
      "prediction": "Herbert Hoover",
      "reference": "Herbert Hoover",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0550684928894043,
      "timestamp": "2025-12-24T20:39:18.320861"
    },
    {
      "sample_id": "squad_572a15493f37b319004786cc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 393,
      "natural_ratio": 0.00299835205078125,
      "max_context_tokens": 131072,
      "context_length": 689,
      "question_length": 58,
      "prediction": "so-called Ghazi emirates",
      "reference": "Ghazi emirates",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.08458137512207031,
      "timestamp": "2025-12-24T20:39:18.405587"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15623",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 48723,
      "natural_ratio": 0.37172698974609375,
      "max_context_tokens": 131072,
      "context_length": 97357,
      "question_length": 50,
      "prediction": "1780",
      "reference": "1785",
      "metrics": {
        "f1": 0.75
      },
      "elapsed_time": 1.1533501148223877,
      "timestamp": "2025-12-24T20:39:19.559278"
    },
    {
      "sample_id": "squad_572ee81bdfa6aa1500f8d4b7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 290,
      "natural_ratio": 0.0022125244140625,
      "max_context_tokens": 131072,
      "context_length": 515,
      "question_length": 26,
      "prediction": "Dong Zhuo burned Luoyang to the ground.",
      "reference": "burned",
      "metrics": {
        "f1": 0.2666666666666667
      },
      "elapsed_time": 0.14832735061645508,
      "timestamp": "2025-12-24T20:39:19.707782"
    },
    {
      "sample_id": "squad_57324bd1b9d445190005e9db",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 738,
      "natural_ratio": 0.0056304931640625,
      "max_context_tokens": 131072,
      "context_length": 1383,
      "question_length": 54,
      "prediction": "Kingdom Halls",
      "reference": "Kingdom Halls",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07049798965454102,
      "timestamp": "2025-12-24T20:39:19.778438"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13861",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101481,
      "natural_ratio": 0.7742385864257812,
      "max_context_tokens": 131072,
      "context_length": 202883,
      "question_length": 39,
      "prediction": "因为观察员们在进行秘密实验，为了掩盖错误，他们故意沉没了那艘苏联船。",
      "reference": "the mutagen experiment got out of control",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.1768884658813477,
      "timestamp": "2025-12-24T20:39:22.955736"
    },
    {
      "sample_id": "squad_570cee7ffed7b91900d45aff",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 342,
      "natural_ratio": 0.0026092529296875,
      "max_context_tokens": 131072,
      "context_length": 606,
      "question_length": 39,
      "prediction": "Infected plant cells",
      "reference": "The Ti and Ri plasmids",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.0698232650756836,
      "timestamp": "2025-12-24T20:39:23.025751"
    },
    {
      "sample_id": "squad_56ce7bf4aab44d1400b887f4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 573,
      "natural_ratio": 0.00437164306640625,
      "max_context_tokens": 131072,
      "context_length": 1040,
      "question_length": 66,
      "prediction": "Vinnie Chieco",
      "reference": "Vinnie Chieco",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0798959732055664,
      "timestamp": "2025-12-24T20:39:23.105809"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13096",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 16678,
      "natural_ratio": 0.1272430419921875,
      "max_context_tokens": 131072,
      "context_length": 33278,
      "question_length": 39,
      "prediction": "Prince Prospero initially reacted with a strong shudder of terror or disgust when he saw the stranger, but then his brow reddened with rage.",
      "reference": "He was angry and demanded to know his identity.",
      "metrics": {
        "f1": 0.7692307692307693
      },
      "elapsed_time": 0.6175518035888672,
      "timestamp": "2025-12-24T20:39:23.723604"
    },
    {
      "sample_id": "squad_56f8a2749e9bad19000a0227",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 200,
      "natural_ratio": 0.00152587890625,
      "max_context_tokens": 131072,
      "context_length": 336,
      "question_length": 24,
      "prediction": "湖泊",
      "reference": "lakes",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.031095027923583984,
      "timestamp": "2025-12-24T20:39:23.754865"
    },
    {
      "sample_id": "squad_57282ded2ca10214002da003",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 313,
      "natural_ratio": 0.00238800048828125,
      "max_context_tokens": 131072,
      "context_length": 548,
      "question_length": 39,
      "prediction": "Aristotle added historical empirical evidence to Plato's analysis.",
      "reference": "historical empirical evidence",
      "metrics": {
        "f1": 0.6105263157894737
      },
      "elapsed_time": 0.14285683631896973,
      "timestamp": "2025-12-24T20:39:23.897874"
    },
    {
      "sample_id": "squad_5728ac7b2ca10214002da596",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 335,
      "natural_ratio": 0.00255584716796875,
      "max_context_tokens": 131072,
      "context_length": 573,
      "question_length": 58,
      "prediction": "$40,042",
      "reference": "$40,042",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09790325164794922,
      "timestamp": "2025-12-24T20:39:23.995922"
    },
    {
      "sample_id": "squad_57341dc0d058e614000b696d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 590,
      "natural_ratio": 0.0045013427734375,
      "max_context_tokens": 131072,
      "context_length": 1098,
      "question_length": 43,
      "prediction": "anaerobic bacteria species",
      "reference": "anaerobic bacteria",
      "metrics": {
        "f1": 0.8181818181818181
      },
      "elapsed_time": 0.07616996765136719,
      "timestamp": "2025-12-24T20:39:24.072238"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17617",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 108678,
      "natural_ratio": 0.8291473388671875,
      "max_context_tokens": 131072,
      "context_length": 217262,
      "question_length": 54,
      "prediction": "电疗师套装、无限通讯器、不可限量的保护衣",
      "reference": "garment of protection, a record of events, spectacles that are a character marker",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.6004226207733154,
      "timestamp": "2025-12-24T20:39:27.673036"
    },
    {
      "sample_id": "squad_570a83046d058f1900182ef6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 523,
      "natural_ratio": 0.00399017333984375,
      "max_context_tokens": 131072,
      "context_length": 931,
      "question_length": 75,
      "prediction": "用 runes 系统书写。",
      "reference": "runic system",
      "metrics": {
        "f1": 0.5217391304347826
      },
      "elapsed_time": 0.1024172306060791,
      "timestamp": "2025-12-24T20:39:27.775996"
    },
    {
      "sample_id": "squad_5729f6063f37b31900478612",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 401,
      "natural_ratio": 0.00305938720703125,
      "max_context_tokens": 131072,
      "context_length": 599,
      "question_length": 163,
      "prediction": "飓风",
      "reference": "hurricane",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04485964775085449,
      "timestamp": "2025-12-24T20:39:27.821013"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5607",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 107124,
      "natural_ratio": 0.817291259765625,
      "max_context_tokens": 131072,
      "context_length": 214118,
      "question_length": 91,
      "prediction": "Parker",
      "reference": "Parker family",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 5.288559198379517,
      "timestamp": "2025-12-24T20:39:33.109994"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31000",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 73682,
      "natural_ratio": 0.5621490478515625,
      "max_context_tokens": 131072,
      "context_length": 147294,
      "question_length": 31,
      "prediction": "Nix的家附近",
      "reference": "The Mojave Desert.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.349414348602295,
      "timestamp": "2025-12-24T20:39:36.460122"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13491",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 80714,
      "natural_ratio": 0.6157989501953125,
      "max_context_tokens": 131072,
      "context_length": 161328,
      "question_length": 61,
      "prediction": "Grace发现莱昂纳德对她撒了谎，他在最近两天内亲吻过Julia。",
      "reference": "He is having an affair with Julia.",
      "metrics": {
        "f1": 0.32653061224489793
      },
      "elapsed_time": 2.899686336517334,
      "timestamp": "2025-12-24T20:39:39.360212"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28194",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 53140,
      "natural_ratio": 0.405426025390625,
      "max_context_tokens": 131072,
      "context_length": 106158,
      "question_length": 83,
      "prediction": "Lavarcham suggests that Conchubor should go to his hut and leave Deirdre alone, implying that she is broken and不会再继续对话。",
      "reference": "She is old and ugly",
      "metrics": {
        "f1": 0.6046511627906976
      },
      "elapsed_time": 1.974491834640503,
      "timestamp": "2025-12-24T20:39:41.335045"
    },
    {
      "sample_id": "squad_56f82667a6d7ea1400e1741d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 298,
      "natural_ratio": 0.0022735595703125,
      "max_context_tokens": 131072,
      "context_length": 513,
      "question_length": 44,
      "prediction": "1492年",
      "reference": "1492",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.0775151252746582,
      "timestamp": "2025-12-24T20:39:41.412758"
    },
    {
      "sample_id": "squad_5727c79f3acd2414000dec2d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 358,
      "natural_ratio": 0.0027313232421875,
      "max_context_tokens": 131072,
      "context_length": 609,
      "question_length": 68,
      "prediction": "2",
      "reference": "two",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.03474307060241699,
      "timestamp": "2025-12-24T20:39:41.447659"
    },
    {
      "sample_id": "squad_56de71114396321400ee28fc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 272,
      "natural_ratio": 0.0020751953125,
      "max_context_tokens": 131072,
      "context_length": 418,
      "question_length": 86,
      "prediction": "The Cathar Crusade",
      "reference": "The Cathar Crusade",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07341361045837402,
      "timestamp": "2025-12-24T20:39:41.521226"
    },
    {
      "sample_id": "squad_57300b16b2c2fd140056879d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 394,
      "natural_ratio": 0.0030059814453125,
      "max_context_tokens": 131072,
      "context_length": 702,
      "question_length": 47,
      "prediction": "Karim Khan",
      "reference": "Karim Khan",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0550382137298584,
      "timestamp": "2025-12-24T20:39:41.576417"
    },
    {
      "sample_id": "squad_570aae5d4103511400d59921",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 488,
      "natural_ratio": 0.00372314453125,
      "max_context_tokens": 131072,
      "context_length": 877,
      "question_length": 60,
      "prediction": "ski-jump ramp converts some of the forward rolling movement of the aircraft into vertical velocity",
      "reference": "some of the forward rolling movement of the aircraft into vertical velocity",
      "metrics": {
        "f1": 0.8695652173913044
      },
      "elapsed_time": 0.19467973709106445,
      "timestamp": "2025-12-24T20:39:41.771256"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32136",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 76457,
      "natural_ratio": 0.5833206176757812,
      "max_context_tokens": 131072,
      "context_length": 152825,
      "question_length": 50,
      "prediction": "Miami Dolphins和Philadelphia Eagles",
      "reference": "Miami Dolphins and Philadelphia Eagles",
      "metrics": {
        "f1": 0.962962962962963
      },
      "elapsed_time": 3.5390114784240723,
      "timestamp": "2025-12-24T20:39:45.310644"
    },
    {
      "sample_id": "squad_56e79d2300c9c71400d773c0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 338,
      "natural_ratio": 0.0025787353515625,
      "max_context_tokens": 131072,
      "context_length": 566,
      "question_length": 70,
      "prediction": "2005年",
      "reference": "2005",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07955622673034668,
      "timestamp": "2025-12-24T20:39:45.390400"
    },
    {
      "sample_id": "squad_57303815947a6a140053d2c9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 400,
      "natural_ratio": 0.0030517578125,
      "max_context_tokens": 131072,
      "context_length": 670,
      "question_length": 90,
      "prediction": "65部",
      "reference": "65",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.05468297004699707,
      "timestamp": "2025-12-24T20:39:45.445249"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31820",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113890,
      "natural_ratio": 0.8689117431640625,
      "max_context_tokens": 131072,
      "context_length": 227701,
      "question_length": 40,
      "prediction": "John F. Kennedy",
      "reference": "President Kennedy",
      "metrics": {
        "f1": 0.5454545454545454
      },
      "elapsed_time": 5.700146913528442,
      "timestamp": "2025-12-24T20:39:51.145727"
    },
    {
      "sample_id": "squad_572fc958a23a5019007fc9d9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 594,
      "natural_ratio": 0.0045318603515625,
      "max_context_tokens": 131072,
      "context_length": 1084,
      "question_length": 64,
      "prediction": "世界舆论，特别是美国，谴责日本。",
      "reference": "condemned",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1261129379272461,
      "timestamp": "2025-12-24T20:39:51.272389"
    },
    {
      "sample_id": "squad_56e196cfcd28a01900c679f0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 154,
      "natural_ratio": 0.0011749267578125,
      "max_context_tokens": 131072,
      "context_length": 242,
      "question_length": 27,
      "prediction": "在生物化学反应中通过排出还原当量来产生。",
      "reference": "expelling reducing equivalents in biochemical reactions",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.14827752113342285,
      "timestamp": "2025-12-24T20:39:51.420845"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19401",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 78939,
      "natural_ratio": 0.6022567749023438,
      "max_context_tokens": 131072,
      "context_length": 157782,
      "question_length": 56,
      "prediction": "因为Debo说要找Craig报仇，Craig的父亲担心他安全，所以送他去叔叔家。",
      "reference": "Because the neighborhood bully, Deebo, would be getting out of jail soon",
      "metrics": {
        "f1": 0.3396226415094339
      },
      "elapsed_time": 2.4526219367980957,
      "timestamp": "2025-12-24T20:39:53.873822"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24251",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105035,
      "natural_ratio": 0.8013534545898438,
      "max_context_tokens": 131072,
      "context_length": 209999,
      "question_length": 32,
      "prediction": "Jason",
      "reference": "Jason",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.203779458999634,
      "timestamp": "2025-12-24T20:39:57.078048"
    },
    {
      "sample_id": "squad_56cf5187aab44d1400b88fc6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 308,
      "natural_ratio": 0.002349853515625,
      "max_context_tokens": 131072,
      "context_length": 466,
      "question_length": 110,
      "prediction": "The New York Times",
      "reference": "The New York Times",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08024358749389648,
      "timestamp": "2025-12-24T20:39:57.158488"
    },
    {
      "sample_id": "squad_57315d3b05b4da19006bd0e4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 580,
      "natural_ratio": 0.004425048828125,
      "max_context_tokens": 131072,
      "context_length": 1085,
      "question_length": 36,
      "prediction": "Hunan region",
      "reference": "Hunan region where it was raised",
      "metrics": {
        "f1": 0.8333333333333333
      },
      "elapsed_time": 0.059839725494384766,
      "timestamp": "2025-12-24T20:39:57.218505"
    },
    {
      "sample_id": "squad_57313cc0a5e9cc1400cdbd7f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 416,
      "natural_ratio": 0.003173828125,
      "max_context_tokens": 131072,
      "context_length": 754,
      "question_length": 39,
      "prediction": "红、绿、蓝",
      "reference": "red, green and blue",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.07442855834960938,
      "timestamp": "2025-12-24T20:39:57.293094"
    },
    {
      "sample_id": "squad_56db1b38e7c41114004b4d2d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 393,
      "natural_ratio": 0.00299835205078125,
      "max_context_tokens": 131072,
      "context_length": 692,
      "question_length": 55,
      "prediction": "Green Party officials",
      "reference": "Green Party officials.",
      "metrics": {
        "f1": 0.967741935483871
      },
      "elapsed_time": 0.05404257774353027,
      "timestamp": "2025-12-24T20:39:57.347299"
    },
    {
      "sample_id": "squad_57342720d058e614000b6a26",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 453,
      "natural_ratio": 0.00345611572265625,
      "max_context_tokens": 131072,
      "context_length": 845,
      "question_length": 22,
      "prediction": "Immunoassays are techniques developed from complex serological techniques, using antibody-antigen binding to produce signals that can be detected by instrumentation, allowing for the quantitation of target antigens and the detection or measurement of antigens from infectious agents or immune responses.",
      "reference": "Complex serological techniques",
      "metrics": {
        "f1": 0.18018018018018017
      },
      "elapsed_time": 0.5528721809387207,
      "timestamp": "2025-12-24T20:39:57.900337"
    },
    {
      "sample_id": "squad_5731906a497a881900249042",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 346,
      "natural_ratio": 0.0026397705078125,
      "max_context_tokens": 131072,
      "context_length": 606,
      "question_length": 47,
      "prediction": "希腊神话",
      "reference": "Greek",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04364633560180664,
      "timestamp": "2025-12-24T20:39:57.944139"
    },
    {
      "sample_id": "squad_570a84d76d058f1900182f14",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 304,
      "natural_ratio": 0.0023193359375,
      "max_context_tokens": 131072,
      "context_length": 536,
      "question_length": 33,
      "prediction": "Scyld",
      "reference": "Scyld",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05399656295776367,
      "timestamp": "2025-12-24T20:39:57.998287"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_483",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 90366,
      "natural_ratio": 0.6894378662109375,
      "max_context_tokens": 131072,
      "context_length": 180667,
      "question_length": 26,
      "prediction": "Megan和Kristi",
      "reference": "Hikers",
      "metrics": {
        "f1": 0.5882352941176471
      },
      "elapsed_time": 2.493011474609375,
      "timestamp": "2025-12-24T20:40:00.491692"
    },
    {
      "sample_id": "squad_56df4fa496943c1400a5d36e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 300,
      "natural_ratio": 0.002288818359375,
      "max_context_tokens": 131072,
      "context_length": 511,
      "question_length": 49,
      "prediction": "Vladimir Lenin",
      "reference": "Vladimir Lenin",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06928133964538574,
      "timestamp": "2025-12-24T20:40:00.561165"
    },
    {
      "sample_id": "squad_570f92ba80d9841400ab35cd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 486,
      "natural_ratio": 0.0037078857421875,
      "max_context_tokens": 131072,
      "context_length": 860,
      "question_length": 72,
      "prediction": "女王的私人财富",
      "reference": "private wealth",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06585192680358887,
      "timestamp": "2025-12-24T20:40:00.627180"
    },
    {
      "sample_id": "squad_572843362ca10214002da1e5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 354,
      "natural_ratio": 0.0027008056640625,
      "max_context_tokens": 131072,
      "context_length": 589,
      "question_length": 80,
      "prediction": "Operation Infinite Reach",
      "reference": "Operation Infinite Reach",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05465340614318848,
      "timestamp": "2025-12-24T20:40:00.681990"
    },
    {
      "sample_id": "squad_5726434b271a42140099d800",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 363,
      "natural_ratio": 0.00276947021484375,
      "max_context_tokens": 131072,
      "context_length": 634,
      "question_length": 52,
      "prediction": "两政府",
      "reference": "two",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.044170379638671875,
      "timestamp": "2025-12-24T20:40:00.726310"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32150",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 76442,
      "natural_ratio": 0.5832061767578125,
      "max_context_tokens": 131072,
      "context_length": 152825,
      "question_length": 19,
      "prediction": "Ace Ventura is a pet detective with a talent for solving cases involving missing pets and other animals.",
      "reference": "A Private Investigater in Miami Fl.",
      "metrics": {
        "f1": 0.8333333333333333
      },
      "elapsed_time": 0.3394606113433838,
      "timestamp": "2025-12-24T20:40:01.066051"
    },
    {
      "sample_id": "squad_570feb9480d9841400ab3732",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 327,
      "natural_ratio": 0.00249481201171875,
      "max_context_tokens": 131072,
      "context_length": 555,
      "question_length": 60,
      "prediction": "50,000",
      "reference": "50,000",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08648419380187988,
      "timestamp": "2025-12-24T20:40:01.152730"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_6482",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 121463,
      "natural_ratio": 0.9266891479492188,
      "max_context_tokens": 131072,
      "context_length": 242784,
      "question_length": 103,
      "prediction": "在墓地的入口附近",
      "reference": "A barn.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.98868989944458,
      "timestamp": "2025-12-24T20:40:06.141805"
    },
    {
      "sample_id": "squad_56dda5109a695914005b957e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 180,
      "natural_ratio": 0.001373291015625,
      "max_context_tokens": 131072,
      "context_length": 262,
      "question_length": 59,
      "prediction": "Greek",
      "reference": "Greek",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.037251949310302734,
      "timestamp": "2025-12-24T20:40:06.179593"
    },
    {
      "sample_id": "squad_5730146f947a6a140053d07c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 460,
      "natural_ratio": 0.003509521484375,
      "max_context_tokens": 131072,
      "context_length": 816,
      "question_length": 65,
      "prediction": "The Regency style",
      "reference": "The Regency style",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06588029861450195,
      "timestamp": "2025-12-24T20:40:06.245639"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_6310",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 76974,
      "natural_ratio": 0.5872650146484375,
      "max_context_tokens": 131072,
      "context_length": 153821,
      "question_length": 87,
      "prediction": "因为The Dude认为那个Jeff Lebowski是故意陷害他，让他承担被尿湿的地毯费用，实际上那笔钱是Lebowski自己藏起来了。",
      "reference": "The Dude thinks he should because he was beat up by mistake.",
      "metrics": {
        "f1": 0.37142857142857144
      },
      "elapsed_time": 3.896620750427246,
      "timestamp": "2025-12-24T20:40:10.142636"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4882",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115813,
      "natural_ratio": 0.8835830688476562,
      "max_context_tokens": 131072,
      "context_length": 231539,
      "question_length": 48,
      "prediction": "Harley was boarded at a schoolhouse that stood desolate and was later pulled down.",
      "reference": "He lost both of his parents.",
      "metrics": {
        "f1": 0.7878787878787877
      },
      "elapsed_time": 4.570826768875122,
      "timestamp": "2025-12-24T20:40:14.713822"
    },
    {
      "sample_id": "squad_56cf61d3aab44d1400b891a3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 376,
      "natural_ratio": 0.00286865234375,
      "max_context_tokens": 131072,
      "context_length": 650,
      "question_length": 62,
      "prediction": "1817年",
      "reference": "1817",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.08321928977966309,
      "timestamp": "2025-12-24T20:40:14.797582"
    },
    {
      "sample_id": "squad_5726166d89a1e219009ac22c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 424,
      "natural_ratio": 0.00323486328125,
      "max_context_tokens": 131072,
      "context_length": 733,
      "question_length": 75,
      "prediction": "Hellenistic科学",
      "reference": "Hellenistic",
      "metrics": {
        "f1": 0.9166666666666666
      },
      "elapsed_time": 0.06521153450012207,
      "timestamp": "2025-12-24T20:40:14.862959"
    },
    {
      "sample_id": "squad_572ea7e9dfa6aa1500f8d268",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 585,
      "natural_ratio": 0.00446319580078125,
      "max_context_tokens": 131072,
      "context_length": 1021,
      "question_length": 109,
      "prediction": "联合国安全理事会",
      "reference": "United Nations Security Council",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.05517864227294922,
      "timestamp": "2025-12-24T20:40:14.918296"
    },
    {
      "sample_id": "squad_5731408a05b4da19006bcf42",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 331,
      "natural_ratio": 0.00252532958984375,
      "max_context_tokens": 131072,
      "context_length": 574,
      "question_length": 48,
      "prediction": "意大利",
      "reference": "Rome",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.03383016586303711,
      "timestamp": "2025-12-24T20:40:14.952272"
    },
    {
      "sample_id": "squad_57342891d058e614000b6a5c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 451,
      "natural_ratio": 0.00344085693359375,
      "max_context_tokens": 131072,
      "context_length": 801,
      "question_length": 61,
      "prediction": "White House of the Confederacy",
      "reference": "White House of the Confederacy",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08496379852294922,
      "timestamp": "2025-12-24T20:40:15.037386"
    },
    {
      "sample_id": "squad_56d1314517492d1400aabbce",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 389,
      "natural_ratio": 0.00296783447265625,
      "max_context_tokens": 131072,
      "context_length": 658,
      "question_length": 81,
      "prediction": "电池寿命",
      "reference": "battery life",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04438185691833496,
      "timestamp": "2025-12-24T20:40:15.081913"
    },
    {
      "sample_id": "squad_572935dd3f37b3190047810b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 660,
      "natural_ratio": 0.005035400390625,
      "max_context_tokens": 131072,
      "context_length": 1241,
      "question_length": 39,
      "prediction": "AD 260",
      "reference": "AD 260",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07886815071105957,
      "timestamp": "2025-12-24T20:40:15.160928"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1029",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 118100,
      "natural_ratio": 0.901031494140625,
      "max_context_tokens": 131072,
      "context_length": 236034,
      "question_length": 127,
      "prediction": "30年",
      "reference": "30 years",
      "metrics": {
        "f1": 0.36363636363636365
      },
      "elapsed_time": 4.328776836395264,
      "timestamp": "2025-12-24T20:40:19.490043"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14270",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 52170,
      "natural_ratio": 0.3980255126953125,
      "max_context_tokens": 131072,
      "context_length": 104242,
      "question_length": 59,
      "prediction": "EUR.",
      "reference": "Aeschulys",
      "metrics": {
        "f1": 0.3333333333333333
      },
      "elapsed_time": 1.5853853225708008,
      "timestamp": "2025-12-24T20:40:21.076125"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1683",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 91296,
      "natural_ratio": 0.696533203125,
      "max_context_tokens": 131072,
      "context_length": 182517,
      "question_length": 35,
      "prediction": "Doreen不会去南美。",
      "reference": "Jack and Anna",
      "metrics": {
        "f1": 0.2222222222222222
      },
      "elapsed_time": 2.0894007682800293,
      "timestamp": "2025-12-24T20:40:23.165886"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20043",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 102888,
      "natural_ratio": 0.78497314453125,
      "max_context_tokens": 131072,
      "context_length": 205659,
      "question_length": 77,
      "prediction": "俄罗斯大使馆",
      "reference": "Russia",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.3892126083374023,
      "timestamp": "2025-12-24T20:40:26.555512"
    },
    {
      "sample_id": "squad_56df680f8bc80c19004e4bdd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 497,
      "natural_ratio": 0.00379180908203125,
      "max_context_tokens": 131072,
      "context_length": 907,
      "question_length": 47,
      "prediction": "在发展中国家的干旱地区或热带森林中。",
      "reference": "in the developing world",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1621997356414795,
      "timestamp": "2025-12-24T20:40:26.717949"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31368",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 84442,
      "natural_ratio": 0.6442413330078125,
      "max_context_tokens": 131072,
      "context_length": 168813,
      "question_length": 31,
      "prediction": "Cecily is Louise Derrick's half-sister.",
      "reference": "HER STEPSISTER",
      "metrics": {
        "f1": 0.5185185185185185
      },
      "elapsed_time": 2.7310824394226074,
      "timestamp": "2025-12-24T20:40:29.449417"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26233",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122916,
      "natural_ratio": 0.937774658203125,
      "max_context_tokens": 131072,
      "context_length": 245733,
      "question_length": 59,
      "prediction": "如果外星生物逃到文明地区，它可能会模仿人类或其他生物，对人们造成威胁或混乱。",
      "reference": "All life on Earth will be assimilated.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.3560595512390137,
      "timestamp": "2025-12-24T20:40:32.805846"
    },
    {
      "sample_id": "squad_56f9914a9b226e1400dd15ae",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 404,
      "natural_ratio": 0.003082275390625,
      "max_context_tokens": 131072,
      "context_length": 708,
      "question_length": 61,
      "prediction": "Zhejiang与Anhui的边境地区。",
      "reference": "with Anhui",
      "metrics": {
        "f1": 0.4166666666666667
      },
      "elapsed_time": 0.14043831825256348,
      "timestamp": "2025-12-24T20:40:32.946828"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11641",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 50526,
      "natural_ratio": 0.3854827880859375,
      "max_context_tokens": 131072,
      "context_length": 100969,
      "question_length": 43,
      "prediction": "因为乔治认为哈格对他做了无法原谅的事情，尽管他后来意识到他应该原谅哈格。",
      "reference": "He had done a terrible harm",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 1.5172369480133057,
      "timestamp": "2025-12-24T20:40:34.464408"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17570",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 71289,
      "natural_ratio": 0.5438919067382812,
      "max_context_tokens": 131072,
      "context_length": 142518,
      "question_length": 21,
      "prediction": "The setting is Egypt.",
      "reference": "Egypt in 48 BC",
      "metrics": {
        "f1": 0.6956521739130435
      },
      "elapsed_time": 0.908350944519043,
      "timestamp": "2025-12-24T20:40:35.373073"
    },
    {
      "sample_id": "squad_572794a8708984140094e173",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 897,
      "natural_ratio": 0.00684356689453125,
      "max_context_tokens": 131072,
      "context_length": 1715,
      "question_length": 39,
      "prediction": "BBC和Sky Sports",
      "reference": "From 1988 to 1997, the BBC and Sky Sports had coverage of the FA Cup",
      "metrics": {
        "f1": 0.5555555555555556
      },
      "elapsed_time": 0.07627296447753906,
      "timestamp": "2025-12-24T20:40:35.449566"
    },
    {
      "sample_id": "squad_5727f7e32ca10214002d9a6d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 626,
      "natural_ratio": 0.0047760009765625,
      "max_context_tokens": 131072,
      "context_length": 1149,
      "question_length": 64,
      "prediction": "法国大革命",
      "reference": "the French Revolution of 1789",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.0627896785736084,
      "timestamp": "2025-12-24T20:40:35.512567"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4340",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96523,
      "natural_ratio": 0.7364120483398438,
      "max_context_tokens": 131072,
      "context_length": 192961,
      "question_length": 46,
      "prediction": "圣诞节",
      "reference": "Christmas",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 5.0360352993011475,
      "timestamp": "2025-12-24T20:40:40.548953"
    },
    {
      "sample_id": "squad_57282d9cff5b5019007d9ea0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 340,
      "natural_ratio": 0.002593994140625,
      "max_context_tokens": 131072,
      "context_length": 556,
      "question_length": 84,
      "prediction": "否",
      "reference": "Von Neumann–Wigner interpretation never gained acceptance amongst the majority of physicists",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04067206382751465,
      "timestamp": "2025-12-24T20:40:40.589820"
    },
    {
      "sample_id": "squad_5725e29c271a42140099d2e1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 562,
      "natural_ratio": 0.0042877197265625,
      "max_context_tokens": 131072,
      "context_length": 1032,
      "question_length": 52,
      "prediction": "190 BC",
      "reference": "190 BC",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06982207298278809,
      "timestamp": "2025-12-24T20:40:40.659805"
    },
    {
      "sample_id": "squad_57344892879d6814001ca482",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 402,
      "natural_ratio": 0.0030670166015625,
      "max_context_tokens": 131072,
      "context_length": 681,
      "question_length": 83,
      "prediction": "狩猎限制",
      "reference": "hunting restrictions",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.054593563079833984,
      "timestamp": "2025-12-24T20:40:40.714556"
    },
    {
      "sample_id": "squad_56f8d0389e9bad19000a0580",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 192,
      "natural_ratio": 0.00146484375,
      "max_context_tokens": 131072,
      "context_length": 301,
      "question_length": 44,
      "prediction": "mid-1991",
      "reference": "mid-1991",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07981491088867188,
      "timestamp": "2025-12-24T20:40:40.794517"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4888",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115813,
      "natural_ratio": 0.8835830688476562,
      "max_context_tokens": 131072,
      "context_length": 231539,
      "question_length": 48,
      "prediction": "Miss Walton is going to be married to Sir Harry Benson。",
      "reference": "Miss Walton and Sir Harry Benson.",
      "metrics": {
        "f1": 0.9142857142857143
      },
      "elapsed_time": 0.31196165084838867,
      "timestamp": "2025-12-24T20:40:41.106839"
    },
    {
      "sample_id": "squad_570629ba52bb891400689917",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 324,
      "natural_ratio": 0.002471923828125,
      "max_context_tokens": 131072,
      "context_length": 559,
      "question_length": 49,
      "prediction": "CD参数可以作为压缩比参考中的44.1 kHz，2通道每通道16位的参考。",
      "reference": "compression ratios",
      "metrics": {
        "f1": 0.0975609756097561
      },
      "elapsed_time": 0.26800036430358887,
      "timestamp": "2025-12-24T20:40:41.375373"
    },
    {
      "sample_id": "squad_5730cf87b54a4f140068cc77",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 847,
      "natural_ratio": 0.00646209716796875,
      "max_context_tokens": 131072,
      "context_length": 1603,
      "question_length": 51,
      "prediction": "1946年",
      "reference": "1946",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.08027195930480957,
      "timestamp": "2025-12-24T20:40:41.455814"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30360",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 65068,
      "natural_ratio": 0.496429443359375,
      "max_context_tokens": 131072,
      "context_length": 130015,
      "question_length": 81,
      "prediction": "他们之前在岛上到处都发现了类似豌豆坑一样的沙坑，现在发现和尸体身体上的坑痕形状相似。",
      "reference": "The sands of the island.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 1.9455032348632812,
      "timestamp": "2025-12-24T20:40:43.401621"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5293",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 102505,
      "natural_ratio": 0.7820510864257812,
      "max_context_tokens": 131072,
      "context_length": 204946,
      "question_length": 25,
      "prediction": "Colleen Sutton",
      "reference": "John Crunch. ",
      "metrics": {
        "f1": 0.5555555555555556
      },
      "elapsed_time": 0.18277716636657715,
      "timestamp": "2025-12-24T20:40:43.584751"
    },
    {
      "sample_id": "squad_56f8df189e9bad19000a0649",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 448,
      "natural_ratio": 0.00341796875,
      "max_context_tokens": 131072,
      "context_length": 792,
      "question_length": 65,
      "prediction": "环境压力",
      "reference": "environmental stress",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04804086685180664,
      "timestamp": "2025-12-24T20:40:43.632973"
    },
    {
      "sample_id": "squad_57279d5a3acd2414000de7e9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 553,
      "natural_ratio": 0.00421905517578125,
      "max_context_tokens": 131072,
      "context_length": 1029,
      "question_length": 38,
      "prediction": "Diaspora",
      "reference": "Diaspora",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.055934906005859375,
      "timestamp": "2025-12-24T20:40:43.689072"
    },
    {
      "sample_id": "squad_572fc652a23a5019007fc9cb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 455,
      "natural_ratio": 0.00347137451171875,
      "max_context_tokens": 131072,
      "context_length": 819,
      "question_length": 52,
      "prediction": "超过350万辆",
      "reference": "3.5 million",
      "metrics": {
        "f1": 0.25
      },
      "elapsed_time": 0.07525897026062012,
      "timestamp": "2025-12-24T20:40:43.764492"
    },
    {
      "sample_id": "squad_57263f68ec44d21400f3dcc5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 365,
      "natural_ratio": 0.00278472900390625,
      "max_context_tokens": 131072,
      "context_length": 646,
      "question_length": 45,
      "prediction": "epipubic bones",
      "reference": "epipubic bones",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06454038619995117,
      "timestamp": "2025-12-24T20:40:43.829188"
    },
    {
      "sample_id": "squad_571a2ab710f8ca1400304f22",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 381,
      "natural_ratio": 0.00290679931640625,
      "max_context_tokens": 131072,
      "context_length": 670,
      "question_length": 52,
      "prediction": "白人和其他；mulatto（混血，既有欧洲血统又有非洲血统）或黑人。",
      "reference": "white or \"other.\"",
      "metrics": {
        "f1": 0.10810810810810811
      },
      "elapsed_time": 0.2569923400878906,
      "timestamp": "2025-12-24T20:40:44.086400"
    },
    {
      "sample_id": "squad_5705fc5b75f01819005e7834",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 211,
      "natural_ratio": 0.00160980224609375,
      "max_context_tokens": 131072,
      "context_length": 348,
      "question_length": 34,
      "prediction": "飞行成本/能量成本",
      "reference": "energy cost",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.07245373725891113,
      "timestamp": "2025-12-24T20:40:44.159040"
    },
    {
      "sample_id": "squad_57282e373acd2414000df66d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 364,
      "natural_ratio": 0.002777099609375,
      "max_context_tokens": 131072,
      "context_length": 660,
      "question_length": 28,
      "prediction": "trochophore larvae",
      "reference": "trochophore larvae",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07441592216491699,
      "timestamp": "2025-12-24T20:40:44.233622"
    },
    {
      "sample_id": "squad_571cef645efbb31900334e56",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 600,
      "natural_ratio": 0.00457763671875,
      "max_context_tokens": 131072,
      "context_length": 1107,
      "question_length": 54,
      "prediction": "1960年代",
      "reference": "1960s",
      "metrics": {
        "f1": 0.7272727272727272
      },
      "elapsed_time": 0.07897663116455078,
      "timestamp": "2025-12-24T20:40:44.312755"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7467",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 79395,
      "natural_ratio": 0.6057357788085938,
      "max_context_tokens": 131072,
      "context_length": 158713,
      "question_length": 37,
      "prediction": "Neville使用实验性的疫苗来尝试制作治愈方法。",
      "reference": "His own body",
      "metrics": {
        "f1": 0.12121212121212122
      },
      "elapsed_time": 2.7778289318084717,
      "timestamp": "2025-12-24T20:40:47.090916"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31390",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 84447,
      "natural_ratio": 0.6442794799804688,
      "max_context_tokens": 131072,
      "context_length": 168813,
      "question_length": 42,
      "prediction": "Louise accidentally starts the fire by knocking over a lamp, which causes a lampshade, screen, table, and chair to catch fire.",
      "reference": "Tripping on a chair.",
      "metrics": {
        "f1": 0.6486486486486487
      },
      "elapsed_time": 0.4284522533416748,
      "timestamp": "2025-12-24T20:40:47.519738"
    },
    {
      "sample_id": "squad_56e11d89e3433e1400422c21",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 343,
      "natural_ratio": 0.00261688232421875,
      "max_context_tokens": 131072,
      "context_length": 574,
      "question_length": 72,
      "prediction": "2020年",
      "reference": "by 2020",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 0.07856917381286621,
      "timestamp": "2025-12-24T20:40:47.598503"
    },
    {
      "sample_id": "squad_5727b94dff5b5019007d9379",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 329,
      "natural_ratio": 0.00251007080078125,
      "max_context_tokens": 131072,
      "context_length": 551,
      "question_length": 67,
      "prediction": "Monroe县",
      "reference": "Monroe County",
      "metrics": {
        "f1": 0.625
      },
      "elapsed_time": 0.05538201332092285,
      "timestamp": "2025-12-24T20:40:47.654049"
    },
    {
      "sample_id": "squad_570da68e16d0071400510c4c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 414,
      "natural_ratio": 0.0031585693359375,
      "max_context_tokens": 131072,
      "context_length": 740,
      "question_length": 49,
      "prediction": "$16.3 million",
      "reference": "$16.3 million",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08445334434509277,
      "timestamp": "2025-12-24T20:40:47.738657"
    },
    {
      "sample_id": "squad_571b3d469499d21900609c34",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 557,
      "natural_ratio": 0.00424957275390625,
      "max_context_tokens": 131072,
      "context_length": 1003,
      "question_length": 72,
      "prediction": "回收的热拌沥青",
      "reference": "recycled",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.07537674903869629,
      "timestamp": "2025-12-24T20:40:47.814188"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18448",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 80292,
      "natural_ratio": 0.612579345703125,
      "max_context_tokens": 131072,
      "context_length": 160519,
      "question_length": 25,
      "prediction": "Jonah",
      "reference": "Jonah",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.0034143924713135,
      "timestamp": "2025-12-24T20:40:50.817950"
    },
    {
      "sample_id": "squad_570c376aec8fbc190045bddc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 359,
      "natural_ratio": 0.00273895263671875,
      "max_context_tokens": 131072,
      "context_length": 599,
      "question_length": 80,
      "prediction": "英国殖民办公室",
      "reference": "The British Colonial Office",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06013226509094238,
      "timestamp": "2025-12-24T20:40:50.878275"
    },
    {
      "sample_id": "squad_57263e5189a1e219009ac5ad",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 295,
      "natural_ratio": 0.00225067138671875,
      "max_context_tokens": 131072,
      "context_length": 510,
      "question_length": 41,
      "prediction": "Ciudad de México",
      "reference": "Ciudad de México",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06463241577148438,
      "timestamp": "2025-12-24T20:40:50.943072"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32371",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 108948,
      "natural_ratio": 0.831207275390625,
      "max_context_tokens": 131072,
      "context_length": 217801,
      "question_length": 55,
      "prediction": "在监狱中。",
      "reference": "Auerbach's tavern in Leipzig",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.228086471557617,
      "timestamp": "2025-12-24T20:40:55.171531"
    },
    {
      "sample_id": "squad_5727fc04ff5b5019007d99f4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 444,
      "natural_ratio": 0.003387451171875,
      "max_context_tokens": 131072,
      "context_length": 805,
      "question_length": 44,
      "prediction": "World War I",
      "reference": "World War I",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05991506576538086,
      "timestamp": "2025-12-24T20:40:55.231983"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4593",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 70687,
      "natural_ratio": 0.5392990112304688,
      "max_context_tokens": 131072,
      "context_length": 141309,
      "question_length": 26,
      "prediction": "Eyolf falls into the water and is drowned, as revealed by the boys who saw the incident.",
      "reference": "He drowns is the sea.",
      "metrics": {
        "f1": 0.787878787878788
      },
      "elapsed_time": 2.529139995574951,
      "timestamp": "2025-12-24T20:40:57.761491"
    },
    {
      "sample_id": "squad_5725cd5838643c19005acd39",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 489,
      "natural_ratio": 0.00373077392578125,
      "max_context_tokens": 131072,
      "context_length": 885,
      "question_length": 53,
      "prediction": "Agrianes",
      "reference": "Agrianes",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06027936935424805,
      "timestamp": "2025-12-24T20:40:57.821965"
    },
    {
      "sample_id": "squad_572eaf3fcb0c0d14000f1477",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 311,
      "natural_ratio": 0.00237274169921875,
      "max_context_tokens": 131072,
      "context_length": 521,
      "question_length": 61,
      "prediction": "第三大",
      "reference": "third largest",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.045009613037109375,
      "timestamp": "2025-12-24T20:40:57.867137"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_12433",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 38935,
      "natural_ratio": 0.29705047607421875,
      "max_context_tokens": 131072,
      "context_length": 77754,
      "question_length": 76,
      "prediction": "Snake will get full pardon for every criminal act committed in the United States.",
      "reference": "Snake will be granted a pardon which will release him of his charges.",
      "metrics": {
        "f1": 0.909090909090909
      },
      "elapsed_time": 1.3869850635528564,
      "timestamp": "2025-12-24T20:40:59.254374"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_6525",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 123723,
      "natural_ratio": 0.9439315795898438,
      "max_context_tokens": 131072,
      "context_length": 247366,
      "question_length": 41,
      "prediction": "越南战争",
      "reference": "Vietnam War",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.446884632110596,
      "timestamp": "2025-12-24T20:41:03.701607"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18192",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 75869,
      "natural_ratio": 0.5788345336914062,
      "max_context_tokens": 131072,
      "context_length": 151669,
      "question_length": 29,
      "prediction": "因为英俊的金发表亲受到了Babette的特别关注，这让他感到不安和嫉妒。",
      "reference": "Babette flirts with her cousin",
      "metrics": {
        "f1": 0.1702127659574468
      },
      "elapsed_time": 2.4773435592651367,
      "timestamp": "2025-12-24T20:41:06.179651"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21737",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113940,
      "natural_ratio": 0.869293212890625,
      "max_context_tokens": 131072,
      "context_length": 227820,
      "question_length": 21,
      "prediction": "Rainmaker是未来的一个新Boss，他突然出现并迅速控制了五大犯罪组织。",
      "reference": "Cid",
      "metrics": {
        "f1": 0.05128205128205128
      },
      "elapsed_time": 4.004578590393066,
      "timestamp": "2025-12-24T20:41:10.184633"
    },
    {
      "sample_id": "squad_570d2bfafed7b91900d45ca0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 602,
      "natural_ratio": 0.0045928955078125,
      "max_context_tokens": 131072,
      "context_length": 1100,
      "question_length": 64,
      "prediction": "MacBook",
      "reference": "MacBooks",
      "metrics": {
        "f1": 0.923076923076923
      },
      "elapsed_time": 0.05530095100402832,
      "timestamp": "2025-12-24T20:41:10.240474"
    },
    {
      "sample_id": "squad_56f97f299b226e1400dd14c6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 421,
      "natural_ratio": 0.00321197509765625,
      "max_context_tokens": 131072,
      "context_length": 741,
      "question_length": 62,
      "prediction": "耳蜗",
      "reference": "cochlea",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04525256156921387,
      "timestamp": "2025-12-24T20:41:10.285889"
    },
    {
      "sample_id": "squad_5726d965f1498d1400e8ed0e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 370,
      "natural_ratio": 0.0028228759765625,
      "max_context_tokens": 131072,
      "context_length": 640,
      "question_length": 60,
      "prediction": "苏联内务人民委员部（NKVD）troops",
      "reference": "Soviet NKVD",
      "metrics": {
        "f1": 0.4666666666666667
      },
      "elapsed_time": 0.15693974494934082,
      "timestamp": "2025-12-24T20:41:10.442993"
    },
    {
      "sample_id": "squad_56fc975cb53dbe1900755136",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 254,
      "natural_ratio": 0.0019378662109375,
      "max_context_tokens": 131072,
      "context_length": 417,
      "question_length": 51,
      "prediction": "prosody",
      "reference": "prosody",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.052191972732543945,
      "timestamp": "2025-12-24T20:41:10.495337"
    },
    {
      "sample_id": "squad_57070f769e06ca38007e9360",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 352,
      "natural_ratio": 0.002685546875,
      "max_context_tokens": 131072,
      "context_length": 630,
      "question_length": 35,
      "prediction": "根据外貌评价。",
      "reference": "like the colors of the spectrum",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.07411432266235352,
      "timestamp": "2025-12-24T20:41:10.569600"
    },
    {
      "sample_id": "squad_56e6d988de9d371400068085",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 237,
      "natural_ratio": 0.00180816650390625,
      "max_context_tokens": 131072,
      "context_length": 347,
      "question_length": 87,
      "prediction": "Urban格式",
      "reference": "urban",
      "metrics": {
        "f1": 0.8333333333333333
      },
      "elapsed_time": 0.04074215888977051,
      "timestamp": "2025-12-24T20:41:10.610491"
    },
    {
      "sample_id": "squad_5726e2f1708984140094d4cd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 470,
      "natural_ratio": 0.0035858154296875,
      "max_context_tokens": 131072,
      "context_length": 869,
      "question_length": 31,
      "prediction": "Baden-Württemberg formed in 1952 when Baden, Württemberg-Baden, and Württemberg-Hohenzollern merged.",
      "reference": "following a referendum",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.41970300674438477,
      "timestamp": "2025-12-24T20:41:11.030419"
    },
    {
      "sample_id": "squad_573425204776f41900661950",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 317,
      "natural_ratio": 0.00241851806640625,
      "max_context_tokens": 131072,
      "context_length": 558,
      "question_length": 37,
      "prediction": "1940年",
      "reference": "1940",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07572722434997559,
      "timestamp": "2025-12-24T20:41:11.106328"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21960",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 54139,
      "natural_ratio": 0.41304779052734375,
      "max_context_tokens": 131072,
      "context_length": 108198,
      "question_length": 40,
      "prediction": "女王要求《黑暗女士》离开，并警告他不要进一步冒犯她。",
      "reference": "An apology.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 1.6156675815582275,
      "timestamp": "2025-12-24T20:41:12.722295"
    },
    {
      "sample_id": "squad_56d13b3ae7d4791d00902049",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 746,
      "natural_ratio": 0.0056915283203125,
      "max_context_tokens": 131072,
      "context_length": 1387,
      "question_length": 66,
      "prediction": "parody, satire, irony",
      "reference": "parody, satire, and irony",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09349727630615234,
      "timestamp": "2025-12-24T20:41:12.815996"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28889",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113792,
      "natural_ratio": 0.8681640625,
      "max_context_tokens": 131072,
      "context_length": 227515,
      "question_length": 30,
      "prediction": "Bond is getting money for Renard.",
      "reference": "Sir Robert King",
      "metrics": {
        "f1": 0.7407407407407406
      },
      "elapsed_time": 3.881460666656494,
      "timestamp": "2025-12-24T20:41:16.697796"
    },
    {
      "sample_id": "squad_5735ad64e853931400426abd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 499,
      "natural_ratio": 0.00380706787109375,
      "max_context_tokens": 131072,
      "context_length": 895,
      "question_length": 63,
      "prediction": "Raxaul",
      "reference": "Raxaul",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.060965776443481445,
      "timestamp": "2025-12-24T20:41:16.759302"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8743",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 89177,
      "natural_ratio": 0.6803665161132812,
      "max_context_tokens": 131072,
      "context_length": 178274,
      "question_length": 40,
      "prediction": "Twinkle goes to Prairie Dog Town with Chubbins to explore and eventually discovers a prairie-dog town while wandering on the plains.",
      "reference": "To have a picnic",
      "metrics": {
        "f1": 0.6285714285714286
      },
      "elapsed_time": 3.0821382999420166,
      "timestamp": "2025-12-24T20:41:19.841797"
    },
    {
      "sample_id": "squad_573002eda23a5019007fcc35",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 506,
      "natural_ratio": 0.0038604736328125,
      "max_context_tokens": 131072,
      "context_length": 905,
      "question_length": 68,
      "prediction": " senators",
      "reference": "senators",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.040512800216674805,
      "timestamp": "2025-12-24T20:41:19.882499"
    },
    {
      "sample_id": "squad_572e88f303f9891900756745",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 446,
      "natural_ratio": 0.0034027099609375,
      "max_context_tokens": 131072,
      "context_length": 798,
      "question_length": 55,
      "prediction": "柴油机（柴油发动机）",
      "reference": "diesel engine",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.08506917953491211,
      "timestamp": "2025-12-24T20:41:19.967724"
    },
    {
      "sample_id": "squad_570fe3fb80d9841400ab3705",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 480,
      "natural_ratio": 0.003662109375,
      "max_context_tokens": 131072,
      "context_length": 862,
      "question_length": 58,
      "prediction": "2008年",
      "reference": "2008",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07541370391845703,
      "timestamp": "2025-12-24T20:41:20.043291"
    },
    {
      "sample_id": "squad_56dfbedb7aa994140058e0ea",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 177,
      "natural_ratio": 0.00135040283203125,
      "max_context_tokens": 131072,
      "context_length": 241,
      "question_length": 74,
      "prediction": "A mailbox provider",
      "reference": "A mailbox provider",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04938149452209473,
      "timestamp": "2025-12-24T20:41:20.092827"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20537",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 90496,
      "natural_ratio": 0.6904296875,
      "max_context_tokens": 131072,
      "context_length": 180911,
      "question_length": 42,
      "prediction": "没有提到具体的名字，只提到是Jango Fett的飞船。",
      "reference": "Slave I",
      "metrics": {
        "f1": 0.19354838709677416
      },
      "elapsed_time": 4.221982717514038,
      "timestamp": "2025-12-24T20:41:24.315141"
    },
    {
      "sample_id": "squad_56be96653aeaaa14008c9119",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 622,
      "natural_ratio": 0.0047454833984375,
      "max_context_tokens": 131072,
      "context_length": 1132,
      "question_length": 72,
      "prediction": "Thriller",
      "reference": "thriller",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05563974380493164,
      "timestamp": "2025-12-24T20:41:24.370969"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18379",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101192,
      "natural_ratio": 0.77203369140625,
      "max_context_tokens": 131072,
      "context_length": 202298,
      "question_length": 46,
      "prediction": "Major",
      "reference": "She was a major in the Salvation Army.",
      "metrics": {
        "f1": 0.45454545454545453
      },
      "elapsed_time": 3.774611473083496,
      "timestamp": "2025-12-24T20:41:28.145939"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28716",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 13529,
      "natural_ratio": 0.10321807861328125,
      "max_context_tokens": 131072,
      "context_length": 26984,
      "question_length": 34,
      "prediction": "Tiny-cub的",
      "reference": "The Wee Bear.",
      "metrics": {
        "f1": 0.2222222222222222
      },
      "elapsed_time": 0.32398104667663574,
      "timestamp": "2025-12-24T20:41:28.470150"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_27260",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 21333,
      "natural_ratio": 0.16275787353515625,
      "max_context_tokens": 131072,
      "context_length": 42577,
      "question_length": 49,
      "prediction": "The Skin Horse",
      "reference": "The Skin Horse",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.4620397090911865,
      "timestamp": "2025-12-24T20:41:28.932479"
    },
    {
      "sample_id": "squad_572f7a7704bcaa1900d769de",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 589,
      "natural_ratio": 0.00449371337890625,
      "max_context_tokens": 131072,
      "context_length": 1093,
      "question_length": 45,
      "prediction": "The Art of Noise hoped to bring sampled and electronic sounds to the pop mainstream.",
      "reference": "attempt to bring sampled and electronic sounds to the pop mainstream",
      "metrics": {
        "f1": 0.9473684210526316
      },
      "elapsed_time": 0.1856541633605957,
      "timestamp": "2025-12-24T20:41:29.118320"
    },
    {
      "sample_id": "squad_56f858c6aef2371900626007",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 144,
      "natural_ratio": 0.0010986328125,
      "max_context_tokens": 131072,
      "context_length": 199,
      "question_length": 50,
      "prediction": "English Civil War",
      "reference": "English Civil War",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04952287673950195,
      "timestamp": "2025-12-24T20:41:29.168001"
    },
    {
      "sample_id": "squad_573039c004bcaa1900d773c8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 384,
      "natural_ratio": 0.0029296875,
      "max_context_tokens": 131072,
      "context_length": 651,
      "question_length": 78,
      "prediction": "heavy infantry",
      "reference": "cohort",
      "metrics": {
        "f1": 0.37499999999999994
      },
      "elapsed_time": 0.04386711120605469,
      "timestamp": "2025-12-24T20:41:29.212018"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11146",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 106746,
      "natural_ratio": 0.8144073486328125,
      "max_context_tokens": 131072,
      "context_length": 213413,
      "question_length": 40,
      "prediction": "当前的船长是自动导航系统Auto，但最终是人类船长夺回了控制权。",
      "reference": "Captain McCrea.",
      "metrics": {
        "f1": 0.1
      },
      "elapsed_time": 3.4586822986602783,
      "timestamp": "2025-12-24T20:41:32.671043"
    },
    {
      "sample_id": "squad_572629b789a1e219009ac484",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 663,
      "natural_ratio": 0.00505828857421875,
      "max_context_tokens": 131072,
      "context_length": 1245,
      "question_length": 42,
      "prediction": "Justinian",
      "reference": "Justinian",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05427861213684082,
      "timestamp": "2025-12-24T20:41:32.725885"
    },
    {
      "sample_id": "squad_570dc39016d0071400510d5b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 496,
      "natural_ratio": 0.0037841796875,
      "max_context_tokens": 131072,
      "context_length": 820,
      "question_length": 132,
      "prediction": "增强动机",
      "reference": "enjoyment",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04497838020324707,
      "timestamp": "2025-12-24T20:41:32.771021"
    },
    {
      "sample_id": "squad_57266b63708984140094c562",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 351,
      "natural_ratio": 0.00267791748046875,
      "max_context_tokens": 131072,
      "context_length": 582,
      "question_length": 81,
      "prediction": "The Sandberg Game",
      "reference": "The Sandberg Game",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06386613845825195,
      "timestamp": "2025-12-24T20:41:32.835043"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9317",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105197,
      "natural_ratio": 0.8025894165039062,
      "max_context_tokens": 131072,
      "context_length": 210291,
      "question_length": 64,
      "prediction": "Eddie Hawkins tries to celebrate his first day of parole by going to Alex's bar and getting a cappuccino.",
      "reference": "By drinking a cappuccino.",
      "metrics": {
        "f1": 0.7692307692307693
      },
      "elapsed_time": 3.593836784362793,
      "timestamp": "2025-12-24T20:41:36.429224"
    },
    {
      "sample_id": "squad_572a50757a1753140016aea1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 658,
      "natural_ratio": 0.0050201416015625,
      "max_context_tokens": 131072,
      "context_length": 1226,
      "question_length": 50,
      "prediction": "太大的镁或维生素C摄入、未消化的乳糖、乳糖不耐受、果糖吸收不良、高果糖且高葡萄糖含量的食物、难以被身体吸收的山梨醇（常在无糖食品中发现）。",
      "reference": "too much magnesium or vitamin C or undigested lactose",
      "metrics": {
        "f1": 0.02985074626865672
      },
      "elapsed_time": 0.5546574592590332,
      "timestamp": "2025-12-24T20:41:36.984077"
    },
    {
      "sample_id": "squad_57342891d058e614000b6a5b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 452,
      "natural_ratio": 0.003448486328125,
      "max_context_tokens": 131072,
      "context_length": 801,
      "question_length": 64,
      "prediction": "CSS Virginia",
      "reference": "CSS Virginia",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04129171371459961,
      "timestamp": "2025-12-24T20:41:37.025523"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25507",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 85273,
      "natural_ratio": 0.6505813598632812,
      "max_context_tokens": 131072,
      "context_length": 170470,
      "question_length": 37,
      "prediction": "Enzo认为自己是更好的自由潜水员。",
      "reference": "Enzo",
      "metrics": {
        "f1": 0.3636363636363636
      },
      "elapsed_time": 3.9307925701141357,
      "timestamp": "2025-12-24T20:41:40.956687"
    },
    {
      "sample_id": "squad_5727e3ec3acd2414000def17",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 673,
      "natural_ratio": 0.00513458251953125,
      "max_context_tokens": 131072,
      "context_length": 1255,
      "question_length": 52,
      "prediction": "intellectual property",
      "reference": "intellectual property",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06491303443908691,
      "timestamp": "2025-12-24T20:41:41.021794"
    },
    {
      "sample_id": "squad_572b415834ae481900dead53",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 292,
      "natural_ratio": 0.002227783203125,
      "max_context_tokens": 131072,
      "context_length": 505,
      "question_length": 40,
      "prediction": "A.J. Ayer是维也纳学派的重要成员。",
      "reference": "Logical empiricism",
      "metrics": {
        "f1": 0.2580645161290323
      },
      "elapsed_time": 0.1650078296661377,
      "timestamp": "2025-12-24T20:41:41.186976"
    },
    {
      "sample_id": "squad_571de5e2556973190063909e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 514,
      "natural_ratio": 0.0039215087890625,
      "max_context_tokens": 131072,
      "context_length": 901,
      "question_length": 87,
      "prediction": " Eugenics",
      "reference": "eugenics and ideas of racial purity",
      "metrics": {
        "f1": 0.5833333333333334
      },
      "elapsed_time": 0.05530595779418945,
      "timestamp": "2025-12-24T20:41:41.242445"
    },
    {
      "sample_id": "squad_5709fb5e4103511400d594cf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 394,
      "natural_ratio": 0.0030059814453125,
      "max_context_tokens": 131072,
      "context_length": 697,
      "question_length": 51,
      "prediction": "1937",
      "reference": "1937",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06448626518249512,
      "timestamp": "2025-12-24T20:41:41.307085"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_12964",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 82373,
      "natural_ratio": 0.6284561157226562,
      "max_context_tokens": 131072,
      "context_length": 164642,
      "question_length": 65,
      "prediction": "Together in Paris",
      "reference": "\"Together in Paris.\"",
      "metrics": {
        "f1": 0.923076923076923
      },
      "elapsed_time": 2.8162691593170166,
      "timestamp": "2025-12-24T20:41:44.123702"
    },
    {
      "sample_id": "squad_56e6f84ede9d37140006810c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 270,
      "natural_ratio": 0.0020599365234375,
      "max_context_tokens": 131072,
      "context_length": 431,
      "question_length": 70,
      "prediction": "beautiful music和easy listening",
      "reference": "beautiful music and easy listening",
      "metrics": {
        "f1": 0.9375
      },
      "elapsed_time": 0.07880449295043945,
      "timestamp": "2025-12-24T20:41:44.202707"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24843",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 77695,
      "natural_ratio": 0.5927658081054688,
      "max_context_tokens": 131072,
      "context_length": 155315,
      "question_length": 35,
      "prediction": "法国革命",
      "reference": "French Revolutionary army.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.0916907787323,
      "timestamp": "2025-12-24T20:41:46.294710"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28331",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 78030,
      "natural_ratio": 0.5953216552734375,
      "max_context_tokens": 131072,
      "context_length": 155970,
      "question_length": 51,
      "prediction": "Mr. Cordyce has the boxcar moved to his property and renovated it into a home for the children.",
      "reference": "He has it transferred to his backyard for the kids to enjoy.",
      "metrics": {
        "f1": 0.8205128205128205
      },
      "elapsed_time": 2.5339787006378174,
      "timestamp": "2025-12-24T20:41:48.829031"
    },
    {
      "sample_id": "squad_56f8d8959e9bad19000a05e2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 193,
      "natural_ratio": 0.00147247314453125,
      "max_context_tokens": 131072,
      "context_length": 288,
      "question_length": 59,
      "prediction": "PRS（Party for Social Renewal）",
      "reference": "PRS (Party for Social Renewal)",
      "metrics": {
        "f1": 0.8823529411764706
      },
      "elapsed_time": 0.10870885848999023,
      "timestamp": "2025-12-24T20:41:48.937966"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30355",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 65054,
      "natural_ratio": 0.4963226318359375,
      "max_context_tokens": 131072,
      "context_length": 130015,
      "question_length": 53,
      "prediction": "早晨醒来后发现同伴不见了，而且帐篷被风吹倒了，地上有深坑。",
      "reference": "One of the paddles, and some food.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.30267882347106934,
      "timestamp": "2025-12-24T20:41:49.240930"
    },
    {
      "sample_id": "squad_56e7b4f737bdd419002c43b3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 233,
      "natural_ratio": 0.00177764892578125,
      "max_context_tokens": 131072,
      "context_length": 384,
      "question_length": 43,
      "prediction": "2001",
      "reference": "2001",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06319642066955566,
      "timestamp": "2025-12-24T20:41:49.304310"
    },
    {
      "sample_id": "squad_57292b1f6aef051400154b20",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 684,
      "natural_ratio": 0.005218505859375,
      "max_context_tokens": 131072,
      "context_length": 1288,
      "question_length": 40,
      "prediction": "Abgeordnetenhaus",
      "reference": "Abgeordnetenhaus",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09155845642089844,
      "timestamp": "2025-12-24T20:41:49.396024"
    },
    {
      "sample_id": "squad_572827c62ca10214002d9f6a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 375,
      "natural_ratio": 0.00286102294921875,
      "max_context_tokens": 131072,
      "context_length": 669,
      "question_length": 42,
      "prediction": "Burnet基于圣经中的事件来解释地球的过去。",
      "reference": "the Bible",
      "metrics": {
        "f1": 0.20689655172413793
      },
      "elapsed_time": 0.14481377601623535,
      "timestamp": "2025-12-24T20:41:49.540994"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7798",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105491,
      "natural_ratio": 0.8048324584960938,
      "max_context_tokens": 131072,
      "context_length": 210849,
      "question_length": 94,
      "prediction": "停止人类的生育能力",
      "reference": "Free will.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.508756637573242,
      "timestamp": "2025-12-24T20:41:53.050087"
    },
    {
      "sample_id": "squad_56e161bfcd28a01900c67845",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 611,
      "natural_ratio": 0.00466156005859375,
      "max_context_tokens": 131072,
      "context_length": 1145,
      "question_length": 37,
      "prediction": "The Boston Red Sox",
      "reference": "The Boston Red Sox",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0735323429107666,
      "timestamp": "2025-12-24T20:41:53.123806"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7874",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 48475,
      "natural_ratio": 0.36983489990234375,
      "max_context_tokens": 131072,
      "context_length": 96842,
      "question_length": 69,
      "prediction": "文本中并没有提到费尔南多因为什么而穿过田野时喷气和跺脚。",
      "reference": "He sits on a bee",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 1.427124261856079,
      "timestamp": "2025-12-24T20:41:54.551230"
    },
    {
      "sample_id": "squad_5707199590286e26004fc8f5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 511,
      "natural_ratio": 0.00389862060546875,
      "max_context_tokens": 131072,
      "context_length": 941,
      "question_length": 41,
      "prediction": "paz porfiriana",
      "reference": "paz porfiriana",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07773399353027344,
      "timestamp": "2025-12-24T20:41:54.629134"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31754",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 124250,
      "natural_ratio": 0.9479522705078125,
      "max_context_tokens": 131072,
      "context_length": 248406,
      "question_length": 54,
      "prediction": "Debbie is caught by the police along with other women and is treated as a prostitute suspect.",
      "reference": "Cole kidnaps her",
      "metrics": {
        "f1": 0.742857142857143
      },
      "elapsed_time": 3.3770601749420166,
      "timestamp": "2025-12-24T20:41:58.006531"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14490",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 62040,
      "natural_ratio": 0.47332763671875,
      "max_context_tokens": 131072,
      "context_length": 123995,
      "question_length": 46,
      "prediction": "Naomi decided to return to Bethlehem due to heavy trouble, specifically the loss of her husband and her two sons.",
      "reference": "Her husband and both sons had all died. ",
      "metrics": {
        "f1": 0.7894736842105263
      },
      "elapsed_time": 1.8972105979919434,
      "timestamp": "2025-12-24T20:41:59.904429"
    },
    {
      "sample_id": "squad_5725b77c271a42140099d092",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 434,
      "natural_ratio": 0.0033111572265625,
      "max_context_tokens": 131072,
      "context_length": 758,
      "question_length": 70,
      "prediction": "1922年",
      "reference": "1922",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07883191108703613,
      "timestamp": "2025-12-24T20:41:59.983469"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_235",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 32294,
      "natural_ratio": 0.2463836669921875,
      "max_context_tokens": 131072,
      "context_length": 64510,
      "question_length": 38,
      "prediction": "和平需要建立在人类统一性的精神真理上。",
      "reference": "the thought that mankind is one human family",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.7566471099853516,
      "timestamp": "2025-12-24T20:42:00.740400"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22046",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 100482,
      "natural_ratio": 0.7666168212890625,
      "max_context_tokens": 131072,
      "context_length": 200864,
      "question_length": 61,
      "prediction": "导致人类死亡",
      "reference": "It decomposes them.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.502164125442505,
      "timestamp": "2025-12-24T20:42:03.242946"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4288",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 92064,
      "natural_ratio": 0.702392578125,
      "max_context_tokens": 131072,
      "context_length": 184037,
      "question_length": 51,
      "prediction": "因为世界已经从有毒的太空中恢复过来，氧气含量恢复正常，所以他们没有死亡。",
      "reference": "The Earth had passed the poison belt",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.057680368423462,
      "timestamp": "2025-12-24T20:42:06.301073"
    },
    {
      "sample_id": "squad_570d2d61b3d812140066d4f1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 451,
      "natural_ratio": 0.00344085693359375,
      "max_context_tokens": 131072,
      "context_length": 792,
      "question_length": 70,
      "prediction": "在2010年5月，GE宣布在北爱尔兰扩大业务。",
      "reference": "May 2010",
      "metrics": {
        "f1": 0.21428571428571427
      },
      "elapsed_time": 0.2018110752105713,
      "timestamp": "2025-12-24T20:42:06.503118"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16988",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 41294,
      "natural_ratio": 0.3150482177734375,
      "max_context_tokens": 131072,
      "context_length": 82487,
      "question_length": 61,
      "prediction": "国王是尼科尔特的父亲",
      "reference": "He is her father.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 1.130446195602417,
      "timestamp": "2025-12-24T20:42:07.633907"
    },
    {
      "sample_id": "squad_57267187dd62a815002e850f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 354,
      "natural_ratio": 0.0027008056640625,
      "max_context_tokens": 131072,
      "context_length": 612,
      "question_length": 56,
      "prediction": "20世纪",
      "reference": "the 20th century",
      "metrics": {
        "f1": 0.26666666666666666
      },
      "elapsed_time": 0.05804586410522461,
      "timestamp": "2025-12-24T20:42:07.692156"
    },
    {
      "sample_id": "squad_5727a3d0ff5b5019007d919f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 495,
      "natural_ratio": 0.00377655029296875,
      "max_context_tokens": 131072,
      "context_length": 894,
      "question_length": 57,
      "prediction": "这些儿童中有许多不得不从事石碾作业、街头 hustling 和卖淫等职业，这些职业比服装生产行业更为危险和剥削。",
      "reference": "stone-crushing, street hustling, and prostitution\"",
      "metrics": {
        "f1": 0.2727272727272727
      },
      "elapsed_time": 0.33873724937438965,
      "timestamp": "2025-12-24T20:42:08.031122"
    },
    {
      "sample_id": "squad_571a7ebb10f8ca14003050a7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 443,
      "natural_ratio": 0.00337982177734375,
      "max_context_tokens": 131072,
      "context_length": 785,
      "question_length": 61,
      "prediction": "25%",
      "reference": "a quarter",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.058685302734375,
      "timestamp": "2025-12-24T20:42:08.090018"
    },
    {
      "sample_id": "squad_57299cb33f37b319004784f9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 309,
      "natural_ratio": 0.00235748291015625,
      "max_context_tokens": 131072,
      "context_length": 538,
      "question_length": 40,
      "prediction": "480百万年前（或480 million years ago），在奥陶纪时期。",
      "reference": "roughly 480 million years ago",
      "metrics": {
        "f1": 0.6382978723404256
      },
      "elapsed_time": 0.2251439094543457,
      "timestamp": "2025-12-24T20:42:08.315358"
    },
    {
      "sample_id": "squad_56e147e6cd28a01900c6772b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 328,
      "natural_ratio": 0.00250244140625,
      "max_context_tokens": 131072,
      "context_length": 507,
      "question_length": 109,
      "prediction": "电影生产",
      "reference": "movie production",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.0445859432220459,
      "timestamp": "2025-12-24T20:42:08.360104"
    },
    {
      "sample_id": "squad_570bd90fec8fbc190045bb76",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 345,
      "natural_ratio": 0.00263214111328125,
      "max_context_tokens": 131072,
      "context_length": 609,
      "question_length": 41,
      "prediction": "Bemer-Ross Code",
      "reference": "Bemer-Ross Code",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07429289817810059,
      "timestamp": "2025-12-24T20:42:08.434554"
    },
    {
      "sample_id": "squad_5735c47ae853931400426b66",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 483,
      "natural_ratio": 0.00368499755859375,
      "max_context_tokens": 131072,
      "context_length": 887,
      "question_length": 39,
      "prediction": "Kathmandu",
      "reference": "Kirat",
      "metrics": {
        "f1": 0.4615384615384615
      },
      "elapsed_time": 0.06471467018127441,
      "timestamp": "2025-12-24T20:42:08.499418"
    },
    {
      "sample_id": "squad_57282f18ff5b5019007d9eab",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 391,
      "natural_ratio": 0.00298309326171875,
      "max_context_tokens": 131072,
      "context_length": 698,
      "question_length": 44,
      "prediction": "9日",
      "reference": "9 June",
      "metrics": {
        "f1": 0.25
      },
      "elapsed_time": 0.044156789779663086,
      "timestamp": "2025-12-24T20:42:08.543736"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21765",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 110900,
      "natural_ratio": 0.846099853515625,
      "max_context_tokens": 131072,
      "context_length": 221711,
      "question_length": 49,
      "prediction": "fourth round",
      "reference": "The 4th round",
      "metrics": {
        "f1": 0.8421052631578948
      },
      "elapsed_time": 3.40871262550354,
      "timestamp": "2025-12-24T20:42:11.952779"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31365",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 84445,
      "natural_ratio": 0.6442642211914062,
      "max_context_tokens": 131072,
      "context_length": 168813,
      "question_length": 38,
      "prediction": "The Mumfords lived in Sutton, which is in England.",
      "reference": "ENGLAND",
      "metrics": {
        "f1": 0.24561403508771928
      },
      "elapsed_time": 0.2805211544036865,
      "timestamp": "2025-12-24T20:42:12.233975"
    },
    {
      "sample_id": "squad_572a48aefed8de19000d5b6d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 338,
      "natural_ratio": 0.0025787353515625,
      "max_context_tokens": 131072,
      "context_length": 544,
      "question_length": 92,
      "prediction": "Artspace on Orange Street",
      "reference": "Artspace",
      "metrics": {
        "f1": 0.48484848484848486
      },
      "elapsed_time": 0.0863037109375,
      "timestamp": "2025-12-24T20:42:12.320459"
    },
    {
      "sample_id": "squad_5726660bdd62a815002e83a2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 320,
      "natural_ratio": 0.00244140625,
      "max_context_tokens": 131072,
      "context_length": 523,
      "question_length": 78,
      "prediction": "人们与活禽接触是感染病毒的最大风险，尤其是在东南亚等地区，野生鸟类和家禽中存在该疾病，病毒有可能变异成为在人类中高度致病和传染的流感病毒，从而引发流感大流行。",
      "reference": "The virus possibly could mutate to become highly virulent and infectious in humans and cause an influenza pandemic.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.505678653717041,
      "timestamp": "2025-12-24T20:42:12.826350"
    },
    {
      "sample_id": "squad_57313d70497a881900248cb9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 311,
      "natural_ratio": 0.00237274169921875,
      "max_context_tokens": 131072,
      "context_length": 545,
      "question_length": 38,
      "prediction": "旅游业、农业、工业和服务业。",
      "reference": "tourism, agriculture, industry and services",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.10449075698852539,
      "timestamp": "2025-12-24T20:42:12.931028"
    },
    {
      "sample_id": "squad_5727c82c3acd2414000dec43",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 790,
      "natural_ratio": 0.0060272216796875,
      "max_context_tokens": 131072,
      "context_length": 1451,
      "question_length": 90,
      "prediction": "因为较大的未完成书籍将被大量的学术脚注和更多技术细节所拖累，使其不如原书那样易于接近。",
      "reference": "would have been encumbered by scholarly footnotes and much more technical detail",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.29854655265808105,
      "timestamp": "2025-12-24T20:42:13.229749"
    },
    {
      "sample_id": "squad_5709b165ed30961900e84426",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 425,
      "natural_ratio": 0.00324249267578125,
      "max_context_tokens": 131072,
      "context_length": 736,
      "question_length": 74,
      "prediction": "必须是已故的人。",
      "reference": "deceased",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.08421921730041504,
      "timestamp": "2025-12-24T20:42:13.314125"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22920",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 121873,
      "natural_ratio": 0.9298171997070312,
      "max_context_tokens": 131072,
      "context_length": 243651,
      "question_length": 56,
      "prediction": "Joe在预审会议时决定接手Beckett的案件。",
      "reference": "When he saw others' discriminatory behavior toward Beckett at the law library.",
      "metrics": {
        "f1": 0.2857142857142857
      },
      "elapsed_time": 3.695587635040283,
      "timestamp": "2025-12-24T20:42:17.010149"
    },
    {
      "sample_id": "squad_5726d859dd62a815002e924f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 315,
      "natural_ratio": 0.00240325927734375,
      "max_context_tokens": 131072,
      "context_length": 556,
      "question_length": 34,
      "prediction": "1 September",
      "reference": "1 September",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04919576644897461,
      "timestamp": "2025-12-24T20:42:17.059887"
    },
    {
      "sample_id": "squad_572616c289a1e219009ac232",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 424,
      "natural_ratio": 0.00323486328125,
      "max_context_tokens": 131072,
      "context_length": 746,
      "question_length": 62,
      "prediction": "Arsenal Ladies",
      "reference": "Arsenal Ladies",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05599045753479004,
      "timestamp": "2025-12-24T20:42:17.116043"
    },
    {
      "sample_id": "squad_57302231b2c2fd1400568917",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 476,
      "natural_ratio": 0.003631591796875,
      "max_context_tokens": 131072,
      "context_length": 845,
      "question_length": 67,
      "prediction": "August 16, 2012",
      "reference": "August 16, 2012",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.12543559074401855,
      "timestamp": "2025-12-24T20:42:17.241636"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22254",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 75312,
      "natural_ratio": 0.5745849609375,
      "max_context_tokens": 131072,
      "context_length": 150534,
      "question_length": 50,
      "prediction": "Maggie wandered aimlessly for several blocks before stopping in a darker area.",
      "reference": "Back home",
      "metrics": {
        "f1": 0.5161290322580644
      },
      "elapsed_time": 2.4727864265441895,
      "timestamp": "2025-12-24T20:42:19.714804"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32286",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 44778,
      "natural_ratio": 0.3416290283203125,
      "max_context_tokens": 131072,
      "context_length": 89437,
      "question_length": 80,
      "prediction": "Lucia suspected Turiddu to be at Lola's place.",
      "reference": "He was going to buy some more wine.",
      "metrics": {
        "f1": 0.6666666666666667
      },
      "elapsed_time": 1.381566047668457,
      "timestamp": "2025-12-24T20:42:21.096712"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10650",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 114367,
      "natural_ratio": 0.8725509643554688,
      "max_context_tokens": 131072,
      "context_length": 228667,
      "question_length": 27,
      "prediction": "Doom is revealed to be a Toon.",
      "reference": "A toon in disguise ",
      "metrics": {
        "f1": 0.6923076923076923
      },
      "elapsed_time": 4.076655387878418,
      "timestamp": "2025-12-24T20:42:25.173726"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5763",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 46086,
      "natural_ratio": 0.3516082763671875,
      "max_context_tokens": 131072,
      "context_length": 92081,
      "question_length": 52,
      "prediction": "Cato",
      "reference": "Bounty hunters",
      "metrics": {
        "f1": 0.26666666666666666
      },
      "elapsed_time": 1.472475290298462,
      "timestamp": "2025-12-24T20:42:26.646890"
    },
    {
      "sample_id": "squad_572801afff5b5019007d9ad1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 353,
      "natural_ratio": 0.00269317626953125,
      "max_context_tokens": 131072,
      "context_length": 606,
      "question_length": 61,
      "prediction": "Hellenistic",
      "reference": "Hellenistic",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05884408950805664,
      "timestamp": "2025-12-24T20:42:26.705923"
    },
    {
      "sample_id": "squad_57301640b2c2fd1400568834",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 448,
      "natural_ratio": 0.00341796875,
      "max_context_tokens": 131072,
      "context_length": 799,
      "question_length": 58,
      "prediction": "optimates",
      "reference": "populares",
      "metrics": {
        "f1": 0.625
      },
      "elapsed_time": 0.0461735725402832,
      "timestamp": "2025-12-24T20:42:26.752262"
    },
    {
      "sample_id": "squad_56de93f94396321400ee2a36",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 213,
      "natural_ratio": 0.00162506103515625,
      "max_context_tokens": 131072,
      "context_length": 282,
      "question_length": 104,
      "prediction": "1988年",
      "reference": "1988",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07099390029907227,
      "timestamp": "2025-12-24T20:42:26.823416"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31029",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 98987,
      "natural_ratio": 0.7552108764648438,
      "max_context_tokens": 131072,
      "context_length": 197891,
      "question_length": 43,
      "prediction": "Jeff Talley's former line of work was as a SWAT negotiator in LA.",
      "reference": "SWAT officer.",
      "metrics": {
        "f1": 0.7096774193548387
      },
      "elapsed_time": 3.545950412750244,
      "timestamp": "2025-12-24T20:42:30.369795"
    },
    {
      "sample_id": "squad_5726f517dd62a815002e963b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 347,
      "natural_ratio": 0.00264739990234375,
      "max_context_tokens": 131072,
      "context_length": 561,
      "question_length": 94,
      "prediction": "美国在2010年缅甸选举后的改革变化中，在2011年11月放松了对缅甸的外援限制，并于2012年1月13日宣布恢复外交关系。",
      "reference": "the United States relaxed curbs on foreign aid to Myanmar in November 2011 and announced the resumption of diplomatic relations",
      "metrics": {
        "f1": 0.0923076923076923
      },
      "elapsed_time": 0.4957859516143799,
      "timestamp": "2025-12-24T20:42:30.865826"
    },
    {
      "sample_id": "squad_57320e1ee17f3d1400422637",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 370,
      "natural_ratio": 0.0028228759765625,
      "max_context_tokens": 131072,
      "context_length": 651,
      "question_length": 49,
      "prediction": "Anglicanism",
      "reference": "Anglicanism",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.056569576263427734,
      "timestamp": "2025-12-24T20:42:30.922574"
    },
    {
      "sample_id": "squad_57281c262ca10214002d9e04",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 309,
      "natural_ratio": 0.00235748291015625,
      "max_context_tokens": 131072,
      "context_length": 502,
      "question_length": 77,
      "prediction": "London's status as the capital of England, and later the United Kingdom",
      "reference": "London's status as the capital of England",
      "metrics": {
        "f1": 0.7321428571428571
      },
      "elapsed_time": 0.164475679397583,
      "timestamp": "2025-12-24T20:42:31.087215"
    },
    {
      "sample_id": "squad_57280a4fff5b5019007d9b97",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 485,
      "natural_ratio": 0.00370025634765625,
      "max_context_tokens": 131072,
      "context_length": 896,
      "question_length": 34,
      "prediction": "是的，冯·诺依曼有两个弟弟。",
      "reference": "two younger brothers",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.13930487632751465,
      "timestamp": "2025-12-24T20:42:31.226675"
    },
    {
      "sample_id": "squad_56dee2b8c65bf219000b3dd5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 263,
      "natural_ratio": 0.00200653076171875,
      "max_context_tokens": 131072,
      "context_length": 460,
      "question_length": 26,
      "prediction": "Combat-Related Employment of Women",
      "reference": "Combat-Related Employment of Women",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08080720901489258,
      "timestamp": "2025-12-24T20:42:31.307640"
    },
    {
      "sample_id": "squad_56db2190e7c41114004b4dd5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 263,
      "natural_ratio": 0.00200653076171875,
      "max_context_tokens": 131072,
      "context_length": 437,
      "question_length": 50,
      "prediction": "4",
      "reference": "four",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.030613183975219727,
      "timestamp": "2025-12-24T20:42:31.338401"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25800",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 110898,
      "natural_ratio": 0.8460845947265625,
      "max_context_tokens": 131072,
      "context_length": 221699,
      "question_length": 58,
      "prediction": "一个会说话的鹦鹉",
      "reference": "Mitch wants to buy lovebirds.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 5.975012302398682,
      "timestamp": "2025-12-24T20:42:37.313798"
    },
    {
      "sample_id": "squad_56d8e7d8dc89441400fdb3bd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 541,
      "natural_ratio": 0.00412750244140625,
      "max_context_tokens": 131072,
      "context_length": 977,
      "question_length": 66,
      "prediction": "Golden Gate Bridge",
      "reference": "Golden Gate Bridge",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06280350685119629,
      "timestamp": "2025-12-24T20:42:37.377140"
    },
    {
      "sample_id": "squad_56e15da4cd28a01900c67818",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 453,
      "natural_ratio": 0.00345611572265625,
      "max_context_tokens": 131072,
      "context_length": 813,
      "question_length": 53,
      "prediction": "2013年",
      "reference": "in 2013",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.07503151893615723,
      "timestamp": "2025-12-24T20:42:37.452333"
    },
    {
      "sample_id": "squad_56f7c6e1aef2371900625bff",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 219,
      "natural_ratio": 0.00167083740234375,
      "max_context_tokens": 131072,
      "context_length": 325,
      "question_length": 73,
      "prediction": "17世纪的波士认为\"szlachta\"来自德语\"schlachten\"（意为\"屠宰\"或\"屠杀\"）。",
      "reference": "German \"schlachten\"",
      "metrics": {
        "f1": 0.4090909090909091
      },
      "elapsed_time": 0.33237314224243164,
      "timestamp": "2025-12-24T20:42:37.784876"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31752",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 124249,
      "natural_ratio": 0.9479446411132812,
      "max_context_tokens": 131072,
      "context_length": 248406,
      "question_length": 53,
      "prediction": "Rick需要阻止布拉德做爱。",
      "reference": "Committing suicide",
      "metrics": {
        "f1": 0.15384615384615383
      },
      "elapsed_time": 0.23769807815551758,
      "timestamp": "2025-12-24T20:42:38.022895"
    },
    {
      "sample_id": "squad_5733ad50d058e614000b6020",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 390,
      "natural_ratio": 0.0029754638671875,
      "max_context_tokens": 131072,
      "context_length": 689,
      "question_length": 52,
      "prediction": "将耶稣会士驱逐出境并没收其资产。",
      "reference": "Sebastião de Melo prosecuted every person involved, even women and children",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.14757156372070312,
      "timestamp": "2025-12-24T20:42:38.170998"
    },
    {
      "sample_id": "squad_56fdcadf761e401900d28c01",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 365,
      "natural_ratio": 0.00278472900390625,
      "max_context_tokens": 131072,
      "context_length": 633,
      "question_length": 58,
      "prediction": "wheel-and-disc mechanisms",
      "reference": "wheel-and-disc",
      "metrics": {
        "f1": 0.717948717948718
      },
      "elapsed_time": 0.06459856033325195,
      "timestamp": "2025-12-24T20:42:38.235752"
    },
    {
      "sample_id": "squad_572823083acd2414000df56d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 429,
      "natural_ratio": 0.00327301025390625,
      "max_context_tokens": 131072,
      "context_length": 710,
      "question_length": 108,
      "prediction": "面部表情",
      "reference": "facial expressions",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.044259071350097656,
      "timestamp": "2025-12-24T20:42:38.280162"
    },
    {
      "sample_id": "squad_56e10a04cd28a01900c674a1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 689,
      "natural_ratio": 0.00525665283203125,
      "max_context_tokens": 131072,
      "context_length": 1288,
      "question_length": 51,
      "prediction": "1969年11月",
      "reference": "November 1969",
      "metrics": {
        "f1": 0.37499999999999994
      },
      "elapsed_time": 0.10956311225891113,
      "timestamp": "2025-12-24T20:42:38.389876"
    },
    {
      "sample_id": "squad_57264c97dd62a815002e80dc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 646,
      "natural_ratio": 0.0049285888671875,
      "max_context_tokens": 131072,
      "context_length": 1203,
      "question_length": 50,
      "prediction": "Latvian and Lithuanian Baltic peoples",
      "reference": "Latvian and Lithuanian Baltic peoples",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.11887526512145996,
      "timestamp": "2025-12-24T20:42:38.508900"
    },
    {
      "sample_id": "squad_56e860fd00c9c71400d77676",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 139,
      "natural_ratio": 0.00106048583984375,
      "max_context_tokens": 131072,
      "context_length": 173,
      "question_length": 66,
      "prediction": "1983年",
      "reference": "1983",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.06950879096984863,
      "timestamp": "2025-12-24T20:42:38.578556"
    },
    {
      "sample_id": "squad_5735adcde853931400426ac6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 495,
      "natural_ratio": 0.00377655029296875,
      "max_context_tokens": 131072,
      "context_length": 894,
      "question_length": 56,
      "prediction": "喜马拉雅山脉",
      "reference": "Himalayas",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06857848167419434,
      "timestamp": "2025-12-24T20:42:38.647277"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28863",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 100822,
      "natural_ratio": 0.7692108154296875,
      "max_context_tokens": 131072,
      "context_length": 201565,
      "question_length": 40,
      "prediction": "Joey Percival is a cashier.",
      "reference": "He is a pilot",
      "metrics": {
        "f1": 0.7200000000000001
      },
      "elapsed_time": 4.135798692703247,
      "timestamp": "2025-12-24T20:42:42.783431"
    },
    {
      "sample_id": "squad_57268c78708984140094c9bf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 401,
      "natural_ratio": 0.00305938720703125,
      "max_context_tokens": 131072,
      "context_length": 719,
      "question_length": 43,
      "prediction": "Unwrapped",
      "reference": "Unwrapped",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05062460899353027,
      "timestamp": "2025-12-24T20:42:42.834250"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19282",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 32908,
      "natural_ratio": 0.251068115234375,
      "max_context_tokens": 131072,
      "context_length": 65755,
      "question_length": 22,
      "prediction": "Mary",
      "reference": "Mary Cushing",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.7540888786315918,
      "timestamp": "2025-12-24T20:42:43.588610"
    },
    {
      "sample_id": "squad_572f427604bcaa1900d767f3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 597,
      "natural_ratio": 0.00455474853515625,
      "max_context_tokens": 131072,
      "context_length": 1107,
      "question_length": 48,
      "prediction": "因为南军未能保持其港口开放。",
      "reference": "The Confederacy had a larger area than it could defend",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1206212043762207,
      "timestamp": "2025-12-24T20:42:43.709421"
    },
    {
      "sample_id": "squad_56e16960cd28a01900c678a6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 536,
      "natural_ratio": 0.00408935546875,
      "max_context_tokens": 131072,
      "context_length": 987,
      "question_length": 46,
      "prediction": "$40 million",
      "reference": "$40 million",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06552720069885254,
      "timestamp": "2025-12-24T20:42:43.775099"
    },
    {
      "sample_id": "squad_57301a88b2c2fd1400568875",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 573,
      "natural_ratio": 0.00437164306640625,
      "max_context_tokens": 131072,
      "context_length": 1058,
      "question_length": 48,
      "prediction": "The indigenous peoples they encountered，尤其是那些生活在更为孤立的“丛林”中的社区。",
      "reference": "indigenous peoples they encountered,",
      "metrics": {
        "f1": 0.6666666666666665
      },
      "elapsed_time": 0.2045907974243164,
      "timestamp": "2025-12-24T20:42:43.979852"
    },
    {
      "sample_id": "squad_56e78f3100c9c71400d772bf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 311,
      "natural_ratio": 0.00237274169921875,
      "max_context_tokens": 131072,
      "context_length": 442,
      "question_length": 140,
      "prediction": "包括州政府。",
      "reference": "state governments",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.0607757568359375,
      "timestamp": "2025-12-24T20:42:44.040779"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18212",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 116526,
      "natural_ratio": 0.8890228271484375,
      "max_context_tokens": 131072,
      "context_length": 232971,
      "question_length": 42,
      "prediction": "文本中并没有提到Marrow的助手的脸被划伤的内容。",
      "reference": "It was slashed through a freak accident involving a clavichord wire. ",
      "metrics": {
        "f1": 0.18604651162790697
      },
      "elapsed_time": 3.616955041885376,
      "timestamp": "2025-12-24T20:42:47.658094"
    },
    {
      "sample_id": "squad_56d10ca817492d1400aab818",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 537,
      "natural_ratio": 0.00409698486328125,
      "max_context_tokens": 131072,
      "context_length": 994,
      "question_length": 41,
      "prediction": "My Beautiful Dark Twisted Fantasy",
      "reference": "My Beautiful Dark Twisted Fantasy",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09187602996826172,
      "timestamp": "2025-12-24T20:42:47.750512"
    },
    {
      "sample_id": "squad_570b1d3d6b8089140040f720",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 677,
      "natural_ratio": 0.00516510009765625,
      "max_context_tokens": 131072,
      "context_length": 1247,
      "question_length": 68,
      "prediction": "Peter Moore工作的地方是Sega of America。",
      "reference": "Sega of America",
      "metrics": {
        "f1": 0.625
      },
      "elapsed_time": 0.13057851791381836,
      "timestamp": "2025-12-24T20:42:47.881260"
    },
    {
      "sample_id": "squad_5725662acc50291900b283a3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 355,
      "natural_ratio": 0.00270843505859375,
      "max_context_tokens": 131072,
      "context_length": 636,
      "question_length": 34,
      "prediction": "他的预算被否决。",
      "reference": "his budget was defeated",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.08414006233215332,
      "timestamp": "2025-12-24T20:42:47.965563"
    },
    {
      "sample_id": "squad_5732bcead6dcfa19001e8a99",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 489,
      "natural_ratio": 0.00373077392578125,
      "max_context_tokens": 131072,
      "context_length": 903,
      "question_length": 36,
      "prediction": "batons, tear gas, riot control agents, rubber bullets, riot shields, water cannons, electroshock weapons",
      "reference": "batons, tear gas, riot control agents, rubber bullets, riot shields, water cannons and electroshock weapons",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.2551841735839844,
      "timestamp": "2025-12-24T20:42:48.220914"
    },
    {
      "sample_id": "squad_56cf3e29aab44d1400b88ed0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 498,
      "natural_ratio": 0.0037994384765625,
      "max_context_tokens": 131072,
      "context_length": 922,
      "question_length": 34,
      "prediction": "Charmian Bond",
      "reference": "Charmian Bond",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06533575057983398,
      "timestamp": "2025-12-24T20:42:48.286400"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24765",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 15311,
      "natural_ratio": 0.11681365966796875,
      "max_context_tokens": 131072,
      "context_length": 30565,
      "question_length": 18,
      "prediction": "Kate",
      "reference": "Kate.",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.33335208892822266,
      "timestamp": "2025-12-24T20:42:48.619951"
    },
    {
      "sample_id": "squad_572fde67947a6a140053cd85",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 475,
      "natural_ratio": 0.00362396240234375,
      "max_context_tokens": 131072,
      "context_length": 870,
      "question_length": 41,
      "prediction": "聚合酶链反应（PCR）",
      "reference": "polymerase chain reaction",
      "metrics": {
        "f1": 0.23076923076923075
      },
      "elapsed_time": 0.09558796882629395,
      "timestamp": "2025-12-24T20:42:48.715730"
    },
    {
      "sample_id": "squad_5731070b05b4da19006bcce5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 653,
      "natural_ratio": 0.00498199462890625,
      "max_context_tokens": 131072,
      "context_length": 1185,
      "question_length": 81,
      "prediction": "在穆斯林信仰的书中，即《古兰经》中，记载每人都出生纯洁无瑕。",
      "reference": "in Islam every human child is born pure and immaculate",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.26884889602661133,
      "timestamp": "2025-12-24T20:42:48.984737"
    },
    {
      "sample_id": "squad_572b8a3b34ae481900deae87",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 364,
      "natural_ratio": 0.002777099609375,
      "max_context_tokens": 131072,
      "context_length": 628,
      "question_length": 61,
      "prediction": "区分phonemic /l/和/ʎ/",
      "reference": "a distinction between phonemic /l/ and /ʎ/",
      "metrics": {
        "f1": 0.7272727272727272
      },
      "elapsed_time": 0.12316131591796875,
      "timestamp": "2025-12-24T20:42:49.108056"
    },
    {
      "sample_id": "squad_5726b6a0f1498d1400e8e87f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 330,
      "natural_ratio": 0.0025177001953125,
      "max_context_tokens": 131072,
      "context_length": 528,
      "question_length": 92,
      "prediction": "显著的经济混乱",
      "reference": "significant disruption",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06325292587280273,
      "timestamp": "2025-12-24T20:42:49.171476"
    },
    {
      "sample_id": "squad_5727b735ff5b5019007d933e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 424,
      "natural_ratio": 0.00323486328125,
      "max_context_tokens": 131072,
      "context_length": 742,
      "question_length": 66,
      "prediction": "ISO-8859-1",
      "reference": "ISO-8859-1",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.10414910316467285,
      "timestamp": "2025-12-24T20:42:49.275774"
    },
    {
      "sample_id": "squad_570b43ccec8fbc190045b93a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 397,
      "natural_ratio": 0.00302886962890625,
      "max_context_tokens": 131072,
      "context_length": 719,
      "question_length": 36,
      "prediction": "General Elwell Otis",
      "reference": "General Elwell Otis",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07462549209594727,
      "timestamp": "2025-12-24T20:42:49.350549"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8081",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 108468,
      "natural_ratio": 0.827545166015625,
      "max_context_tokens": 131072,
      "context_length": 216815,
      "question_length": 81,
      "prediction": "Thor",
      "reference": "The Hulk",
      "metrics": {
        "f1": 0.36363636363636365
      },
      "elapsed_time": 3.6827917098999023,
      "timestamp": "2025-12-24T20:42:53.033727"
    },
    {
      "sample_id": "squad_5727cd5c2ca10214002d96bc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 362,
      "natural_ratio": 0.0027618408203125,
      "max_context_tokens": 131072,
      "context_length": 629,
      "question_length": 55,
      "prediction": "One Detroit Center的建筑风格是后现代哥特式。",
      "reference": "post modern Neo-Gothic spires",
      "metrics": {
        "f1": 0.4864864864864865
      },
      "elapsed_time": 0.16220355033874512,
      "timestamp": "2025-12-24T20:42:53.196513"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14771",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101476,
      "natural_ratio": 0.774200439453125,
      "max_context_tokens": 131072,
      "context_length": 202850,
      "question_length": 62,
      "prediction": "她的头",
      "reference": "A chainsaw",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.029199600219727,
      "timestamp": "2025-12-24T20:42:57.226089"
    },
    {
      "sample_id": "squad_5726a2b75951b619008f7875",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 615,
      "natural_ratio": 0.00469207763671875,
      "max_context_tokens": 131072,
      "context_length": 1131,
      "question_length": 60,
      "prediction": "ecoParq",
      "reference": "ecoParq",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06525063514709473,
      "timestamp": "2025-12-24T20:42:57.291546"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1664",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 91294,
      "natural_ratio": 0.6965179443359375,
      "max_context_tokens": 131072,
      "context_length": 182517,
      "question_length": 31,
      "prediction": "Frank Carter的葬礼。",
      "reference": "Frank",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.17992234230041504,
      "timestamp": "2025-12-24T20:42:57.471769"
    },
    {
      "sample_id": "squad_5727faef3acd2414000df155",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 538,
      "natural_ratio": 0.0041046142578125,
      "max_context_tokens": 131072,
      "context_length": 976,
      "question_length": 60,
      "prediction": " Gupta Empire",
      "reference": "Gupta",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.04767203330993652,
      "timestamp": "2025-12-24T20:42:57.519601"
    },
    {
      "sample_id": "squad_56e152d7cd28a01900c677c8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 509,
      "natural_ratio": 0.00388336181640625,
      "max_context_tokens": 131072,
      "context_length": 943,
      "question_length": 35,
      "prediction": "Harvard University",
      "reference": "Harvard University",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.056772708892822266,
      "timestamp": "2025-12-24T20:42:57.576525"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1268",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 71196,
      "natural_ratio": 0.543182373046875,
      "max_context_tokens": 131072,
      "context_length": 142299,
      "question_length": 54,
      "prediction": "Theodore gives her a portrait.",
      "reference": "The portrait of his wife, Augustine",
      "metrics": {
        "f1": 0.7333333333333334
      },
      "elapsed_time": 1.992063283920288,
      "timestamp": "2025-12-24T20:42:59.568873"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11235",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 119800,
      "natural_ratio": 0.91400146484375,
      "max_context_tokens": 131072,
      "context_length": 239521,
      "question_length": 39,
      "prediction": "Snakes and Spiders",
      "reference": "Spider and Snakes",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 4.3827924728393555,
      "timestamp": "2025-12-24T20:43:03.952006"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17595",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 104835,
      "natural_ratio": 0.7998275756835938,
      "max_context_tokens": 131072,
      "context_length": 209596,
      "question_length": 34,
      "prediction": "因为Cyril和朋友们在校园里挑衅了Rod所在的团队。",
      "reference": "Rod finds out Dave has serenaded Katherine and he mistakes Cyril for Dave and beats up the wrong person.",
      "metrics": {
        "f1": 0.2857142857142857
      },
      "elapsed_time": 3.748267650604248,
      "timestamp": "2025-12-24T20:43:07.700981"
    },
    {
      "sample_id": "squad_572aa0a1be1ee31400cb80d7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 560,
      "natural_ratio": 0.0042724609375,
      "max_context_tokens": 131072,
      "context_length": 947,
      "question_length": 134,
      "prediction": " governor-general",
      "reference": "colonies such as Canada, Australia, New Zealand, the Union of South Africa, and Newfoundland",
      "metrics": {
        "f1": 0.4444444444444444
      },
      "elapsed_time": 0.05125761032104492,
      "timestamp": "2025-12-24T20:43:07.752432"
    },
    {
      "sample_id": "squad_5727b2134b864d1900163aa1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 347,
      "natural_ratio": 0.00264739990234375,
      "max_context_tokens": 131072,
      "context_length": 574,
      "question_length": 80,
      "prediction": "faceting of form, spatial ambiguity, transparency, and multiplicity",
      "reference": "faceting of form, spatial ambiguity, transparency, and multiplicity.",
      "metrics": {
        "f1": 0.975609756097561
      },
      "elapsed_time": 0.1544017791748047,
      "timestamp": "2025-12-24T20:43:07.907001"
    },
    {
      "sample_id": "squad_572643ee1125e71900ae1915",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 320,
      "natural_ratio": 0.00244140625,
      "max_context_tokens": 131072,
      "context_length": 527,
      "question_length": 74,
      "prediction": "99%",
      "reference": "99 percent",
      "metrics": {
        "f1": 0.2
      },
      "elapsed_time": 0.05399274826049805,
      "timestamp": "2025-12-24T20:43:07.961148"
    },
    {
      "sample_id": "squad_5709b0aaed30961900e8441c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 602,
      "natural_ratio": 0.0045928955078125,
      "max_context_tokens": 131072,
      "context_length": 1098,
      "question_length": 66,
      "prediction": " Petrochemical refineries",
      "reference": "Petrochemical",
      "metrics": {
        "f1": 0.7027027027027027
      },
      "elapsed_time": 0.0654609203338623,
      "timestamp": "2025-12-24T20:43:08.026757"
    },
    {
      "sample_id": "squad_56cee14aaab44d1400b88bcb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 377,
      "natural_ratio": 0.00287628173828125,
      "max_context_tokens": 131072,
      "context_length": 681,
      "question_length": 33,
      "prediction": "Liu Shaokun",
      "reference": "Liu Shaokun",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07432436943054199,
      "timestamp": "2025-12-24T20:43:08.101228"
    },
    {
      "sample_id": "squad_572ff927947a6a140053cebf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 365,
      "natural_ratio": 0.00278472900390625,
      "max_context_tokens": 131072,
      "context_length": 608,
      "question_length": 83,
      "prediction": "大约在公元前4395年。",
      "reference": "the 4th millennium BC",
      "metrics": {
        "f1": 0.08333333333333333
      },
      "elapsed_time": 0.11389279365539551,
      "timestamp": "2025-12-24T20:43:08.215275"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21497",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 82715,
      "natural_ratio": 0.6310653686523438,
      "max_context_tokens": 131072,
      "context_length": 165323,
      "question_length": 67,
      "prediction": "Abbe Troubert",
      "reference": "Troubert",
      "metrics": {
        "f1": 0.761904761904762
      },
      "elapsed_time": 2.4522883892059326,
      "timestamp": "2025-12-24T20:43:10.667933"
    },
    {
      "sample_id": "squad_56e136e7cd28a01900c676bc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 810,
      "natural_ratio": 0.0061798095703125,
      "max_context_tokens": 131072,
      "context_length": 1501,
      "question_length": 79,
      "prediction": "tidal areas",
      "reference": "tidal areas",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06378555297851562,
      "timestamp": "2025-12-24T20:43:10.731906"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28882",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113799,
      "natural_ratio": 0.8682174682617188,
      "max_context_tokens": 131072,
      "context_length": 227515,
      "question_length": 44,
      "prediction": "因为Elektra亲手杀死了自己的父亲，并且她利用了Renard来复仇，她并不像表面上那样无辜。",
      "reference": "Renard drops a hint that he and Elektra are collaborating. ",
      "metrics": {
        "f1": 0.2857142857142857
      },
      "elapsed_time": 0.4366335868835449,
      "timestamp": "2025-12-24T20:43:11.168875"
    },
    {
      "sample_id": "squad_56f7f171aef2371900625c9f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 123,
      "natural_ratio": 0.00093841552734375,
      "max_context_tokens": 131072,
      "context_length": 167,
      "question_length": 39,
      "prediction": "印度",
      "reference": "India",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.036392927169799805,
      "timestamp": "2025-12-24T20:43:11.205802"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17571",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 71299,
      "natural_ratio": 0.5439682006835938,
      "max_context_tokens": 131072,
      "context_length": 142518,
      "question_length": 40,
      "prediction": "他囚禁了年轻的国王和妹妹克莉奥佩特拉。",
      "reference": "Cleopatra",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.2847871780395508,
      "timestamp": "2025-12-24T20:43:11.490877"
    },
    {
      "sample_id": "squad_56d369ca59d6e41400146324",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 358,
      "natural_ratio": 0.0027313232421875,
      "max_context_tokens": 131072,
      "context_length": 609,
      "question_length": 67,
      "prediction": "第九季",
      "reference": "season nine",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04688668251037598,
      "timestamp": "2025-12-24T20:43:11.537938"
    },
    {
      "sample_id": "squad_5728dacc2ca10214002da9a9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 363,
      "natural_ratio": 0.00276947021484375,
      "max_context_tokens": 131072,
      "context_length": 640,
      "question_length": 47,
      "prediction": "四所",
      "reference": "four",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.0442960262298584,
      "timestamp": "2025-12-24T20:43:11.582382"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22615",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122153,
      "natural_ratio": 0.9319534301757812,
      "max_context_tokens": 131072,
      "context_length": 244211,
      "question_length": 55,
      "prediction": "CLYDE和BONNIE在Arcadia被弗兰克·哈默及其手下追捕，最终在Arcadia的街道上被包围和枪杀。",
      "reference": "They are captured after stopping on the side of the road to help Ivan Moss fix a flat tire, where the police are hiding in the bushes.",
      "metrics": {
        "f1": 0.34920634920634924
      },
      "elapsed_time": 4.903658151626587,
      "timestamp": "2025-12-24T20:43:16.486415"
    },
    {
      "sample_id": "squad_572e904bdfa6aa1500f8d148",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 336,
      "natural_ratio": 0.0025634765625,
      "max_context_tokens": 131072,
      "context_length": 597,
      "question_length": 36,
      "prediction": "肺结核（或 Tuberculosis）",
      "reference": "tuberculosis",
      "metrics": {
        "f1": 0.7741935483870968
      },
      "elapsed_time": 0.11058712005615234,
      "timestamp": "2025-12-24T20:43:16.597542"
    },
    {
      "sample_id": "squad_572ed5ccdfa6aa1500f8d428",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 622,
      "natural_ratio": 0.0047454833984375,
      "max_context_tokens": 131072,
      "context_length": 1147,
      "question_length": 58,
      "prediction": "俄罗斯皇帝被其妻子凯瑟琳推翻，结束了与普鲁士的联盟并退出了战争。",
      "reference": "Catherine, who ended Russia's alliance with Prussia and withdrew from the war",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.26305365562438965,
      "timestamp": "2025-12-24T20:43:16.860820"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15232",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 124001,
      "natural_ratio": 0.9460525512695312,
      "max_context_tokens": 131072,
      "context_length": 247925,
      "question_length": 38,
      "prediction": "通过热成像定位 prey 的体温。",
      "reference": "By heat",
      "metrics": {
        "f1": 0.26086956521739124
      },
      "elapsed_time": 3.9598958492279053,
      "timestamp": "2025-12-24T20:43:20.821107"
    },
    {
      "sample_id": "squad_572714d2708984140094d977",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 382,
      "natural_ratio": 0.0029144287109375,
      "max_context_tokens": 131072,
      "context_length": 643,
      "question_length": 81,
      "prediction": "消费致癌的腌制食品和寄生虫感染。",
      "reference": "consumption of carcinogenic preserved foods",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.15051531791687012,
      "timestamp": "2025-12-24T20:43:20.972186"
    },
    {
      "sample_id": "squad_570d45acb3d812140066d612",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 308,
      "natural_ratio": 0.002349853515625,
      "max_context_tokens": 131072,
      "context_length": 529,
      "question_length": 48,
      "prediction": "在瓦伦西亚市中心。",
      "reference": "centre of the city",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.08494997024536133,
      "timestamp": "2025-12-24T20:43:21.057321"
    },
    {
      "sample_id": "squad_56dfbb8c231d4119001abd3c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 868,
      "natural_ratio": 0.006622314453125,
      "max_context_tokens": 131072,
      "context_length": 1637,
      "question_length": 60,
      "prediction": "municipal broadband",
      "reference": "municipal broadband",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07075095176696777,
      "timestamp": "2025-12-24T20:43:21.128237"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10101",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 52267,
      "natural_ratio": 0.39876556396484375,
      "max_context_tokens": 131072,
      "context_length": 104464,
      "question_length": 31,
      "prediction": "Withnail is arrested for driving under the influence.",
      "reference": "Driving under the influence",
      "metrics": {
        "f1": 0.6749999999999999
      },
      "elapsed_time": 1.5601816177368164,
      "timestamp": "2025-12-24T20:43:22.688737"
    },
    {
      "sample_id": "squad_56ddf08d66d3e219004dae35",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 231,
      "natural_ratio": 0.00176239013671875,
      "max_context_tokens": 131072,
      "context_length": 386,
      "question_length": 37,
      "prediction": "约公元前2000年",
      "reference": "2000 BC",
      "metrics": {
        "f1": 0.3333333333333333
      },
      "elapsed_time": 0.10686302185058594,
      "timestamp": "2025-12-24T20:43:22.795778"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20679",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101054,
      "natural_ratio": 0.7709808349609375,
      "max_context_tokens": 131072,
      "context_length": 202029,
      "question_length": 39,
      "prediction": "Glen Whitehouse",
      "reference": "Their father, Glen Whitehouse",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 2.2971811294555664,
      "timestamp": "2025-12-24T20:43:25.093322"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21756",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113948,
      "natural_ratio": 0.869354248046875,
      "max_context_tokens": 131072,
      "context_length": 227820,
      "question_length": 37,
      "prediction": "根据提供的剧本内容，Joe在Shanghai的身份是Looper（时间循环杀手）。",
      "reference": "He is a hitman.",
      "metrics": {
        "f1": 0.26666666666666666
      },
      "elapsed_time": 0.35064101219177246,
      "timestamp": "2025-12-24T20:43:25.444321"
    },
    {
      "sample_id": "squad_572e8035cb0c0d14000f11e0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 466,
      "natural_ratio": 0.0035552978515625,
      "max_context_tokens": 131072,
      "context_length": 844,
      "question_length": 49,
      "prediction": "英国和法国",
      "reference": "British and French forces",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06063508987426758,
      "timestamp": "2025-12-24T20:43:25.505487"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32251",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105026,
      "natural_ratio": 0.8012847900390625,
      "max_context_tokens": 131072,
      "context_length": 209986,
      "question_length": 27,
      "prediction": "11岁",
      "reference": "Fifteen ",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 7.016922473907471,
      "timestamp": "2025-12-24T20:43:32.522746"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16399",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 82792,
      "natural_ratio": 0.63165283203125,
      "max_context_tokens": 131072,
      "context_length": 165473,
      "question_length": 72,
      "prediction": "因为Ellida在与他人的约定之后，认为与这位陌生人之间的约定也结束了。",
      "reference": "Because he murdered a captain and had to escape.",
      "metrics": {
        "f1": 0.17777777777777778
      },
      "elapsed_time": 3.0004680156707764,
      "timestamp": "2025-12-24T20:43:35.523620"
    },
    {
      "sample_id": "squad_56f7d4f7aef2371900625c24",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 223,
      "natural_ratio": 0.00170135498046875,
      "max_context_tokens": 131072,
      "context_length": 357,
      "question_length": 50,
      "prediction": "成为wealthier gentry的佃农",
      "reference": "to become tenants of the wealthier gentry.",
      "metrics": {
        "f1": 0.6666666666666667
      },
      "elapsed_time": 0.10777854919433594,
      "timestamp": "2025-12-24T20:43:35.631594"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22266",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101299,
      "natural_ratio": 0.7728500366210938,
      "max_context_tokens": 131072,
      "context_length": 202518,
      "question_length": 41,
      "prediction": "因为Valentine假装疯癫来拖延签署财产转让文书，这让他的父亲感到被愚弄和愤怒。",
      "reference": "He spends too much money and is now in debt.",
      "metrics": {
        "f1": 0.17543859649122806
      },
      "elapsed_time": 4.51965856552124,
      "timestamp": "2025-12-24T20:43:40.151664"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22976",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101295,
      "natural_ratio": 0.7728195190429688,
      "max_context_tokens": 131072,
      "context_length": 202501,
      "question_length": 50,
      "prediction": "因为他在工作中表现得过于投入，以至于忘记了时间，导致客户的食物被狗尿弄湿。",
      "reference": "He delivered the dogs late to the dog show and got them dirty.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.6859099864959717,
      "timestamp": "2025-12-24T20:43:43.837952"
    },
    {
      "sample_id": "squad_5726acfa5951b619008f79c9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 469,
      "natural_ratio": 0.00357818603515625,
      "max_context_tokens": 131072,
      "context_length": 843,
      "question_length": 55,
      "prediction": "30 July",
      "reference": "30 July",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06305241584777832,
      "timestamp": "2025-12-24T20:43:43.901191"
    },
    {
      "sample_id": "squad_5727b3b72ca10214002d944c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 312,
      "natural_ratio": 0.00238037109375,
      "max_context_tokens": 131072,
      "context_length": 520,
      "question_length": 64,
      "prediction": "传统字符编码无法支持多种语言的计算机处理（任意-script的混合处理）。",
      "reference": "wide usage in various countries of the world but remain largely incompatible with each other",
      "metrics": {
        "f1": 0.21428571428571427
      },
      "elapsed_time": 0.19153690338134766,
      "timestamp": "2025-12-24T20:43:44.092899"
    },
    {
      "sample_id": "squad_57264ac7f1498d1400e8db57",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 459,
      "natural_ratio": 0.00350189208984375,
      "max_context_tokens": 131072,
      "context_length": 814,
      "question_length": 65,
      "prediction": " affluent middle-class",
      "reference": "the affluent middle-class",
      "metrics": {
        "f1": 0.9655172413793104
      },
      "elapsed_time": 0.05474138259887695,
      "timestamp": "2025-12-24T20:43:44.147796"
    },
    {
      "sample_id": "squad_572734eb5951b619008f86b9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 360,
      "natural_ratio": 0.00274658203125,
      "max_context_tokens": 131072,
      "context_length": 630,
      "question_length": 51,
      "prediction": "1980-81",
      "reference": "didn't celebrate its centenary year until 1980–81",
      "metrics": {
        "f1": 0.32000000000000006
      },
      "elapsed_time": 0.09477567672729492,
      "timestamp": "2025-12-24T20:43:44.242729"
    },
    {
      "sample_id": "squad_56de739f4396321400ee292d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 520,
      "natural_ratio": 0.00396728515625,
      "max_context_tokens": 131072,
      "context_length": 935,
      "question_length": 66,
      "prediction": "orthodoxy",
      "reference": "orthodoxy",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04611063003540039,
      "timestamp": "2025-12-24T20:43:44.288987"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32378",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 108947,
      "natural_ratio": 0.8311996459960938,
      "max_context_tokens": 131072,
      "context_length": 217801,
      "question_length": 53,
      "prediction": "Faust and Gretchen have a romantic relationship, though it is complicated by Gretchen's marriage to another man and her eventual condemnation for her actions.",
      "reference": "They are lovers.",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 4.4917871952056885,
      "timestamp": "2025-12-24T20:43:48.781107"
    },
    {
      "sample_id": "squad_572fcc11947a6a140053ccd3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 360,
      "natural_ratio": 0.00274658203125,
      "max_context_tokens": 131072,
      "context_length": 615,
      "question_length": 65,
      "prediction": "Spirochaetes of the genus Borrelia",
      "reference": "Spirochaetes of the genus Borrelia",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.1311657428741455,
      "timestamp": "2025-12-24T20:43:48.912813"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16959",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 61932,
      "natural_ratio": 0.472503662109375,
      "max_context_tokens": 131072,
      "context_length": 123786,
      "question_length": 38,
      "prediction": "二十磅",
      "reference": "Twenty Pounds.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 1.7290968894958496,
      "timestamp": "2025-12-24T20:43:50.642210"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1543",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 119841,
      "natural_ratio": 0.9143142700195312,
      "max_context_tokens": 131072,
      "context_length": 239611,
      "question_length": 31,
      "prediction": "The angel was an angel of Art.",
      "reference": "Angel of Italian Art",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 4.550123929977417,
      "timestamp": "2025-12-24T20:43:55.192695"
    },
    {
      "sample_id": "squad_5730dfc8b7151e1900c0159f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 474,
      "natural_ratio": 0.0036163330078125,
      "max_context_tokens": 131072,
      "context_length": 867,
      "question_length": 42,
      "prediction": "to achieve military, diplomatic, informational, and/or economic objectives",
      "reference": "achieve military, diplomatic, informational, and/or economic objectives",
      "metrics": {
        "f1": 0.9793103448275862
      },
      "elapsed_time": 0.15235280990600586,
      "timestamp": "2025-12-24T20:43:55.345592"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28971",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 36686,
      "natural_ratio": 0.2798919677734375,
      "max_context_tokens": 131072,
      "context_length": 73259,
      "question_length": 73,
      "prediction": "Tweel帮助地球人抵御梦魇的攻击。",
      "reference": "Tweel.",
      "metrics": {
        "f1": 0.3636363636363636
      },
      "elapsed_time": 0.9731988906860352,
      "timestamp": "2025-12-24T20:43:56.319092"
    },
    {
      "sample_id": "squad_570b3980ec8fbc190045b8e8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 1047,
      "natural_ratio": 0.00798797607421875,
      "max_context_tokens": 131072,
      "context_length": 1979,
      "question_length": 76,
      "prediction": "战争 of1812",
      "reference": "the War of 1812",
      "metrics": {
        "f1": 0.6
      },
      "elapsed_time": 0.09598422050476074,
      "timestamp": "2025-12-24T20:43:56.415304"
    },
    {
      "sample_id": "squad_5728eae64b864d190016508f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 315,
      "natural_ratio": 0.00240325927734375,
      "max_context_tokens": 131072,
      "context_length": 553,
      "question_length": 37,
      "prediction": "在现代日本的日常生活和现代日本武道中。",
      "reference": "everyday life and in modern Japanese martial arts",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1467430591583252,
      "timestamp": "2025-12-24T20:43:56.562257"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_27787",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96974,
      "natural_ratio": 0.7398529052734375,
      "max_context_tokens": 131072,
      "context_length": 193837,
      "question_length": 71,
      "prediction": "Timmy Bateman returned as a zombie, laughing and laughing, and eventually burned to death by his father.",
      "reference": "he comes back as an evil zombie",
      "metrics": {
        "f1": 0.7027027027027029
      },
      "elapsed_time": 4.868615388870239,
      "timestamp": "2025-12-24T20:44:01.431239"
    },
    {
      "sample_id": "squad_570d2095b3d812140066d45e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 296,
      "natural_ratio": 0.00225830078125,
      "max_context_tokens": 131072,
      "context_length": 509,
      "question_length": 43,
      "prediction": "1520-1522年",
      "reference": "1520–1522",
      "metrics": {
        "f1": 0.7272727272727272
      },
      "elapsed_time": 0.12998151779174805,
      "timestamp": "2025-12-24T20:44:01.561412"
    },
    {
      "sample_id": "squad_57290ea56aef0514001549fd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 375,
      "natural_ratio": 0.00286102294921875,
      "max_context_tokens": 131072,
      "context_length": 662,
      "question_length": 49,
      "prediction": "提供批评或比较，将产品的状态和行为与oracle（原则或机制）进行对比。",
      "reference": "furnishes a criticism or comparison",
      "metrics": {
        "f1": 0.20833333333333331
      },
      "elapsed_time": 0.22441959381103516,
      "timestamp": "2025-12-24T20:44:01.786006"
    },
    {
      "sample_id": "squad_570dcb0c0dc6ce1900204cbe",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 451,
      "natural_ratio": 0.00344085693359375,
      "max_context_tokens": 131072,
      "context_length": 783,
      "question_length": 79,
      "prediction": "decline",
      "reference": "decline",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.048685312271118164,
      "timestamp": "2025-12-24T20:44:01.834841"
    },
    {
      "sample_id": "squad_57262c2c271a42140099d6f2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 294,
      "natural_ratio": 0.0022430419921875,
      "max_context_tokens": 131072,
      "context_length": 517,
      "question_length": 31,
      "prediction": "Professional wrestling is an athletic form of entertainment based on a portrayal of a combat sport, taking the form of live events held by touring promotions and portraying a unique style of combat based on a combination of classical wrestling, catch wrestling, various forms of martial arts, and an innovative style based on grappling, striking, and aerialism. Various forms of weaponry are sometimes used.",
      "reference": "an athletic form of entertainment based on a portrayal of a combat sport",
      "metrics": {
        "f1": 0.3773584905660377
      },
      "elapsed_time": 0.7497479915618896,
      "timestamp": "2025-12-24T20:44:02.584759"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28306",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 55784,
      "natural_ratio": 0.42559814453125,
      "max_context_tokens": 131072,
      "context_length": 111486,
      "question_length": 42,
      "prediction": "他们经历了一系列事件，包括时钟倒转、建筑物下沉进入第四维度、时间倒流、与土著居民交易获取食物、以及最终通过人工刺激地下热泉返回现代时间。在这一过程中，他们还经历了食物短缺、恐慌和救援工作。",
      "reference": "They were stranded in pre- Columbian Manhattan",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 1.9337944984436035,
      "timestamp": "2025-12-24T20:44:04.518874"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_12422",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 38924,
      "natural_ratio": 0.296966552734375,
      "max_context_tokens": 131072,
      "context_length": 77754,
      "question_length": 55,
      "prediction": "Snake和Maggie等人将总统救出。",
      "reference": "the Duke of New York",
      "metrics": {
        "f1": 0.20000000000000004
      },
      "elapsed_time": 1.3619563579559326,
      "timestamp": "2025-12-24T20:44:05.881164"
    },
    {
      "sample_id": "squad_570dae6016d0071400510cc9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 423,
      "natural_ratio": 0.00322723388671875,
      "max_context_tokens": 131072,
      "context_length": 714,
      "question_length": 93,
      "prediction": "身体不满意",
      "reference": "body dissatisfaction",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.049308061599731445,
      "timestamp": "2025-12-24T20:44:05.930697"
    },
    {
      "sample_id": "squad_57267077dd62a815002e84e6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 353,
      "natural_ratio": 0.00269317626953125,
      "max_context_tokens": 131072,
      "context_length": 623,
      "question_length": 44,
      "prediction": "When a wrestler enters the ring, they are accompanied by music and may add other elements to their entrance.",
      "reference": "accompanied by music",
      "metrics": {
        "f1": 0.3125
      },
      "elapsed_time": 0.23819756507873535,
      "timestamp": "2025-12-24T20:44:06.169119"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22506",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 62482,
      "natural_ratio": 0.4766998291015625,
      "max_context_tokens": 131072,
      "context_length": 124868,
      "question_length": 56,
      "prediction": "Richard is at the Hall when he learns his grandfather is dead.",
      "reference": "America",
      "metrics": {
        "f1": 0.45454545454545453
      },
      "elapsed_time": 1.87986421585083,
      "timestamp": "2025-12-24T20:44:08.049285"
    },
    {
      "sample_id": "squad_5726b9d05951b619008f7beb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 506,
      "natural_ratio": 0.0038604736328125,
      "max_context_tokens": 131072,
      "context_length": 877,
      "question_length": 96,
      "prediction": "提供能量",
      "reference": "provide energy",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04764819145202637,
      "timestamp": "2025-12-24T20:44:08.097123"
    },
    {
      "sample_id": "squad_572c0399f182dd1900d7c7b9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 392,
      "natural_ratio": 0.00299072265625,
      "max_context_tokens": 131072,
      "context_length": 717,
      "question_length": 28,
      "prediction": "Leonard Mlodinow",
      "reference": "Leonard Mlodinow",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08422493934631348,
      "timestamp": "2025-12-24T20:44:08.181508"
    },
    {
      "sample_id": "squad_570971e6ed30961900e84158",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 506,
      "natural_ratio": 0.0038604736328125,
      "max_context_tokens": 131072,
      "context_length": 901,
      "question_length": 72,
      "prediction": "一个恶意的紧凑型光盘",
      "reference": "a malicious compact disc in a car's stereo system",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.0945289134979248,
      "timestamp": "2025-12-24T20:44:08.276195"
    },
    {
      "sample_id": "squad_56cf67c74df3c31400b0d72f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 494,
      "natural_ratio": 0.0037689208984375,
      "max_context_tokens": 131072,
      "context_length": 884,
      "question_length": 64,
      "prediction": "Chopin的钢琴协奏曲第一号（E小调）的慢板（Larghetto）乐章。",
      "reference": "Piano Concerto No. 1 (in E minor)",
      "metrics": {
        "f1": 0.4
      },
      "elapsed_time": 0.2918422222137451,
      "timestamp": "2025-12-24T20:44:08.568198"
    },
    {
      "sample_id": "squad_573201d7b9d445190005e717",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 369,
      "natural_ratio": 0.00281524658203125,
      "max_context_tokens": 131072,
      "context_length": 651,
      "question_length": 48,
      "prediction": "罗马元老院",
      "reference": "the senate",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06464314460754395,
      "timestamp": "2025-12-24T20:44:08.632991"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5354",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113207,
      "natural_ratio": 0.8637008666992188,
      "max_context_tokens": 131072,
      "context_length": 226304,
      "question_length": 70,
      "prediction": "通过提出成为她的亲属来获得她手中的信件，尽管这涉及到道德和情感上的复杂考量。",
      "reference": "by marrying her",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.084240198135376,
      "timestamp": "2025-12-24T20:44:12.717584"
    },
    {
      "sample_id": "squad_5726d39add62a815002e9195",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 432,
      "natural_ratio": 0.0032958984375,
      "max_context_tokens": 131072,
      "context_length": 744,
      "question_length": 80,
      "prediction": "莫斯科",
      "reference": "Moscow",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.05808830261230469,
      "timestamp": "2025-12-24T20:44:12.776220"
    },
    {
      "sample_id": "squad_5728ff31af94a219006a9f07",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 445,
      "natural_ratio": 0.00339508056640625,
      "max_context_tokens": 131072,
      "context_length": 797,
      "question_length": 54,
      "prediction": "19世纪",
      "reference": "19th",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 0.055814266204833984,
      "timestamp": "2025-12-24T20:44:12.832210"
    },
    {
      "sample_id": "squad_573059f08ab72b1400f9c491",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 380,
      "natural_ratio": 0.002899169921875,
      "max_context_tokens": 131072,
      "context_length": 667,
      "question_length": 54,
      "prediction": "Greek",
      "reference": "Greek",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03430008888244629,
      "timestamp": "2025-12-24T20:44:12.866668"
    },
    {
      "sample_id": "squad_5727c8104b864d1900163cfe",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 392,
      "natural_ratio": 0.00299072265625,
      "max_context_tokens": 131072,
      "context_length": 662,
      "question_length": 82,
      "prediction": "不再假设哲学是人类的自然完善。",
      "reference": "no longer assuming that philosophy was man's natural perfection",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1141660213470459,
      "timestamp": "2025-12-24T20:44:12.980989"
    },
    {
      "sample_id": "squad_57264659708984140094c11f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 473,
      "natural_ratio": 0.00360870361328125,
      "max_context_tokens": 131072,
      "context_length": 857,
      "question_length": 50,
      "prediction": "希腊岛屿和城市",
      "reference": "Greek Isles and cities",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06492090225219727,
      "timestamp": "2025-12-24T20:44:13.046059"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_6012",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 106847,
      "natural_ratio": 0.8151779174804688,
      "max_context_tokens": 131072,
      "context_length": 213615,
      "question_length": 39,
      "prediction": "5给9的是2的旧手杖。",
      "reference": "A light bulb.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.863924026489258,
      "timestamp": "2025-12-24T20:44:16.910319"
    },
    {
      "sample_id": "squad_5709986aed30961900e8436a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 618,
      "natural_ratio": 0.0047149658203125,
      "max_context_tokens": 131072,
      "context_length": 1110,
      "question_length": 87,
      "prediction": "Established Men",
      "reference": "Established Men",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06406593322753906,
      "timestamp": "2025-12-24T20:44:16.974932"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_12911",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 114988,
      "natural_ratio": 0.877288818359375,
      "max_context_tokens": 131072,
      "context_length": 229886,
      "question_length": 51,
      "prediction": "从浴室传来的。",
      "reference": "An audiotape left by Muffy",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.127747058868408,
      "timestamp": "2025-12-24T20:44:20.103025"
    },
    {
      "sample_id": "squad_57101eefa58dae1900cd68b4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 511,
      "natural_ratio": 0.00389862060546875,
      "max_context_tokens": 131072,
      "context_length": 924,
      "question_length": 59,
      "prediction": "因为这样会导致有价值的信息丢失。",
      "reference": "A person who has only predominantly same sex reactions is different from someone with relatively little reaction but lots of same sex experience",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.10077023506164551,
      "timestamp": "2025-12-24T20:44:20.204356"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28106",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 72205,
      "natural_ratio": 0.5508804321289062,
      "max_context_tokens": 131072,
      "context_length": 144336,
      "question_length": 35,
      "prediction": "文本中并未提到Mumbles在澳大利亚被关押的情况。",
      "reference": "Marine World.",
      "metrics": {
        "f1": 0.16216216216216217
      },
      "elapsed_time": 2.259310007095337,
      "timestamp": "2025-12-24T20:44:22.464007"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4286",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 92059,
      "natural_ratio": 0.7023544311523438,
      "max_context_tokens": 131072,
      "context_length": 184037,
      "question_length": 41,
      "prediction": "伦敦的世界",
      "reference": "The Lost World",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.17337942123413086,
      "timestamp": "2025-12-24T20:44:22.637703"
    },
    {
      "sample_id": "squad_56e151c2cd28a01900c677c1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 324,
      "natural_ratio": 0.002471923828125,
      "max_context_tokens": 131072,
      "context_length": 554,
      "question_length": 54,
      "prediction": "New England Conservatory",
      "reference": "the New England Conservatory",
      "metrics": {
        "f1": 0.967741935483871
      },
      "elapsed_time": 0.06658458709716797,
      "timestamp": "2025-12-24T20:44:22.704459"
    },
    {
      "sample_id": "squad_5726d198dd62a815002e9144",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 484,
      "natural_ratio": 0.003692626953125,
      "max_context_tokens": 131072,
      "context_length": 857,
      "question_length": 72,
      "prediction": "英国和法国",
      "reference": "UK and France",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.055062294006347656,
      "timestamp": "2025-12-24T20:44:22.759680"
    },
    {
      "sample_id": "squad_56f959019e9bad19000a085e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 250,
      "natural_ratio": 0.0019073486328125,
      "max_context_tokens": 131072,
      "context_length": 392,
      "question_length": 69,
      "prediction": "1905年",
      "reference": "1905",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07053756713867188,
      "timestamp": "2025-12-24T20:44:22.830372"
    },
    {
      "sample_id": "squad_5723d1300dadf01500fa1f3c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 291,
      "natural_ratio": 0.00222015380859375,
      "max_context_tokens": 131072,
      "context_length": 513,
      "question_length": 29,
      "prediction": "1882年",
      "reference": "1882",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07435035705566406,
      "timestamp": "2025-12-24T20:44:22.904878"
    },
    {
      "sample_id": "squad_572971b83f37b319004783d9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 378,
      "natural_ratio": 0.0028839111328125,
      "max_context_tokens": 131072,
      "context_length": 643,
      "question_length": 74,
      "prediction": "The Stern Review made a strong economic case for investing in low carbon technologies.",
      "reference": "Stern Review",
      "metrics": {
        "f1": 0.25
      },
      "elapsed_time": 0.1739945411682129,
      "timestamp": "2025-12-24T20:44:23.079025"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4660",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115824,
      "natural_ratio": 0.8836669921875,
      "max_context_tokens": 131072,
      "context_length": 231558,
      "question_length": 50,
      "prediction": "Austin Powers用鞋攻击Random Task。",
      "reference": "His Penis Pump",
      "metrics": {
        "f1": 0.5517241379310346
      },
      "elapsed_time": 3.0388481616973877,
      "timestamp": "2025-12-24T20:44:26.118220"
    },
    {
      "sample_id": "squad_56e10565e3433e1400422adf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 697,
      "natural_ratio": 0.00531768798828125,
      "max_context_tokens": 131072,
      "context_length": 1312,
      "question_length": 43,
      "prediction": " tortoises",
      "reference": "tortoises",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06444072723388672,
      "timestamp": "2025-12-24T20:44:26.183196"
    },
    {
      "sample_id": "squad_573220fce99e3014001e653b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 564,
      "natural_ratio": 0.004302978515625,
      "max_context_tokens": 131072,
      "context_length": 1015,
      "question_length": 73,
      "prediction": "在帝国时期，在罗马建立了几个犹太会堂。",
      "reference": "synagogues",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1459810733795166,
      "timestamp": "2025-12-24T20:44:26.329345"
    },
    {
      "sample_id": "squad_5725e93b38643c19005ace6d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 510,
      "natural_ratio": 0.0038909912109375,
      "max_context_tokens": 131072,
      "context_length": 948,
      "question_length": 33,
      "prediction": "Antigonus II Mattathias",
      "reference": "Antigonus II Mattathias",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0953209400177002,
      "timestamp": "2025-12-24T20:44:26.424825"
    },
    {
      "sample_id": "squad_57293a886aef051400154ba1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 656,
      "natural_ratio": 0.0050048828125,
      "max_context_tokens": 131072,
      "context_length": 1203,
      "question_length": 69,
      "prediction": "卡尔·冯·克劳塞维茨",
      "reference": "Karl von Clausewitz",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.11048531532287598,
      "timestamp": "2025-12-24T20:44:26.535479"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22997",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101304,
      "natural_ratio": 0.77288818359375,
      "max_context_tokens": 131072,
      "context_length": 202501,
      "question_length": 67,
      "prediction": "她的手提包",
      "reference": "A suitcase with ransom money for her husband Bobby",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.2010200023651123,
      "timestamp": "2025-12-24T20:44:26.736847"
    },
    {
      "sample_id": "squad_57263c94271a42140099d7a7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 337,
      "natural_ratio": 0.00257110595703125,
      "max_context_tokens": 131072,
      "context_length": 597,
      "question_length": 37,
      "prediction": "因为惰性气体虽然减少了灯丝蒸发，但同时也传导热量，从而冷却灯丝，降低了效率。",
      "reference": "conducts heat from the filament, thereby cooling the filament",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.25830745697021484,
      "timestamp": "2025-12-24T20:44:26.995382"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26566",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122878,
      "natural_ratio": 0.9374847412109375,
      "max_context_tokens": 131072,
      "context_length": 245670,
      "question_length": 47,
      "prediction": "Joey, played by Pantucci, survived by fixing the remaining engine and creating noise to distract the creature, allowing Finnegan and Trillian to escape in the jet ski.",
      "reference": "He used Finnegan's surfboard to paddle ashore",
      "metrics": {
        "f1": 0.7555555555555556
      },
      "elapsed_time": 3.8033478260040283,
      "timestamp": "2025-12-24T20:44:30.799116"
    },
    {
      "sample_id": "squad_570e79e90b85d914000d7f2d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 748,
      "natural_ratio": 0.005706787109375,
      "max_context_tokens": 131072,
      "context_length": 1398,
      "question_length": 59,
      "prediction": "Yarra River",
      "reference": "Yarra",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.06495857238769531,
      "timestamp": "2025-12-24T20:44:30.864627"
    },
    {
      "sample_id": "squad_5727617f708984140094dcbd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 404,
      "natural_ratio": 0.003082275390625,
      "max_context_tokens": 131072,
      "context_length": 688,
      "question_length": 81,
      "prediction": "743年",
      "reference": "743",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 0.0653231143951416,
      "timestamp": "2025-12-24T20:44:30.930119"
    },
    {
      "sample_id": "squad_570c69aeb3d812140066d1e9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 446,
      "natural_ratio": 0.0034027099609375,
      "max_context_tokens": 131072,
      "context_length": 816,
      "question_length": 37,
      "prediction": "Magna Carta",
      "reference": "Magna Carta",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06557559967041016,
      "timestamp": "2025-12-24T20:44:30.995851"
    },
    {
      "sample_id": "squad_570c8d7cb3d812140066d21f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 427,
      "natural_ratio": 0.00325775146484375,
      "max_context_tokens": 131072,
      "context_length": 749,
      "question_length": 66,
      "prediction": "Catalans think the red/blue colors were chosen by Joan Gamper and are those of his home team, FC Basel.",
      "reference": "Joan Gamper",
      "metrics": {
        "f1": 0.19298245614035087
      },
      "elapsed_time": 0.2739064693450928,
      "timestamp": "2025-12-24T20:44:31.269912"
    },
    {
      "sample_id": "squad_57334b0b4776f4190066080e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 413,
      "natural_ratio": 0.00315093994140625,
      "max_context_tokens": 131072,
      "context_length": 731,
      "question_length": 56,
      "prediction": "约20亿美元",
      "reference": "under $20 billion",
      "metrics": {
        "f1": 0.21052631578947367
      },
      "elapsed_time": 0.06517291069030762,
      "timestamp": "2025-12-24T20:44:31.335234"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19726",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 123185,
      "natural_ratio": 0.9398269653320312,
      "max_context_tokens": 131072,
      "context_length": 246302,
      "question_length": 28,
      "prediction": "一群武装的暴徒（Raiders） raiding一个购物中心（mall），而不是一个住房项目。",
      "reference": "Roger DiMarco",
      "metrics": {
        "f1": 0.3636363636363636
      },
      "elapsed_time": 5.599550247192383,
      "timestamp": "2025-12-24T20:44:36.935197"
    },
    {
      "sample_id": "squad_572e8aaec246551400ce4316",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 380,
      "natural_ratio": 0.002899169921875,
      "max_context_tokens": 131072,
      "context_length": 629,
      "question_length": 91,
      "prediction": " vibrant new scenes",
      "reference": "vibrant new scenes",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.061063528060913086,
      "timestamp": "2025-12-24T20:44:36.996809"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16972",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 41301,
      "natural_ratio": 0.31510162353515625,
      "max_context_tokens": 131072,
      "context_length": 82487,
      "question_length": 76,
      "prediction": "将妮可莱特关在一间装饰精美的房间里，并派了一个老妇人陪伴她，还提供了食物和饮料。",
      "reference": "He locked her in a tower",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.351377010345459,
      "timestamp": "2025-12-24T20:44:37.348477"
    },
    {
      "sample_id": "squad_56f8e0b29e9bad19000a065e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 379,
      "natural_ratio": 0.00289154052734375,
      "max_context_tokens": 131072,
      "context_length": 648,
      "question_length": 71,
      "prediction": "零售和住房发展",
      "reference": "a retail and housing development",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06573820114135742,
      "timestamp": "2025-12-24T20:44:37.414388"
    },
    {
      "sample_id": "squad_5732674b0fdd8d15006c6a99",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 300,
      "natural_ratio": 0.002288818359375,
      "max_context_tokens": 131072,
      "context_length": 507,
      "question_length": 53,
      "prediction": "27次",
      "reference": "27",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.054152727127075195,
      "timestamp": "2025-12-24T20:44:37.468694"
    },
    {
      "sample_id": "squad_570b0c33ec8fbc190045b80c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 307,
      "natural_ratio": 0.00234222412109375,
      "max_context_tokens": 131072,
      "context_length": 525,
      "question_length": 49,
      "prediction": "28-或29天的二月或其他30天的月份",
      "reference": "short month",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.18384766578674316,
      "timestamp": "2025-12-24T20:44:37.652697"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15574",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 106796,
      "natural_ratio": 0.814788818359375,
      "max_context_tokens": 131072,
      "context_length": 213442,
      "question_length": 111,
      "prediction": "Rachel Banks",
      "reference": "Reynolds' bank account.",
      "metrics": {
        "f1": 0.7142857142857143
      },
      "elapsed_time": 3.6028623580932617,
      "timestamp": "2025-12-24T20:44:41.255895"
    },
    {
      "sample_id": "squad_572941d96aef051400154c11",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 316,
      "natural_ratio": 0.002410888671875,
      "max_context_tokens": 131072,
      "context_length": 538,
      "question_length": 55,
      "prediction": "葡萄牙语",
      "reference": "Portuguese",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04883527755737305,
      "timestamp": "2025-12-24T20:44:41.305278"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30185",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 103914,
      "natural_ratio": 0.7928009033203125,
      "max_context_tokens": 131072,
      "context_length": 207746,
      "question_length": 42,
      "prediction": "The murder of crows",
      "reference": "A murder of crows.",
      "metrics": {
        "f1": 0.8461538461538461
      },
      "elapsed_time": 4.25521445274353,
      "timestamp": "2025-12-24T20:44:45.560865"
    },
    {
      "sample_id": "squad_57269560dd62a815002e8a5b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 777,
      "natural_ratio": 0.00592803955078125,
      "max_context_tokens": 131072,
      "context_length": 1423,
      "question_length": 91,
      "prediction": "Clement Greenberg",
      "reference": "Clement Greenberg.",
      "metrics": {
        "f1": 0.9523809523809523
      },
      "elapsed_time": 0.08672499656677246,
      "timestamp": "2025-12-24T20:44:45.647794"
    },
    {
      "sample_id": "squad_57318db8497a881900249026",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 386,
      "natural_ratio": 0.0029449462890625,
      "max_context_tokens": 131072,
      "context_length": 667,
      "question_length": 65,
      "prediction": "19th世纪",
      "reference": "19th",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.06527948379516602,
      "timestamp": "2025-12-24T20:44:45.713251"
    },
    {
      "sample_id": "squad_573326c84776f4190066071e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 385,
      "natural_ratio": 0.00293731689453125,
      "max_context_tokens": 131072,
      "context_length": 652,
      "question_length": 78,
      "prediction": "因为这本书的体积庞大（超过2000页），目标读者群体狭窄（仅为专业数学家），导致出版成本超出预期。",
      "reference": "the three-volume work was so massive (more than 2,000 pages) and its audience so narrow (professional mathematicians)",
      "metrics": {
        "f1": 0.060606060606060615
      },
      "elapsed_time": 0.31421780586242676,
      "timestamp": "2025-12-24T20:44:46.027647"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7288",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93591,
      "natural_ratio": 0.7140426635742188,
      "max_context_tokens": 131072,
      "context_length": 187078,
      "question_length": 64,
      "prediction": "Kate Hannah lies to her students that she is pregnant after she throws up.",
      "reference": "That she is pregnant.",
      "metrics": {
        "f1": 0.7741935483870968
      },
      "elapsed_time": 2.8419082164764404,
      "timestamp": "2025-12-24T20:44:48.869938"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4828",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122379,
      "natural_ratio": 0.9336776733398438,
      "max_context_tokens": 131072,
      "context_length": 244683,
      "question_length": 35,
      "prediction": "Tik-tok products music from his breath.",
      "reference": "Musicker",
      "metrics": {
        "f1": 0.5925925925925926
      },
      "elapsed_time": 4.628596782684326,
      "timestamp": "2025-12-24T20:44:53.498942"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21356",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 60297,
      "natural_ratio": 0.46002960205078125,
      "max_context_tokens": 131072,
      "context_length": 120502,
      "question_length": 52,
      "prediction": "文本中并没有提到Ash Williams被Lord Arthur的士兵带走的原因。",
      "reference": "He is thought to work for Duke Henry.",
      "metrics": {
        "f1": 0.42553191489361697
      },
      "elapsed_time": 2.557408094406128,
      "timestamp": "2025-12-24T20:44:56.057071"
    },
    {
      "sample_id": "squad_57276e51708984140094dd5e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 636,
      "natural_ratio": 0.004852294921875,
      "max_context_tokens": 131072,
      "context_length": 1191,
      "question_length": 41,
      "prediction": "784–967",
      "reference": "784–967",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.10781097412109375,
      "timestamp": "2025-12-24T20:44:56.165199"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14611",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 72877,
      "natural_ratio": 0.5560073852539062,
      "max_context_tokens": 131072,
      "context_length": 145668,
      "question_length": 47,
      "prediction": "The Ninth Gates story involves a mysterious book that is suspected to be a forgery and is connected to supernatural events, including demonic rituals and a series of murders.",
      "reference": "it is said to have a way to call the devil, to get invincibility and immortality",
      "metrics": {
        "f1": 0.8636363636363635
      },
      "elapsed_time": 2.749227523803711,
      "timestamp": "2025-12-24T20:44:58.914797"
    },
    {
      "sample_id": "squad_57318d40a5e9cc1400cdc052",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 459,
      "natural_ratio": 0.00350189208984375,
      "max_context_tokens": 131072,
      "context_length": 854,
      "question_length": 25,
      "prediction": "科幻",
      "reference": "futuristic science fiction",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04014849662780762,
      "timestamp": "2025-12-24T20:44:58.955141"
    },
    {
      "sample_id": "squad_56e74a8e00c9c71400d76f5e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 329,
      "natural_ratio": 0.00251007080078125,
      "max_context_tokens": 131072,
      "context_length": 554,
      "question_length": 65,
      "prediction": "19世纪",
      "reference": "19th",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 0.05454111099243164,
      "timestamp": "2025-12-24T20:44:59.009840"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32106",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 110497,
      "natural_ratio": 0.8430252075195312,
      "max_context_tokens": 131072,
      "context_length": 220905,
      "question_length": 50,
      "prediction": "大内陆湖",
      "reference": "The English Channel",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.8250651359558105,
      "timestamp": "2025-12-24T20:45:02.835260"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31155",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 70909,
      "natural_ratio": 0.5409927368164062,
      "max_context_tokens": 131072,
      "context_length": 141730,
      "question_length": 49,
      "prediction": "Sera likely got back together with Ben because she realized she cared for him and didn't want to lose him, especially after his illness and her own experiences.",
      "reference": "A PHONE CALL FROM HIM.",
      "metrics": {
        "f1": 0.6829268292682926
      },
      "elapsed_time": 3.6766743659973145,
      "timestamp": "2025-12-24T20:45:06.512679"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25176",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 88717,
      "natural_ratio": 0.6768569946289062,
      "max_context_tokens": 131072,
      "context_length": 177345,
      "question_length": 49,
      "prediction": "John",
      "reference": "JOHN",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.8137028217315674,
      "timestamp": "2025-12-24T20:45:10.326775"
    },
    {
      "sample_id": "squad_5727f5f53acd2414000df0d9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 608,
      "natural_ratio": 0.004638671875,
      "max_context_tokens": 131072,
      "context_length": 1142,
      "question_length": 35,
      "prediction": "意大利文艺复兴",
      "reference": "Italian Renaissance",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06264400482177734,
      "timestamp": "2025-12-24T20:45:10.389607"
    },
    {
      "sample_id": "squad_5726b22bf1498d1400e8e7bc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 619,
      "natural_ratio": 0.00472259521484375,
      "max_context_tokens": 131072,
      "context_length": 1146,
      "question_length": 52,
      "prediction": "居民们讲英语和诺福克岛克里奥尔语（Norfuk）。",
      "reference": "English and a creole language known as Norfuk",
      "metrics": {
        "f1": 0.3
      },
      "elapsed_time": 0.21949172019958496,
      "timestamp": "2025-12-24T20:45:10.609281"
    },
    {
      "sample_id": "squad_5727bf482ca10214002d9557",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 486,
      "natural_ratio": 0.0037078857421875,
      "max_context_tokens": 131072,
      "context_length": 887,
      "question_length": 45,
      "prediction": "Rous (equestrian)",
      "reference": "Rous",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.09578347206115723,
      "timestamp": "2025-12-24T20:45:10.705226"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18867",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 90002,
      "natural_ratio": 0.6866607666015625,
      "max_context_tokens": 131072,
      "context_length": 179911,
      "question_length": 53,
      "prediction": "计算出一个复杂问题的答案",
      "reference": "Pick a train that will pass more than hers.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.2952170372009277,
      "timestamp": "2025-12-24T20:45:14.000789"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_3507",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 100633,
      "natural_ratio": 0.7677688598632812,
      "max_context_tokens": 131072,
      "context_length": 201182,
      "question_length": 45,
      "prediction": "Delta Zeta House",
      "reference": "A sorority house",
      "metrics": {
        "f1": 0.7272727272727273
      },
      "elapsed_time": 4.101528644561768,
      "timestamp": "2025-12-24T20:45:18.102747"
    },
    {
      "sample_id": "squad_5730ca8fb7151e1900c01537",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 1003,
      "natural_ratio": 0.00765228271484375,
      "max_context_tokens": 131072,
      "context_length": 1870,
      "question_length": 96,
      "prediction": "Justinian I",
      "reference": "Emperor Justinian I,",
      "metrics": {
        "f1": 0.7272727272727273
      },
      "elapsed_time": 0.07043313980102539,
      "timestamp": "2025-12-24T20:45:18.173374"
    },
    {
      "sample_id": "squad_56dfc016231d4119001abd8e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122,
      "natural_ratio": 0.0009307861328125,
      "max_context_tokens": 131072,
      "context_length": 166,
      "question_length": 38,
      "prediction": "Internet hosting services",
      "reference": "Internet hosting services",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.049551963806152344,
      "timestamp": "2025-12-24T20:45:18.223086"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14476",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 65597,
      "natural_ratio": 0.5004653930664062,
      "max_context_tokens": 131072,
      "context_length": 131128,
      "question_length": 26,
      "prediction": "Will works in a bookshop.",
      "reference": "Bookstore owner",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 2.397731304168701,
      "timestamp": "2025-12-24T20:45:20.621195"
    },
    {
      "sample_id": "squad_572ecda5cb0c0d14000f159a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 308,
      "natural_ratio": 0.002349853515625,
      "max_context_tokens": 131072,
      "context_length": 526,
      "question_length": 51,
      "prediction": "太平洋西北地区",
      "reference": "Pacific Northwest",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.058692216873168945,
      "timestamp": "2025-12-24T20:45:20.680097"
    },
    {
      "sample_id": "squad_57288f642ca10214002da472",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 592,
      "natural_ratio": 0.0045166015625,
      "max_context_tokens": 131072,
      "context_length": 1062,
      "question_length": 83,
      "prediction": "美国教育部",
      "reference": "United States Department of Education",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04642224311828613,
      "timestamp": "2025-12-24T20:45:20.726682"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_6517",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 123731,
      "natural_ratio": 0.9439926147460938,
      "max_context_tokens": 131072,
      "context_length": 247366,
      "question_length": 57,
      "prediction": "The drug dealers are arrested and taken to the police station.",
      "reference": "They are released after the Feds step in.",
      "metrics": {
        "f1": 0.7878787878787877
      },
      "elapsed_time": 4.548088073730469,
      "timestamp": "2025-12-24T20:45:25.275124"
    },
    {
      "sample_id": "squad_5706c2a60eeca41400aa0e0e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 230,
      "natural_ratio": 0.0017547607421875,
      "max_context_tokens": 131072,
      "context_length": 360,
      "question_length": 61,
      "prediction": "苏联",
      "reference": "Soviet Union",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.037444114685058594,
      "timestamp": "2025-12-24T20:45:25.313131"
    },
    {
      "sample_id": "squad_56e7b0b737bdd419002c4366",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 203,
      "natural_ratio": 0.00154876708984375,
      "max_context_tokens": 131072,
      "context_length": 321,
      "question_length": 45,
      "prediction": "Kunqu",
      "reference": "Kunqu",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.051674842834472656,
      "timestamp": "2025-12-24T20:45:25.364958"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26153",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 80921,
      "natural_ratio": 0.6173782348632812,
      "max_context_tokens": 131072,
      "context_length": 161754,
      "question_length": 48,
      "prediction": "Thorn三角形",
      "reference": "Thorn.",
      "metrics": {
        "f1": 0.7142857142857143
      },
      "elapsed_time": 3.3809423446655273,
      "timestamp": "2025-12-24T20:45:28.746291"
    },
    {
      "sample_id": "squad_57324760e17f3d14004227c7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 538,
      "natural_ratio": 0.0041046142578125,
      "max_context_tokens": 131072,
      "context_length": 973,
      "question_length": 63,
      "prediction": "1928",
      "reference": "1928",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07090878486633301,
      "timestamp": "2025-12-24T20:45:28.817397"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31116",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 49039,
      "natural_ratio": 0.37413787841796875,
      "max_context_tokens": 131072,
      "context_length": 98001,
      "question_length": 37,
      "prediction": "因为他的女儿去世了，原文中提到\"She died of heartbreak. But that has nothing to do with my story.\"表明他的女儿的去世让他心碎。",
      "reference": "Their wounded honor and the unfixabe weapon.",
      "metrics": {
        "f1": 0.5084745762711864
      },
      "elapsed_time": 1.5716784000396729,
      "timestamp": "2025-12-24T20:45:30.389422"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1776",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115597,
      "natural_ratio": 0.8819351196289062,
      "max_context_tokens": 131072,
      "context_length": 231136,
      "question_length": 18,
      "prediction": "Bourne",
      "reference": "Manheim, under the orders of Abbtott.",
      "metrics": {
        "f1": 0.5217391304347826
      },
      "elapsed_time": 3.0208537578582764,
      "timestamp": "2025-12-24T20:45:33.410601"
    },
    {
      "sample_id": "squad_57326547e99e3014001e677a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 491,
      "natural_ratio": 0.00374603271484375,
      "max_context_tokens": 131072,
      "context_length": 901,
      "question_length": 42,
      "prediction": "因立法过程中对发行债券融资该项目存在争议。",
      "reference": "issuance of bonds to finance the project",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.14031434059143066,
      "timestamp": "2025-12-24T20:45:33.551476"
    },
    {
      "sample_id": "squad_57302ef4b2c2fd1400568a15",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 314,
      "natural_ratio": 0.0023956298828125,
      "max_context_tokens": 131072,
      "context_length": 522,
      "question_length": 67,
      "prediction": "Circularly polarized antenna",
      "reference": "circularly polarized",
      "metrics": {
        "f1": 0.8333333333333333
      },
      "elapsed_time": 0.07435894012451172,
      "timestamp": "2025-12-24T20:45:33.626008"
    },
    {
      "sample_id": "squad_5729450a6aef051400154c31",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 683,
      "natural_ratio": 0.00521087646484375,
      "max_context_tokens": 131072,
      "context_length": 1262,
      "question_length": 64,
      "prediction": "Sainte-Chapelle, Paris",
      "reference": "Sainte-Chapelle",
      "metrics": {
        "f1": 0.8108108108108109
      },
      "elapsed_time": 0.09877920150756836,
      "timestamp": "2025-12-24T20:45:33.724948"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31158",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 92671,
      "natural_ratio": 0.7070236206054688,
      "max_context_tokens": 131072,
      "context_length": 185227,
      "question_length": 76,
      "prediction": "Hoth",
      "reference": "The ice planet Hoth",
      "metrics": {
        "f1": 0.42857142857142855
      },
      "elapsed_time": 4.075650215148926,
      "timestamp": "2025-12-24T20:45:37.800992"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32667",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 54589,
      "natural_ratio": 0.41648101806640625,
      "max_context_tokens": 131072,
      "context_length": 109098,
      "question_length": 41,
      "prediction": "文本中并未提到Champagne的刑期信息，因此无法回答他在联邦监狱待了多长时间。",
      "reference": "21 months",
      "metrics": {
        "f1": 0.12244897959183673
      },
      "elapsed_time": 1.7091827392578125,
      "timestamp": "2025-12-24T20:45:39.510504"
    },
    {
      "sample_id": "squad_570d6cecb3d812140066d8e1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 638,
      "natural_ratio": 0.0048675537109375,
      "max_context_tokens": 131072,
      "context_length": 1142,
      "question_length": 94,
      "prediction": "许多公里",
      "reference": "many kilometers",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.053941965103149414,
      "timestamp": "2025-12-24T20:45:39.564657"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13056",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 83977,
      "natural_ratio": 0.6406936645507812,
      "max_context_tokens": 131072,
      "context_length": 167865,
      "question_length": 49,
      "prediction": "David becomes a werewolf.",
      "reference": "A werewolf.",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 2.406998634338379,
      "timestamp": "2025-12-24T20:45:41.972048"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2040",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 64844,
      "natural_ratio": 0.494720458984375,
      "max_context_tokens": 131072,
      "context_length": 129587,
      "question_length": 62,
      "prediction": "Cadmus and Harmonia are fated to be turned into serpents.",
      "reference": "Snakes",
      "metrics": {
        "f1": 0.3478260869565218
      },
      "elapsed_time": 2.0126900672912598,
      "timestamp": "2025-12-24T20:45:43.985108"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2259",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 65049,
      "natural_ratio": 0.49628448486328125,
      "max_context_tokens": 131072,
      "context_length": 129997,
      "question_length": 62,
      "prediction": "Jennifer Jollie",
      "reference": "Randy",
      "metrics": {
        "f1": 0.2857142857142857
      },
      "elapsed_time": 2.5667459964752197,
      "timestamp": "2025-12-24T20:45:46.552173"
    },
    {
      "sample_id": "squad_56df50e896943c1400a5d383",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 160,
      "natural_ratio": 0.001220703125,
      "max_context_tokens": 131072,
      "context_length": 231,
      "question_length": 49,
      "prediction": "Nigeria",
      "reference": "Nigeria",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04622769355773926,
      "timestamp": "2025-12-24T20:45:46.598609"
    },
    {
      "sample_id": "squad_57274c04708984140094dbd5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 378,
      "natural_ratio": 0.0028839111328125,
      "max_context_tokens": 131072,
      "context_length": 658,
      "question_length": 59,
      "prediction": "与由无性繁殖的父母产生的后代在基因上完全相同。",
      "reference": "genetically identical to the parent",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1744217872619629,
      "timestamp": "2025-12-24T20:45:46.773212"
    },
    {
      "sample_id": "squad_570b66566b8089140040f931",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 498,
      "natural_ratio": 0.0037994384765625,
      "max_context_tokens": 131072,
      "context_length": 909,
      "question_length": 48,
      "prediction": "Los Angeles",
      "reference": "Los Angeles",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.046112060546875,
      "timestamp": "2025-12-24T20:45:46.819501"
    },
    {
      "sample_id": "squad_570e5b3b0dc6ce1900204f89",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 381,
      "natural_ratio": 0.00290679931640625,
      "max_context_tokens": 131072,
      "context_length": 670,
      "question_length": 52,
      "prediction": "quarter acre home and garden",
      "reference": "quarter acre home and garden",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07448148727416992,
      "timestamp": "2025-12-24T20:45:46.894150"
    },
    {
      "sample_id": "squad_570b6da3ec8fbc190045ba17",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 1154,
      "natural_ratio": 0.0088043212890625,
      "max_context_tokens": 131072,
      "context_length": 2215,
      "question_length": 53,
      "prediction": "1975",
      "reference": "1975",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07673192024230957,
      "timestamp": "2025-12-24T20:45:46.971041"
    },
    {
      "sample_id": "squad_57284ab33acd2414000df8b5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 339,
      "natural_ratio": 0.00258636474609375,
      "max_context_tokens": 131072,
      "context_length": 600,
      "question_length": 38,
      "prediction": "十六个王国被称为Mahajanapadas。",
      "reference": "Mahajanapadas",
      "metrics": {
        "f1": 0.7428571428571429
      },
      "elapsed_time": 0.11389279365539551,
      "timestamp": "2025-12-24T20:45:47.085090"
    },
    {
      "sample_id": "squad_570d3c29b3d812140066d5b0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 916,
      "natural_ratio": 0.006988525390625,
      "max_context_tokens": 131072,
      "context_length": 1728,
      "question_length": 64,
      "prediction": "8 GB",
      "reference": "8 GB",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0518488883972168,
      "timestamp": "2025-12-24T20:45:47.137093"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4074",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105521,
      "natural_ratio": 0.8050613403320312,
      "max_context_tokens": 131072,
      "context_length": 210934,
      "question_length": 69,
      "prediction": "银行",
      "reference": "Buffalo Savings Bank",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.27358078956604,
      "timestamp": "2025-12-24T20:45:50.411016"
    },
    {
      "sample_id": "squad_5727fc492ca10214002d9ab8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 528,
      "natural_ratio": 0.0040283203125,
      "max_context_tokens": 131072,
      "context_length": 942,
      "question_length": 75,
      "prediction": "英国政府",
      "reference": "British government",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.05092120170593262,
      "timestamp": "2025-12-24T20:45:50.462122"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4572",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 70686,
      "natural_ratio": 0.5392913818359375,
      "max_context_tokens": 131072,
      "context_length": 141309,
      "question_length": 23,
      "prediction": "Eyolf's injury is a shrunken leg and lameness due to an unspecified fall from a table.",
      "reference": "He is paralyzed in one of his legs.",
      "metrics": {
        "f1": 0.7804878048780487
      },
      "elapsed_time": 2.549682378768921,
      "timestamp": "2025-12-24T20:45:53.012147"
    },
    {
      "sample_id": "squad_5727d8773acd2414000dedf6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 413,
      "natural_ratio": 0.00315093994140625,
      "max_context_tokens": 131072,
      "context_length": 726,
      "question_length": 60,
      "prediction": "Aristotle",
      "reference": "Aristotle",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05913686752319336,
      "timestamp": "2025-12-24T20:45:53.071494"
    },
    {
      "sample_id": "squad_570d9d86df2f5219002ed053",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 318,
      "natural_ratio": 0.0024261474609375,
      "max_context_tokens": 131072,
      "context_length": 533,
      "question_length": 64,
      "prediction": "Modular Handgun System程序",
      "reference": "Modular Handgun System",
      "metrics": {
        "f1": 0.9565217391304348
      },
      "elapsed_time": 0.08447003364562988,
      "timestamp": "2025-12-24T20:45:53.156130"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22433",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 76049,
      "natural_ratio": 0.5802078247070312,
      "max_context_tokens": 131072,
      "context_length": 152011,
      "question_length": 48,
      "prediction": "因为根据她的父亲的意愿和安排，她与比利从小就认识并且被介绍要相爱。但是由于阅读书籍的影响，她开始怀疑对比利的感情，并认为自己可能只是出于对初恋的怀念而保留了对他的感情，而不是真正的爱情。",
      "reference": "money",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.8657026290893555,
      "timestamp": "2025-12-24T20:45:56.022164"
    },
    {
      "sample_id": "squad_56deeb2a3277331400b4d815",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 183,
      "natural_ratio": 0.00139617919921875,
      "max_context_tokens": 131072,
      "context_length": 298,
      "question_length": 28,
      "prediction": "2nd Canadian Division, the 3rd Canadian Division, the 4th Canadian Division, the 5th Canadian Division",
      "reference": "the 2nd Canadian Division, the 3rd Canadian Division, the 4th Canadian Division and the 5th Canadian Division",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.27062129974365234,
      "timestamp": "2025-12-24T20:45:56.292991"
    },
    {
      "sample_id": "squad_573012fab2c2fd140056880c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 489,
      "natural_ratio": 0.00373077392578125,
      "max_context_tokens": 131072,
      "context_length": 875,
      "question_length": 63,
      "prediction": "1,307,402",
      "reference": "1,307,402",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.12268400192260742,
      "timestamp": "2025-12-24T20:45:56.415850"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24558",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 95879,
      "natural_ratio": 0.7314987182617188,
      "max_context_tokens": 131072,
      "context_length": 191657,
      "question_length": 61,
      "prediction": "在 Penguin 尝试用他的伞枪攻击蝙蝠侠时，蝙蝠侠用一个按钮装置释放了一群蝙蝠攻击他，导致 Penguin 被蝙蝠包围并被拖走。",
      "reference": "He dies.",
      "metrics": {
        "f1": 0.125
      },
      "elapsed_time": 3.0911433696746826,
      "timestamp": "2025-12-24T20:45:59.507426"
    },
    {
      "sample_id": "squad_57266a195951b619008f71fa",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 309,
      "natural_ratio": 0.00235748291015625,
      "max_context_tokens": 131072,
      "context_length": 529,
      "question_length": 49,
      "prediction": "19世纪",
      "reference": "the 19th century",
      "metrics": {
        "f1": 0.26666666666666666
      },
      "elapsed_time": 0.06031036376953125,
      "timestamp": "2025-12-24T20:45:59.567971"
    },
    {
      "sample_id": "squad_572ebcb4c246551400ce45bc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 393,
      "natural_ratio": 0.00299835205078125,
      "max_context_tokens": 131072,
      "context_length": 696,
      "question_length": 51,
      "prediction": "一些电气性质在这一真空度下变得可观察。",
      "reference": "A number of electrical properties",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.14578032493591309,
      "timestamp": "2025-12-24T20:45:59.713932"
    },
    {
      "sample_id": "squad_572833674b864d1900164707",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 372,
      "natural_ratio": 0.002838134765625,
      "max_context_tokens": 131072,
      "context_length": 659,
      "question_length": 45,
      "prediction": "中奥陶世，即约472至461百万年前。",
      "reference": "472 to 461 million years ago",
      "metrics": {
        "f1": 0.2777777777777778
      },
      "elapsed_time": 0.19411087036132812,
      "timestamp": "2025-12-24T20:45:59.908218"
    },
    {
      "sample_id": "squad_56bfdd3fa10cfb140055134f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 264,
      "natural_ratio": 0.00201416015625,
      "max_context_tokens": 131072,
      "context_length": 453,
      "question_length": 36,
      "prediction": "Starpower: Beyoncé",
      "reference": "Starpower: Beyoncé",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08314895629882812,
      "timestamp": "2025-12-24T20:45:59.991533"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5740",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 46071,
      "natural_ratio": 0.35149383544921875,
      "max_context_tokens": 131072,
      "context_length": 92081,
      "question_length": 22,
      "prediction": "Holly lives in Fort Dupree, South Dakota.",
      "reference": "Fort Dupree, SD",
      "metrics": {
        "f1": 0.7096774193548387
      },
      "elapsed_time": 1.5505003929138184,
      "timestamp": "2025-12-24T20:46:01.542368"
    },
    {
      "sample_id": "squad_5730e504f6cb411900e24522",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 496,
      "natural_ratio": 0.0037841796875,
      "max_context_tokens": 131072,
      "context_length": 880,
      "question_length": 72,
      "prediction": "46%",
      "reference": "46%",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.059023141860961914,
      "timestamp": "2025-12-24T20:46:01.601584"
    },
    {
      "sample_id": "squad_56de7aaa4396321400ee2947",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 167,
      "natural_ratio": 0.00127410888671875,
      "max_context_tokens": 131072,
      "context_length": 247,
      "question_length": 48,
      "prediction": "Birmingham",
      "reference": "Birmingham",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.040561676025390625,
      "timestamp": "2025-12-24T20:46:01.642306"
    },
    {
      "sample_id": "squad_5727be6f3acd2414000deb3c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 365,
      "natural_ratio": 0.00278472900390625,
      "max_context_tokens": 131072,
      "context_length": 633,
      "question_length": 58,
      "prediction": "区 qū",
      "reference": "qū",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.05433225631713867,
      "timestamp": "2025-12-24T20:46:01.696796"
    },
    {
      "sample_id": "squad_57103069a58dae1900cd694a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 638,
      "natural_ratio": 0.0048675537109375,
      "max_context_tokens": 131072,
      "context_length": 1192,
      "question_length": 45,
      "prediction": "研究显示，兄弟数量与同性恋倾向之间存在关联（即亲兄弟出生顺序效应），尽管尚未找到确切的孕前机制证据，但有强有力的证据支持其孕前起源，并且研究提示这可能源于免疫学原因。",
      "reference": "this may be of immunological origin, caused by a maternal immune reaction against a substance crucial to male fetal development during pregnancy,",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.5341448783874512,
      "timestamp": "2025-12-24T20:46:02.231108"
    },
    {
      "sample_id": "squad_56dcf81e66d3e219004dab6e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 479,
      "natural_ratio": 0.00365447998046875,
      "max_context_tokens": 131072,
      "context_length": 880,
      "question_length": 39,
      "prediction": "2002年1月",
      "reference": "January 2002",
      "metrics": {
        "f1": 0.2857142857142857
      },
      "elapsed_time": 0.0949697494506836,
      "timestamp": "2025-12-24T20:46:02.326229"
    },
    {
      "sample_id": "squad_5728c9a54b864d1900164e29",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 325,
      "natural_ratio": 0.00247955322265625,
      "max_context_tokens": 131072,
      "context_length": 541,
      "question_length": 70,
      "prediction": "mid-7th to early 8th centuries",
      "reference": "mid-7th to early 8th centuries",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.12415480613708496,
      "timestamp": "2025-12-24T20:46:02.450569"
    },
    {
      "sample_id": "squad_5728f2a34b864d1900165102",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 306,
      "natural_ratio": 0.0023345947265625,
      "max_context_tokens": 131072,
      "context_length": 514,
      "question_length": 59,
      "prediction": "samurai",
      "reference": "Samurai",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04369020462036133,
      "timestamp": "2025-12-24T20:46:02.494421"
    },
    {
      "sample_id": "squad_5726d94d708984140094d37b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 510,
      "natural_ratio": 0.0038909912109375,
      "max_context_tokens": 131072,
      "context_length": 951,
      "question_length": 30,
      "prediction": "章",
      "reference": "zhāng",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.038880348205566406,
      "timestamp": "2025-12-24T20:46:02.533451"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25049",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 64612,
      "natural_ratio": 0.492950439453125,
      "max_context_tokens": 131072,
      "context_length": 129126,
      "question_length": 59,
      "prediction": "Queen Phillip说服国王宽恕加来市民。",
      "reference": "Queen Philippa",
      "metrics": {
        "f1": 0.6
      },
      "elapsed_time": 2.0052011013031006,
      "timestamp": "2025-12-24T20:46:04.539015"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17779",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 86064,
      "natural_ratio": 0.6566162109375,
      "max_context_tokens": 131072,
      "context_length": 172028,
      "question_length": 60,
      "prediction": "Delacroix",
      "reference": "Manray",
      "metrics": {
        "f1": 0.2857142857142857
      },
      "elapsed_time": 4.347479581832886,
      "timestamp": "2025-12-24T20:46:08.886906"
    },
    {
      "sample_id": "squad_5727e8484b864d1900163fc9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 540,
      "natural_ratio": 0.004119873046875,
      "max_context_tokens": 131072,
      "context_length": 993,
      "question_length": 47,
      "prediction": "Middle Ages",
      "reference": "Middle Ages",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05105733871459961,
      "timestamp": "2025-12-24T20:46:08.938157"
    },
    {
      "sample_id": "squad_5730fd7e497a881900248acd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 674,
      "natural_ratio": 0.0051422119140625,
      "max_context_tokens": 131072,
      "context_length": 1276,
      "question_length": 32,
      "prediction": "Bloomberg命名纳米比亚为非洲最佳新兴市场经济体，并且是全球第13佳的经济体。",
      "reference": "top emerging market economy in Africa",
      "metrics": {
        "f1": 0.19607843137254902
      },
      "elapsed_time": 0.24909162521362305,
      "timestamp": "2025-12-24T20:46:09.187423"
    },
    {
      "sample_id": "squad_56cf6141aab44d1400b89195",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 625,
      "natural_ratio": 0.00476837158203125,
      "max_context_tokens": 131072,
      "context_length": 1143,
      "question_length": 67,
      "prediction": "Foxy Brown",
      "reference": "Foxy Brown",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05890035629272461,
      "timestamp": "2025-12-24T20:46:09.246491"
    },
    {
      "sample_id": "squad_5726e2fcf1498d1400e8eea3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 400,
      "natural_ratio": 0.0030517578125,
      "max_context_tokens": 131072,
      "context_length": 705,
      "question_length": 55,
      "prediction": "CPC",
      "reference": "CPC",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.044916391372680664,
      "timestamp": "2025-12-24T20:46:09.291559"
    },
    {
      "sample_id": "squad_5726252189a1e219009ac35c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 533,
      "natural_ratio": 0.00406646728515625,
      "max_context_tokens": 131072,
      "context_length": 980,
      "question_length": 46,
      "prediction": "The Crown Estate",
      "reference": "Crown Estate",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 0.055208444595336914,
      "timestamp": "2025-12-24T20:46:09.346917"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29014",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 77203,
      "natural_ratio": 0.5890121459960938,
      "max_context_tokens": 131072,
      "context_length": 154337,
      "question_length": 30,
      "prediction": "Iailor成为了Emilia的体guard。",
      "reference": "Arcite was Emilia's bodyguard",
      "metrics": {
        "f1": 0.5882352941176471
      },
      "elapsed_time": 2.9656620025634766,
      "timestamp": "2025-12-24T20:46:12.312904"
    },
    {
      "sample_id": "squad_572958236aef051400154d25",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 436,
      "natural_ratio": 0.003326416015625,
      "max_context_tokens": 131072,
      "context_length": 764,
      "question_length": 69,
      "prediction": "软件开发过程",
      "reference": "software development process",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.05970001220703125,
      "timestamp": "2025-12-24T20:46:12.372805"
    },
    {
      "sample_id": "squad_56cff665234ae51400d9c189",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 212,
      "natural_ratio": 0.001617431640625,
      "max_context_tokens": 131072,
      "context_length": 346,
      "question_length": 38,
      "prediction": "54 million",
      "reference": "54 million",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05747509002685547,
      "timestamp": "2025-12-24T20:46:12.430462"
    },
    {
      "sample_id": "squad_56f78825aef2371900625b98",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 414,
      "natural_ratio": 0.0031585693359375,
      "max_context_tokens": 131072,
      "context_length": 727,
      "question_length": 62,
      "prediction": " Federated States of Micronesia",
      "reference": "the Federated States of Micronesia",
      "metrics": {
        "f1": 0.962962962962963
      },
      "elapsed_time": 0.09547042846679688,
      "timestamp": "2025-12-24T20:46:12.526097"
    },
    {
      "sample_id": "squad_57277da6dd62a815002e9e88",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 343,
      "natural_ratio": 0.00261688232421875,
      "max_context_tokens": 131072,
      "context_length": 535,
      "question_length": 112,
      "prediction": "Russian and Austrian forces",
      "reference": "Russian and Austrian",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 0.0641329288482666,
      "timestamp": "2025-12-24T20:46:12.590387"
    },
    {
      "sample_id": "squad_5706046752bb8914006897aa",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 259,
      "natural_ratio": 0.00197601318359375,
      "max_context_tokens": 131072,
      "context_length": 430,
      "question_length": 48,
      "prediction": "通常，迁徙的鸟类不会遵循直线路径，而是可能会沿着钩形或弧形线路迁徙，绕过地理障碍。",
      "reference": "an hooked or arched line",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.3070533275604248,
      "timestamp": "2025-12-24T20:46:12.897657"
    },
    {
      "sample_id": "squad_57267de2dd62a815002e8712",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 376,
      "natural_ratio": 0.00286865234375,
      "max_context_tokens": 131072,
      "context_length": 657,
      "question_length": 56,
      "prediction": "Orestis Laskos",
      "reference": "Orestis Laskos",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08636593818664551,
      "timestamp": "2025-12-24T20:46:12.984205"
    },
    {
      "sample_id": "squad_5726f8735951b619008f83a5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 597,
      "natural_ratio": 0.00455474853515625,
      "max_context_tokens": 131072,
      "context_length": 1098,
      "question_length": 57,
      "prediction": "540",
      "reference": "540",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.055632591247558594,
      "timestamp": "2025-12-24T20:46:13.040001"
    },
    {
      "sample_id": "squad_57277ff9dd62a815002e9ed7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 382,
      "natural_ratio": 0.0029144287109375,
      "max_context_tokens": 131072,
      "context_length": 673,
      "question_length": 52,
      "prediction": "中国",
      "reference": "China",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.03401970863342285,
      "timestamp": "2025-12-24T20:46:13.074171"
    },
    {
      "sample_id": "squad_56e720f137bdd419002c3d78",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 273,
      "natural_ratio": 0.00208282470703125,
      "max_context_tokens": 131072,
      "context_length": 412,
      "question_length": 94,
      "prediction": "roughly",
      "reference": "equal",
      "metrics": {
        "f1": 0.3333333333333333
      },
      "elapsed_time": 0.04036879539489746,
      "timestamp": "2025-12-24T20:46:13.114691"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13879",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101480,
      "natural_ratio": 0.77423095703125,
      "max_context_tokens": 131072,
      "context_length": 202883,
      "question_length": 37,
      "prediction": "贝克使用血液来引诱那东西。",
      "reference": "Beck uses a pint of his own blood.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.04998517036438,
      "timestamp": "2025-12-24T20:46:16.165034"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24442",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 48274,
      "natural_ratio": 0.3683013916015625,
      "max_context_tokens": 131072,
      "context_length": 96447,
      "question_length": 61,
      "prediction": "Nello takes on the job of selling milk and receiving payment for it, then bringing the coins back to their home.",
      "reference": "Selling milk",
      "metrics": {
        "f1": 0.19354838709677416
      },
      "elapsed_time": 1.4681050777435303,
      "timestamp": "2025-12-24T20:46:17.633501"
    },
    {
      "sample_id": "squad_57266e09dd62a815002e848c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 371,
      "natural_ratio": 0.00283050537109375,
      "max_context_tokens": 131072,
      "context_length": 673,
      "question_length": 29,
      "prediction": "Último Dragón",
      "reference": "Último Dragón",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0782623291015625,
      "timestamp": "2025-12-24T20:46:17.711945"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7020",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 25808,
      "natural_ratio": 0.1968994140625,
      "max_context_tokens": 131072,
      "context_length": 51524,
      "question_length": 53,
      "prediction": "lower regions",
      "reference": "Hades.",
      "metrics": {
        "f1": 0.25
      },
      "elapsed_time": 0.5995392799377441,
      "timestamp": "2025-12-24T20:46:18.311727"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25181",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 87986,
      "natural_ratio": 0.6712799072265625,
      "max_context_tokens": 131072,
      "context_length": 175891,
      "question_length": 42,
      "prediction": "Zoe is an escort during the night.",
      "reference": "She is a prostitute",
      "metrics": {
        "f1": 0.7407407407407406
      },
      "elapsed_time": 2.518711805343628,
      "timestamp": "2025-12-24T20:46:20.830822"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8650",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93910,
      "natural_ratio": 0.7164764404296875,
      "max_context_tokens": 131072,
      "context_length": 187723,
      "question_length": 58,
      "prediction": "六天之后",
      "reference": "2 Years",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.0214004516601562,
      "timestamp": "2025-12-24T20:46:23.852621"
    },
    {
      "sample_id": "squad_56fa317bf34c681400b0c002",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 330,
      "natural_ratio": 0.0025177001953125,
      "max_context_tokens": 131072,
      "context_length": 551,
      "question_length": 70,
      "prediction": "young trees grow more quickly in open stands.",
      "reference": "open stands",
      "metrics": {
        "f1": 0.39285714285714285
      },
      "elapsed_time": 0.11964106559753418,
      "timestamp": "2025-12-24T20:46:23.972459"
    },
    {
      "sample_id": "squad_56cfb40b234ae51400d9bea5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 375,
      "natural_ratio": 0.00286102294921875,
      "max_context_tokens": 131072,
      "context_length": 635,
      "question_length": 76,
      "prediction": "学习投资",
      "reference": "learning investments",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04430341720581055,
      "timestamp": "2025-12-24T20:46:24.016925"
    },
    {
      "sample_id": "squad_5733b4cf4776f419006610cb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 646,
      "natural_ratio": 0.0049285888671875,
      "max_context_tokens": 131072,
      "context_length": 1200,
      "question_length": 52,
      "prediction": "penicillins 和 cephalosporins",
      "reference": "penicillins and cephalosporins",
      "metrics": {
        "f1": 0.9230769230769231
      },
      "elapsed_time": 0.1291515827178955,
      "timestamp": "2025-12-24T20:46:24.146240"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16559",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 108265,
      "natural_ratio": 0.8259963989257812,
      "max_context_tokens": 131072,
      "context_length": 216420,
      "question_length": 70,
      "prediction": "Dr. Vladimir Nechorvich was bringing a small, square shiny object contained in a small plastic container marked 'S.G.' with him to give to the IMF.",
      "reference": "He was going to deliver a new biochemical weapon Chimera and a cure for it, Bellerophon.",
      "metrics": {
        "f1": 0.9019607843137255
      },
      "elapsed_time": 3.6849405765533447,
      "timestamp": "2025-12-24T20:46:27.831547"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19326",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115653,
      "natural_ratio": 0.8823623657226562,
      "max_context_tokens": 131072,
      "context_length": 231219,
      "question_length": 47,
      "prediction": "因为Graham拒绝与Ann发生关系，并且Ann认为Graham仍然在隐瞒过去的事情，所以导致了他们对Graham的愤怒。",
      "reference": "When Graham tells them he feels normal compared to them in his sex life",
      "metrics": {
        "f1": 0.20338983050847456
      },
      "elapsed_time": 3.235482692718506,
      "timestamp": "2025-12-24T20:46:31.067780"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14044",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93039,
      "natural_ratio": 0.7098312377929688,
      "max_context_tokens": 131072,
      "context_length": 185986,
      "question_length": 53,
      "prediction": "Crim Tartary",
      "reference": "CRIM TARTARY",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.206207275390625,
      "timestamp": "2025-12-24T20:46:34.274700"
    },
    {
      "sample_id": "squad_572833d43acd2414000df6de",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 316,
      "natural_ratio": 0.002410888671875,
      "max_context_tokens": 131072,
      "context_length": 564,
      "question_length": 29,
      "prediction": "The Federal War ended in 1863 with the signing of the Treaty of Coche.",
      "reference": "with the signing of the Treaty of Coche by both the centralist government of the time and the Federal Forces",
      "metrics": {
        "f1": 0.8095238095238095
      },
      "elapsed_time": 0.22941994667053223,
      "timestamp": "2025-12-24T20:46:34.504321"
    },
    {
      "sample_id": "squad_5727c9722ca10214002d9634",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 450,
      "natural_ratio": 0.0034332275390625,
      "max_context_tokens": 131072,
      "context_length": 796,
      "question_length": 65,
      "prediction": "Thunderbolt",
      "reference": "Thunderbolt",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04557085037231445,
      "timestamp": "2025-12-24T20:46:34.550056"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16180",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96838,
      "natural_ratio": 0.7388153076171875,
      "max_context_tokens": 131072,
      "context_length": 193565,
      "question_length": 71,
      "prediction": "女王的王室身份和期望与康恩的生活方式和价值观形成对比，导致了他们之间的关系复杂化。",
      "reference": "She has queen duites in her country while the hillemen engange in robbery of her domain",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.438176155090332,
      "timestamp": "2025-12-24T20:46:37.988612"
    },
    {
      "sample_id": "squad_5728f1a1af94a219006a9e1d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 409,
      "natural_ratio": 0.00312042236328125,
      "max_context_tokens": 131072,
      "context_length": 738,
      "question_length": 41,
      "prediction": "从1854年开始，萨摩军和海军进行了现代化。",
      "reference": "1854",
      "metrics": {
        "f1": 0.3076923076923077
      },
      "elapsed_time": 0.18488049507141113,
      "timestamp": "2025-12-24T20:46:38.173718"
    },
    {
      "sample_id": "squad_57284911ff5b5019007da0e9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 502,
      "natural_ratio": 0.0038299560546875,
      "max_context_tokens": 131072,
      "context_length": 918,
      "question_length": 46,
      "prediction": "Jainism and Buddhism",
      "reference": "Jainism and Buddhism",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0773470401763916,
      "timestamp": "2025-12-24T20:46:38.251237"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1756",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115638,
      "natural_ratio": 0.8822479248046875,
      "max_context_tokens": 131072,
      "context_length": 231136,
      "question_length": 101,
      "prediction": "文本中并未提到玛丽的兄弟或玛丽和伯恩在法国乡下投奔的是玛丽的兄弟或兄弟夫人的信息。根据提供的剧本内容，他们是在法国乡下的某个地方遇到了教授，并且教授骑摩托车接了他们。所以无法从给定的信息中得出玛丽的兄弟或兄弟夫人的名字。",
      "reference": "Eamon",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.850151777267456,
      "timestamp": "2025-12-24T20:46:39.101735"
    },
    {
      "sample_id": "squad_573227fbe17f3d14004226cb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 534,
      "natural_ratio": 0.0040740966796875,
      "max_context_tokens": 131072,
      "context_length": 949,
      "question_length": 80,
      "prediction": "1994",
      "reference": "1994",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07120442390441895,
      "timestamp": "2025-12-24T20:46:39.173483"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22349",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 92148,
      "natural_ratio": 0.703033447265625,
      "max_context_tokens": 131072,
      "context_length": 184176,
      "question_length": 81,
      "prediction": "两颗耳朵",
      "reference": "His is missing his ear",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.162071943283081,
      "timestamp": "2025-12-24T20:46:43.335930"
    },
    {
      "sample_id": "squad_5730ea7df6cb411900e2454e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 388,
      "natural_ratio": 0.002960205078125,
      "max_context_tokens": 131072,
      "context_length": 701,
      "question_length": 36,
      "prediction": "The Princess Margaret Hospital",
      "reference": "Princess Margaret Hospital",
      "metrics": {
        "f1": 0.9285714285714286
      },
      "elapsed_time": 0.07301592826843262,
      "timestamp": "2025-12-24T20:46:43.409180"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17081",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 112447,
      "natural_ratio": 0.8579025268554688,
      "max_context_tokens": 131072,
      "context_length": 224804,
      "question_length": 51,
      "prediction": "FAMILIAR",
      "reference": "Familiars",
      "metrics": {
        "f1": 0.923076923076923
      },
      "elapsed_time": 3.643341064453125,
      "timestamp": "2025-12-24T20:46:47.052867"
    },
    {
      "sample_id": "squad_572fe8d0b2c2fd14005685b9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 410,
      "natural_ratio": 0.0031280517578125,
      "max_context_tokens": 131072,
      "context_length": 725,
      "question_length": 56,
      "prediction": "降解多种有机化合物的能力",
      "reference": "ability of bacteria to degrade a variety of organic compounds",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.09146499633789062,
      "timestamp": "2025-12-24T20:46:47.144874"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17996",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 87012,
      "natural_ratio": 0.663848876953125,
      "max_context_tokens": 131072,
      "context_length": 173890,
      "question_length": 95,
      "prediction": "Sarah uses her machete to chop the zombie's head off when it bites Miguel's arm.",
      "reference": "She amputates his arm and cautorizes it to stop the spread of infection.  ",
      "metrics": {
        "f1": 0.878048780487805
      },
      "elapsed_time": 3.761756420135498,
      "timestamp": "2025-12-24T20:46:50.906991"
    },
    {
      "sample_id": "squad_5722caddf6b826140030fc32",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 584,
      "natural_ratio": 0.00445556640625,
      "max_context_tokens": 131072,
      "context_length": 1092,
      "question_length": 36,
      "prediction": "Sir John Conroy",
      "reference": "Sir John Conroy",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07497239112854004,
      "timestamp": "2025-12-24T20:46:50.982192"
    },
    {
      "sample_id": "squad_572e8d5bc246551400ce4335",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 318,
      "natural_ratio": 0.0024261474609375,
      "max_context_tokens": 131072,
      "context_length": 558,
      "question_length": 38,
      "prediction": "他的儿子工作在软件方面。",
      "reference": "software",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.09534811973571777,
      "timestamp": "2025-12-24T20:46:51.077724"
    },
    {
      "sample_id": "squad_5733797b4776f41900660b6d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 617,
      "natural_ratio": 0.00470733642578125,
      "max_context_tokens": 131072,
      "context_length": 1082,
      "question_length": 113,
      "prediction": "全球金融危机。",
      "reference": "global downturn",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.05583691596984863,
      "timestamp": "2025-12-24T20:46:51.133728"
    },
    {
      "sample_id": "squad_5731f87ae17f3d1400422593",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 366,
      "natural_ratio": 0.0027923583984375,
      "max_context_tokens": 131072,
      "context_length": 623,
      "question_length": 70,
      "prediction": "Two",
      "reference": "Two",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.034708499908447266,
      "timestamp": "2025-12-24T20:46:51.168592"
    },
    {
      "sample_id": "squad_57266a2e708984140094c547",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 314,
      "natural_ratio": 0.0023956298828125,
      "max_context_tokens": 131072,
      "context_length": 535,
      "question_length": 53,
      "prediction": "Nine Years' War",
      "reference": "Nine Years' War",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06358885765075684,
      "timestamp": "2025-12-24T20:46:51.232337"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28027",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 89426,
      "natural_ratio": 0.6822662353515625,
      "max_context_tokens": 131072,
      "context_length": 178776,
      "question_length": 36,
      "prediction": "Zoe was expecting twins.",
      "reference": "Two.",
      "metrics": {
        "f1": 0.4210526315789474
      },
      "elapsed_time": 2.9396519660949707,
      "timestamp": "2025-12-24T20:46:54.172327"
    },
    {
      "sample_id": "squad_56f7eb38a6d7ea1400e172c3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 249,
      "natural_ratio": 0.00189971923828125,
      "max_context_tokens": 131072,
      "context_length": 370,
      "question_length": 88,
      "prediction": "欺诈行为、腐败行为、胁迫行为",
      "reference": "fraudulent conduct",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.11500811576843262,
      "timestamp": "2025-12-24T20:46:54.287538"
    },
    {
      "sample_id": "squad_5726b6285951b619008f7b80",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 329,
      "natural_ratio": 0.00251007080078125,
      "max_context_tokens": 131072,
      "context_length": 582,
      "question_length": 36,
      "prediction": "Paul VI",
      "reference": "Paul VI",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.056487321853637695,
      "timestamp": "2025-12-24T20:46:54.344199"
    },
    {
      "sample_id": "squad_570cfa75fed7b91900d45b93",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 368,
      "natural_ratio": 0.0028076171875,
      "max_context_tokens": 131072,
      "context_length": 635,
      "question_length": 62,
      "prediction": "智能手机和平板电脑",
      "reference": "iPhone and iPad",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06398415565490723,
      "timestamp": "2025-12-24T20:46:54.408344"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_6300",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 76951,
      "natural_ratio": 0.5870895385742188,
      "max_context_tokens": 131072,
      "context_length": 153821,
      "question_length": 41,
      "prediction": "因为她需要钱，所以她做了绑架自己来获取注意的行为。",
      "reference": "Bunny is one of Jackie's porn stars. ",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.6806819438934326,
      "timestamp": "2025-12-24T20:46:58.089391"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9956",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 103742,
      "natural_ratio": 0.7914886474609375,
      "max_context_tokens": 131072,
      "context_length": 207400,
      "question_length": 45,
      "prediction": "RosebeggedDavidnottosayanythingtoherfamily.",
      "reference": "Don't tell anyone.",
      "metrics": {
        "f1": 0.6206896551724137
      },
      "elapsed_time": 5.150616884231567,
      "timestamp": "2025-12-24T20:47:03.240423"
    },
    {
      "sample_id": "squad_571aa5df10f8ca140030525d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 1394,
      "natural_ratio": 0.0106353759765625,
      "max_context_tokens": 131072,
      "context_length": 2702,
      "question_length": 47,
      "prediction": "Athanasius' innocence was reaffirmed.",
      "reference": "his innocence reaffirmed",
      "metrics": {
        "f1": 0.8387096774193548
      },
      "elapsed_time": 0.13881158828735352,
      "timestamp": "2025-12-24T20:47:03.379427"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15924",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 54117,
      "natural_ratio": 0.41287994384765625,
      "max_context_tokens": 131072,
      "context_length": 108147,
      "question_length": 48,
      "prediction": "Captain Archbold of the Sephora arrives in the morning asking about Leggatt.",
      "reference": "The other ship's captain",
      "metrics": {
        "f1": 0.6857142857142856
      },
      "elapsed_time": 1.5543463230133057,
      "timestamp": "2025-12-24T20:47:04.934121"
    },
    {
      "sample_id": "squad_571a8edb10f8ca1400305135",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 868,
      "natural_ratio": 0.006622314453125,
      "max_context_tokens": 131072,
      "context_length": 1648,
      "question_length": 48,
      "prediction": "东罗马帝国，特别是埃及和巴勒斯坦地区。",
      "reference": "in the East",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.15557289123535156,
      "timestamp": "2025-12-24T20:47:05.089897"
    },
    {
      "sample_id": "squad_5726806a5951b619008f74e0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 317,
      "natural_ratio": 0.00241851806640625,
      "max_context_tokens": 131072,
      "context_length": 527,
      "question_length": 67,
      "prediction": "Brunswick Dock",
      "reference": "Brunswick Dock",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05533266067504883,
      "timestamp": "2025-12-24T20:47:05.145390"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31169",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 92667,
      "natural_ratio": 0.7069931030273438,
      "max_context_tokens": 131072,
      "context_length": 185227,
      "question_length": 68,
      "prediction": "hyperdrive",
      "reference": "Hyperdrive",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.18360567092895508,
      "timestamp": "2025-12-24T20:47:05.329292"
    },
    {
      "sample_id": "squad_572788e25951b619008f8cac",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 463,
      "natural_ratio": 0.00353240966796875,
      "max_context_tokens": 131072,
      "context_length": 862,
      "question_length": 25,
      "prediction": "Leaves gather sunlight and carry out photosynthesis.",
      "reference": "gather sunlight",
      "metrics": {
        "f1": 0.44776119402985076
      },
      "elapsed_time": 0.1276693344116211,
      "timestamp": "2025-12-24T20:47:05.457125"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25005",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 81159,
      "natural_ratio": 0.6191940307617188,
      "max_context_tokens": 131072,
      "context_length": 162192,
      "question_length": 87,
      "prediction": "蚊子",
      "reference": "Mosquitoes.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.4242818355560303,
      "timestamp": "2025-12-24T20:47:08.881761"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23635",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105579,
      "natural_ratio": 0.8055038452148438,
      "max_context_tokens": 131072,
      "context_length": 211032,
      "question_length": 86,
      "prediction": "Lester invites George to escort Jill to the Republican Party election night soiree.",
      "reference": "Jackie",
      "metrics": {
        "f1": 0.3846153846153846
      },
      "elapsed_time": 5.338812351226807,
      "timestamp": "2025-12-24T20:47:14.220963"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15773",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 53074,
      "natural_ratio": 0.4049224853515625,
      "max_context_tokens": 131072,
      "context_length": 106051,
      "question_length": 57,
      "prediction": "Mariachi被迫停止携带吉他箱，并且隐藏它直到情况结束。",
      "reference": "He is forced to kill the hit men and hide in a bar. ",
      "metrics": {
        "f1": 0.2608695652173913
      },
      "elapsed_time": 1.631789207458496,
      "timestamp": "2025-12-24T20:47:15.853083"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2279",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 65029,
      "natural_ratio": 0.49613189697265625,
      "max_context_tokens": 131072,
      "context_length": 129997,
      "question_length": 21,
      "prediction": "Sidney是一名幸存者，也是电影《Scream》系列中的角色，她是一位作家，专门撰写关于伍德斯托克谋杀案的书籍。",
      "reference": "A call center worker at a crisis for abused women",
      "metrics": {
        "f1": 0.26865671641791045
      },
      "elapsed_time": 0.4890275001525879,
      "timestamp": "2025-12-24T20:47:16.342427"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2635",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 100378,
      "natural_ratio": 0.7658233642578125,
      "max_context_tokens": 131072,
      "context_length": 200651,
      "question_length": 66,
      "prediction": "Willard告诉Chef如果他们不能返回，他会安排将Clean埋在河里。",
      "reference": "Call for an airstrike",
      "metrics": {
        "f1": 0.36363636363636365
      },
      "elapsed_time": 5.103848934173584,
      "timestamp": "2025-12-24T20:47:21.446663"
    },
    {
      "sample_id": "squad_572a213aaf94a219006aa81c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 1161,
      "natural_ratio": 0.00885772705078125,
      "max_context_tokens": 131072,
      "context_length": 2191,
      "question_length": 91,
      "prediction": "通过文化扩散和人群迁移的结合。",
      "reference": "cultural diffusion and migration of peoples",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1324141025543213,
      "timestamp": "2025-12-24T20:47:21.579277"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16890",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 95045,
      "natural_ratio": 0.7251358032226562,
      "max_context_tokens": 131072,
      "context_length": 189986,
      "question_length": 64,
      "prediction": "地下隧道中的敌方指挥控制中心",
      "reference": "command module",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.2615528106689453,
      "timestamp": "2025-12-24T20:47:24.841288"
    },
    {
      "sample_id": "squad_572ed29703f9891900756a55",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 492,
      "natural_ratio": 0.003753662109375,
      "max_context_tokens": 131072,
      "context_length": 893,
      "question_length": 51,
      "prediction": "Kilauea",
      "reference": "Kilauea",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07084941864013672,
      "timestamp": "2025-12-24T20:47:24.912331"
    },
    {
      "sample_id": "squad_570feffa80d9841400ab3743",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 299,
      "natural_ratio": 0.00228118896484375,
      "max_context_tokens": 131072,
      "context_length": 526,
      "question_length": 33,
      "prediction": "Luis Monge",
      "reference": "Luis Monge",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06478261947631836,
      "timestamp": "2025-12-24T20:47:24.977280"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7816",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 79430,
      "natural_ratio": 0.6060028076171875,
      "max_context_tokens": 131072,
      "context_length": 158785,
      "question_length": 35,
      "prediction": "Dick Laurent is a porn actor.",
      "reference": "Amateur porn producer",
      "metrics": {
        "f1": 0.7857142857142857
      },
      "elapsed_time": 3.409165620803833,
      "timestamp": "2025-12-24T20:47:28.386803"
    },
    {
      "sample_id": "squad_56e24abacd28a01900c67c30",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 168,
      "natural_ratio": 0.00128173828125,
      "max_context_tokens": 131072,
      "context_length": 255,
      "question_length": 41,
      "prediction": "Estonian",
      "reference": "Estonian",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05544328689575195,
      "timestamp": "2025-12-24T20:47:28.442443"
    },
    {
      "sample_id": "squad_572a95fe34ae481900deab9d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 358,
      "natural_ratio": 0.0027313232421875,
      "max_context_tokens": 131072,
      "context_length": 639,
      "question_length": 37,
      "prediction": "其顾问是内阁成员。",
      "reference": "his or her ministers",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.08456802368164062,
      "timestamp": "2025-12-24T20:47:28.527177"
    },
    {
      "sample_id": "squad_57312ede497a881900248c31",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 378,
      "natural_ratio": 0.0028839111328125,
      "max_context_tokens": 131072,
      "context_length": 679,
      "question_length": 37,
      "prediction": "明 loyalists, pretenders and rebels",
      "reference": "Ming loyalists",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.10429787635803223,
      "timestamp": "2025-12-24T20:47:28.631643"
    },
    {
      "sample_id": "squad_570e43e60dc6ce1900204ef7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 448,
      "natural_ratio": 0.00341796875,
      "max_context_tokens": 131072,
      "context_length": 798,
      "question_length": 58,
      "prediction": "加拿大",
      "reference": "Canada",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.0350947380065918,
      "timestamp": "2025-12-24T20:47:28.666893"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13931",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 116678,
      "natural_ratio": 0.8901824951171875,
      "max_context_tokens": 131072,
      "context_length": 233233,
      "question_length": 83,
      "prediction": "Lorkowski Cleaning",
      "reference": "An insurance company.",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 3.601198434829712,
      "timestamp": "2025-12-24T20:47:32.268424"
    },
    {
      "sample_id": "squad_5726a6d05951b619008f7928",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 561,
      "natural_ratio": 0.00428009033203125,
      "max_context_tokens": 131072,
      "context_length": 1045,
      "question_length": 37,
      "prediction": "纯金属中包含不必要的杂质。",
      "reference": "unwanted impurities",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1013174057006836,
      "timestamp": "2025-12-24T20:47:32.370300"
    },
    {
      "sample_id": "squad_5727f3593acd2414000df0b2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 630,
      "natural_ratio": 0.0048065185546875,
      "max_context_tokens": 131072,
      "context_length": 1149,
      "question_length": 72,
      "prediction": "Medill School of Journalism",
      "reference": "Medill School of Journalism",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0791769027709961,
      "timestamp": "2025-12-24T20:47:32.449652"
    },
    {
      "sample_id": "squad_57325f03b9d445190005eabf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 411,
      "natural_ratio": 0.00313568115234375,
      "max_context_tokens": 131072,
      "context_length": 730,
      "question_length": 52,
      "prediction": "49.8%",
      "reference": "42.6%",
      "metrics": {
        "f1": 0.6
      },
      "elapsed_time": 0.07536816596984863,
      "timestamp": "2025-12-24T20:47:32.525182"
    },
    {
      "sample_id": "squad_56e145b6e3433e1400422d2c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 278,
      "natural_ratio": 0.0021209716796875,
      "max_context_tokens": 131072,
      "context_length": 444,
      "question_length": 73,
      "prediction": "10%",
      "reference": "10%",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.053433895111083984,
      "timestamp": "2025-12-24T20:47:32.578766"
    },
    {
      "sample_id": "squad_5726b2165951b619008f7ad6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 445,
      "natural_ratio": 0.00339508056640625,
      "max_context_tokens": 131072,
      "context_length": 820,
      "question_length": 31,
      "prediction": "523",
      "reference": "523 scheduled monuments",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 0.05495333671569824,
      "timestamp": "2025-12-24T20:47:32.633886"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2625",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 100373,
      "natural_ratio": 0.7657852172851562,
      "max_context_tokens": 131072,
      "context_length": 200651,
      "question_length": 56,
      "prediction": "是的，观众知道库尔兹在离开时说的最后一句话。",
      "reference": "The audience does not know the final words of Kurtz. ",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.33271336555480957,
      "timestamp": "2025-12-24T20:47:32.966906"
    },
    {
      "sample_id": "squad_5726b597f1498d1400e8e850",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 379,
      "natural_ratio": 0.00289154052734375,
      "max_context_tokens": 131072,
      "context_length": 651,
      "question_length": 68,
      "prediction": "Encyclopedia Britannica defined literature in its 1911 editions as \"the best expression of the best thought reduced to writing.\"",
      "reference": "\"the best expression of the best thought reduced to writing.\"",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.2967348098754883,
      "timestamp": "2025-12-24T20:47:33.263802"
    },
    {
      "sample_id": "squad_5733cf984776f419006612ae",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 346,
      "natural_ratio": 0.0026397705078125,
      "max_context_tokens": 131072,
      "context_length": 522,
      "question_length": 131,
      "prediction": "婚姻",
      "reference": "marriage",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.03445148468017578,
      "timestamp": "2025-12-24T20:47:33.298397"
    },
    {
      "sample_id": "squad_571a7fd610f8ca14003050b3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 615,
      "natural_ratio": 0.00469207763671875,
      "max_context_tokens": 131072,
      "context_length": 1137,
      "question_length": 53,
      "prediction": "Hermann Emil Fischer and Joseph von Mering",
      "reference": "Hermann Emil Fischer and Joseph von Mering",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.11858749389648438,
      "timestamp": "2025-12-24T20:47:33.417134"
    },
    {
      "sample_id": "squad_56db42dee7c41114004b4fd6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 510,
      "natural_ratio": 0.0038909912109375,
      "max_context_tokens": 131072,
      "context_length": 937,
      "question_length": 43,
      "prediction": "\"Death Star\"",
      "reference": "the Death Star",
      "metrics": {
        "f1": 0.9411764705882353
      },
      "elapsed_time": 0.0712127685546875,
      "timestamp": "2025-12-24T20:47:33.488561"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19674",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 103549,
      "natural_ratio": 0.7900161743164062,
      "max_context_tokens": 131072,
      "context_length": 207032,
      "question_length": 26,
      "prediction": "Rath",
      "reference": "Nicolai",
      "metrics": {
        "f1": 0.2
      },
      "elapsed_time": 3.1785671710968018,
      "timestamp": "2025-12-24T20:47:36.667575"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17699",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 86932,
      "natural_ratio": 0.663238525390625,
      "max_context_tokens": 131072,
      "context_length": 173807,
      "question_length": 17,
      "prediction": "Auroch是一种已灭绝的大型野牛，生活在欧洲、亚洲和北非的草原上，是现代家牛的祖先。",
      "reference": "A prehistoric creature",
      "metrics": {
        "f1": 0.2448979591836735
      },
      "elapsed_time": 2.4220495223999023,
      "timestamp": "2025-12-24T20:47:39.089993"
    },
    {
      "sample_id": "squad_573405584776f41900661711",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 419,
      "natural_ratio": 0.00319671630859375,
      "max_context_tokens": 131072,
      "context_length": 716,
      "question_length": 82,
      "prediction": "1974革命及随后的经济全球化",
      "reference": "1974 revolution and the consequent economic globalization",
      "metrics": {
        "f1": 0.2051282051282051
      },
      "elapsed_time": 0.1281750202178955,
      "timestamp": "2025-12-24T20:47:39.218363"
    },
    {
      "sample_id": "squad_56e078cb7aa994140058e533",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 197,
      "natural_ratio": 0.00150299072265625,
      "max_context_tokens": 131072,
      "context_length": 318,
      "question_length": 37,
      "prediction": "每个铜币的价值是半便士（halfpenny）。",
      "reference": "a halfpenny",
      "metrics": {
        "f1": 0.5161290322580644
      },
      "elapsed_time": 0.16755151748657227,
      "timestamp": "2025-12-24T20:47:39.386129"
    },
    {
      "sample_id": "squad_57268a8d708984140094c976",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 480,
      "natural_ratio": 0.003662109375,
      "max_context_tokens": 131072,
      "context_length": 868,
      "question_length": 52,
      "prediction": "词序",
      "reference": "word order",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04661107063293457,
      "timestamp": "2025-12-24T20:47:39.432920"
    },
    {
      "sample_id": "squad_57303cafb2c2fd1400568aa9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 449,
      "natural_ratio": 0.00342559814453125,
      "max_context_tokens": 131072,
      "context_length": 751,
      "question_length": 108,
      "prediction": "Median",
      "reference": "The Median",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.035346269607543945,
      "timestamp": "2025-12-24T20:47:39.468426"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5324",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 124398,
      "natural_ratio": 0.9490814208984375,
      "max_context_tokens": 131072,
      "context_length": 248722,
      "question_length": 34,
      "prediction": "因为洛杉矶在故事设定中经历了一场巨大的地震，导致与外界的陆地连接被切断，变成了一个岛屿。",
      "reference": "Because of an earthquake.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.469070196151733,
      "timestamp": "2025-12-24T20:47:43.937885"
    },
    {
      "sample_id": "squad_57326d6ce99e3014001e67af",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 330,
      "natural_ratio": 0.0025177001953125,
      "max_context_tokens": 131072,
      "context_length": 542,
      "question_length": 78,
      "prediction": "Columbine II",
      "reference": "Columbine II",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07038354873657227,
      "timestamp": "2025-12-24T20:47:44.008804"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29457",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 45589,
      "natural_ratio": 0.34781646728515625,
      "max_context_tokens": 131072,
      "context_length": 91086,
      "question_length": 52,
      "prediction": "Molyneuz described Beaucaire's swordsmanship as a revelation of skill, noting that despite holding only a light dress sword, Beaucaire managed to cut through his adversaries and create space for himself, even as the ring closed around him.",
      "reference": "He compared it to that of a gentleman.",
      "metrics": {
        "f1": 0.7727272727272727
      },
      "elapsed_time": 1.711404800415039,
      "timestamp": "2025-12-24T20:47:45.720567"
    },
    {
      "sample_id": "squad_56eaa9720030b61400a35015",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 338,
      "natural_ratio": 0.0025787353515625,
      "max_context_tokens": 131072,
      "context_length": 537,
      "question_length": 100,
      "prediction": "An unholy alliance",
      "reference": "unholy alliance",
      "metrics": {
        "f1": 0.9090909090909091
      },
      "elapsed_time": 0.06790947914123535,
      "timestamp": "2025-12-24T20:47:45.788691"
    },
    {
      "sample_id": "squad_5730359db2c2fd1400568a74",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 437,
      "natural_ratio": 0.00333404541015625,
      "max_context_tokens": 131072,
      "context_length": 792,
      "question_length": 42,
      "prediction": "空中雷达概念",
      "reference": "airborne radar",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.059699296951293945,
      "timestamp": "2025-12-24T20:47:45.848607"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10885",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 11403,
      "natural_ratio": 0.08699798583984375,
      "max_context_tokens": 131072,
      "context_length": 22721,
      "question_length": 45,
      "prediction": "文本中并没有提到猴子与水桶的相关情节，因此无法回答猴子在水桶边发生了什么。",
      "reference": "The Bee stings him.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.44664645195007324,
      "timestamp": "2025-12-24T20:47:46.295541"
    },
    {
      "sample_id": "squad_5727ae3e2ca10214002d9383",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 417,
      "natural_ratio": 0.00318145751953125,
      "max_context_tokens": 131072,
      "context_length": 737,
      "question_length": 58,
      "prediction": "六种",
      "reference": "five or six methods",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04741978645324707,
      "timestamp": "2025-12-24T20:47:46.343164"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5269",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 20525,
      "natural_ratio": 0.15659332275390625,
      "max_context_tokens": 131072,
      "context_length": 40951,
      "question_length": 60,
      "prediction": "林肯表示，根据普遍法律和宪法，联邦是永久性的。宪法中隐含或明确表达了所有国家政府的基本法律。可以合理地断言，没有任何政府在其根本法中有终止自身条款的先例。因此，在遵守宪法和法律的前提下执行所有现行法律，联邦将永远存在。",
      "reference": "It could not be legally done without the consent of all states in the north and south.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 1.0367035865783691,
      "timestamp": "2025-12-24T20:47:47.380168"
    },
    {
      "sample_id": "squad_56f96f929b226e1400dd144a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 341,
      "natural_ratio": 0.00260162353515625,
      "max_context_tokens": 131072,
      "context_length": 592,
      "question_length": 50,
      "prediction": "1999年",
      "reference": "1999",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07683086395263672,
      "timestamp": "2025-12-24T20:47:47.457201"
    },
    {
      "sample_id": "squad_571ad0d09499d21900609b3d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 392,
      "natural_ratio": 0.00299072265625,
      "max_context_tokens": 131072,
      "context_length": 717,
      "question_length": 27,
      "prediction": "Ciba的研究人员发现了Hydralazine。",
      "reference": "researchers at Ciba",
      "metrics": {
        "f1": 0.45161290322580644
      },
      "elapsed_time": 0.13158869743347168,
      "timestamp": "2025-12-24T20:47:47.589023"
    },
    {
      "sample_id": "squad_572932863f37b319004780f1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 402,
      "natural_ratio": 0.0030670166015625,
      "max_context_tokens": 131072,
      "context_length": 656,
      "question_length": 109,
      "prediction": "contradictions",
      "reference": "contradictions",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.055635690689086914,
      "timestamp": "2025-12-24T20:47:47.644842"
    },
    {
      "sample_id": "squad_5727b1f42ca10214002d941a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 790,
      "natural_ratio": 0.0060272216796875,
      "max_context_tokens": 131072,
      "context_length": 1472,
      "question_length": 69,
      "prediction": "23.3%",
      "reference": "23.3%",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08043503761291504,
      "timestamp": "2025-12-24T20:47:47.725440"
    },
    {
      "sample_id": "squad_56fa3d788f12f31900630100",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 300,
      "natural_ratio": 0.002288818359375,
      "max_context_tokens": 131072,
      "context_length": 463,
      "question_length": 97,
      "prediction": "wood density",
      "reference": "density",
      "metrics": {
        "f1": 0.7368421052631579
      },
      "elapsed_time": 0.043265342712402344,
      "timestamp": "2025-12-24T20:47:47.768859"
    },
    {
      "sample_id": "squad_572f332ba23a5019007fc4da",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 300,
      "natural_ratio": 0.002288818359375,
      "max_context_tokens": 131072,
      "context_length": 514,
      "question_length": 47,
      "prediction": "44,000人",
      "reference": "44,000",
      "metrics": {
        "f1": 0.923076923076923
      },
      "elapsed_time": 0.09393072128295898,
      "timestamp": "2025-12-24T20:47:47.862941"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7962",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 85287,
      "natural_ratio": 0.6506881713867188,
      "max_context_tokens": 131072,
      "context_length": 170479,
      "question_length": 55,
      "prediction": "在沙发上",
      "reference": "On the couch",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.4557337760925293,
      "timestamp": "2025-12-24T20:47:50.319015"
    },
    {
      "sample_id": "squad_573240390fdd8d15006c6885",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 378,
      "natural_ratio": 0.0028839111328125,
      "max_context_tokens": 131072,
      "context_length": 630,
      "question_length": 87,
      "prediction": "1961年",
      "reference": "in 1961",
      "metrics": {
        "f1": 0.6
      },
      "elapsed_time": 0.07908439636230469,
      "timestamp": "2025-12-24T20:47:50.398295"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10286",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 92183,
      "natural_ratio": 0.7033004760742188,
      "max_context_tokens": 131072,
      "context_length": 184304,
      "question_length": 23,
      "prediction": "Zenocrate",
      "reference": "Zabina",
      "metrics": {
        "f1": 0.4615384615384615
      },
      "elapsed_time": 3.610213041305542,
      "timestamp": "2025-12-24T20:47:54.008848"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10134",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 32663,
      "natural_ratio": 0.24919891357421875,
      "max_context_tokens": 131072,
      "context_length": 65246,
      "question_length": 40,
      "prediction": "Edward",
      "reference": "Edward",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 1.2580296993255615,
      "timestamp": "2025-12-24T20:47:55.267191"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23702",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 109821,
      "natural_ratio": 0.8378677368164062,
      "max_context_tokens": 131072,
      "context_length": 219543,
      "question_length": 60,
      "prediction": "French Pete",
      "reference": "'Frisco Kid.",
      "metrics": {
        "f1": 0.39999999999999997
      },
      "elapsed_time": 3.8830578327178955,
      "timestamp": "2025-12-24T20:47:59.150614"
    },
    {
      "sample_id": "squad_5709a8ad200fba1400368221",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 383,
      "natural_ratio": 0.00292205810546875,
      "max_context_tokens": 131072,
      "context_length": 653,
      "question_length": 74,
      "prediction": "1850",
      "reference": "1850",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07059383392333984,
      "timestamp": "2025-12-24T20:47:59.221752"
    },
    {
      "sample_id": "squad_57266bc1708984140094c575",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 399,
      "natural_ratio": 0.00304412841796875,
      "max_context_tokens": 131072,
      "context_length": 716,
      "question_length": 42,
      "prediction": "英国",
      "reference": "Britain",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.03540849685668945,
      "timestamp": "2025-12-24T20:47:59.257321"
    },
    {
      "sample_id": "squad_572fbec0947a6a140053cc30",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 548,
      "natural_ratio": 0.004180908203125,
      "max_context_tokens": 131072,
      "context_length": 1002,
      "question_length": 55,
      "prediction": "乔治三世",
      "reference": "George III of Great Britain",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.05999326705932617,
      "timestamp": "2025-12-24T20:47:59.317467"
    },
    {
      "sample_id": "squad_56db3337e7c41114004b4f33",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 262,
      "natural_ratio": 0.0019989013671875,
      "max_context_tokens": 131072,
      "context_length": 421,
      "question_length": 63,
      "prediction": "Delhi",
      "reference": "Delhi",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.041733741760253906,
      "timestamp": "2025-12-24T20:47:59.359351"
    },
    {
      "sample_id": "squad_572b8669111d821400f38eda",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 629,
      "natural_ratio": 0.00479888916015625,
      "max_context_tokens": 131072,
      "context_length": 1176,
      "question_length": 42,
      "prediction": "secondary education comprises the formal education that occurs during adolescence",
      "reference": "formal education that occurs during adolescence",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.1255795955657959,
      "timestamp": "2025-12-24T20:47:59.485082"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15411",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 120859,
      "natural_ratio": 0.9220809936523438,
      "max_context_tokens": 131072,
      "context_length": 241630,
      "question_length": 48,
      "prediction": "Patrick最初开始接近Kat是因为有人付钱让他去约她，但他并不在意金钱，而是真的对她感兴趣。",
      "reference": "Because Joey pays him.",
      "metrics": {
        "f1": 0.14545454545454545
      },
      "elapsed_time": 4.07348895072937,
      "timestamp": "2025-12-24T20:48:03.558920"
    },
    {
      "sample_id": "squad_571a441a10f8ca1400304f95",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 313,
      "natural_ratio": 0.00238800048828125,
      "max_context_tokens": 131072,
      "context_length": 524,
      "question_length": 63,
      "prediction": "California和西部美国",
      "reference": "In California and the western US",
      "metrics": {
        "f1": 0.5517241379310345
      },
      "elapsed_time": 0.06971979141235352,
      "timestamp": "2025-12-24T20:48:03.629198"
    },
    {
      "sample_id": "squad_56e01bf5231d4119001abf29",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 306,
      "natural_ratio": 0.0023345947265625,
      "max_context_tokens": 131072,
      "context_length": 495,
      "question_length": 78,
      "prediction": "Lascaux洞穴壁画",
      "reference": "Lascaux cave paintings",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 0.08555889129638672,
      "timestamp": "2025-12-24T20:48:03.714935"
    },
    {
      "sample_id": "squad_5733e6a54776f41900661476",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 441,
      "natural_ratio": 0.00336456298828125,
      "max_context_tokens": 131072,
      "context_length": 797,
      "question_length": 45,
      "prediction": "Manchester United",
      "reference": "Manchester United",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.045070648193359375,
      "timestamp": "2025-12-24T20:48:03.760162"
    },
    {
      "sample_id": "squad_5728b5f5ff5b5019007da526",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 401,
      "natural_ratio": 0.00305938720703125,
      "max_context_tokens": 131072,
      "context_length": 682,
      "question_length": 81,
      "prediction": "测试和审查",
      "reference": "eliminate errors",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.0543825626373291,
      "timestamp": "2025-12-24T20:48:03.814694"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1370",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 71887,
      "natural_ratio": 0.5484542846679688,
      "max_context_tokens": 131072,
      "context_length": 143699,
      "question_length": 36,
      "prediction": "Saturn",
      "reference": "Jupiter",
      "metrics": {
        "f1": 0.4615384615384615
      },
      "elapsed_time": 3.0742993354797363,
      "timestamp": "2025-12-24T20:48:06.889307"
    },
    {
      "sample_id": "squad_57295082af94a219006aa296",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 562,
      "natural_ratio": 0.0042877197265625,
      "max_context_tokens": 131072,
      "context_length": 1000,
      "question_length": 85,
      "prediction": "unit tests",
      "reference": "unit tests are written first",
      "metrics": {
        "f1": 0.7777777777777778
      },
      "elapsed_time": 0.05031943321228027,
      "timestamp": "2025-12-24T20:48:06.939825"
    },
    {
      "sample_id": "squad_570a64c64103511400d596c2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 550,
      "natural_ratio": 0.0041961669921875,
      "max_context_tokens": 131072,
      "context_length": 1020,
      "question_length": 40,
      "prediction": "zoology",
      "reference": "zoology",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.045862436294555664,
      "timestamp": "2025-12-24T20:48:06.985848"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8123",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 86619,
      "natural_ratio": 0.6608505249023438,
      "max_context_tokens": 131072,
      "context_length": 173146,
      "question_length": 53,
      "prediction": "Marlene Lewis可能是与角色Martha有关联的重要人物，可能是她的过去或当前生活中的一部分，具体关系需要进一步剧情发展来揭示。",
      "reference": "It's the name that women in the cult use to hide their identity.",
      "metrics": {
        "f1": 0.3333333333333333
      },
      "elapsed_time": 2.2241742610931396,
      "timestamp": "2025-12-24T20:48:09.210393"
    },
    {
      "sample_id": "squad_56e0fa4e231d4119001ac520",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 704,
      "natural_ratio": 0.00537109375,
      "max_context_tokens": 131072,
      "context_length": 1316,
      "question_length": 53,
      "prediction": "Alexey Leonov",
      "reference": "Alexey Leonov",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07416176795959473,
      "timestamp": "2025-12-24T20:48:09.284748"
    },
    {
      "sample_id": "squad_56e104dfcd28a01900c67447",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 449,
      "natural_ratio": 0.00342559814453125,
      "max_context_tokens": 131072,
      "context_length": 794,
      "question_length": 64,
      "prediction": "几内亚湾",
      "reference": "Gulf of Guinea",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.0691521167755127,
      "timestamp": "2025-12-24T20:48:09.354057"
    },
    {
      "sample_id": "squad_57267841708984140094c731",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 330,
      "natural_ratio": 0.0025177001953125,
      "max_context_tokens": 131072,
      "context_length": 554,
      "question_length": 67,
      "prediction": "Querétaro",
      "reference": "Querétaro",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06414222717285156,
      "timestamp": "2025-12-24T20:48:09.418360"
    },
    {
      "sample_id": "squad_570c38b26b8089140040fc41",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 563,
      "natural_ratio": 0.00429534912109375,
      "max_context_tokens": 131072,
      "context_length": 1010,
      "question_length": 77,
      "prediction": "Melbourne grew as a major port for exporting agricultural products like wool by becoming the primary port for Victoria, which had a developing manufacturing sector and was rich in agricultural outputs.",
      "reference": "gold mining",
      "metrics": {
        "f1": 0.45714285714285713
      },
      "elapsed_time": 0.3745112419128418,
      "timestamp": "2025-12-24T20:48:09.793056"
    },
    {
      "sample_id": "squad_5725b96289a1e219009abd78",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 363,
      "natural_ratio": 0.00276947021484375,
      "max_context_tokens": 131072,
      "context_length": 644,
      "question_length": 43,
      "prediction": "Rome",
      "reference": "Rome",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0456852912902832,
      "timestamp": "2025-12-24T20:48:09.838902"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11526",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 112939,
      "natural_ratio": 0.8616561889648438,
      "max_context_tokens": 131072,
      "context_length": 225819,
      "question_length": 20,
      "prediction": "Haw dies in the laboratory, having destroyed his apparatus and reduced his gold to protyle.",
      "reference": "Haw is found dead in his laboratory.",
      "metrics": {
        "f1": 0.85
      },
      "elapsed_time": 4.035501956939697,
      "timestamp": "2025-12-24T20:48:13.874757"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21753",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113959,
      "natural_ratio": 0.8694381713867188,
      "max_context_tokens": 131072,
      "context_length": 227820,
      "question_length": 58,
      "prediction": "通过时间机器将未来的自己送回过去杀死，这样未来的自己就不会存在来领取报酬。",
      "reference": "They are paid with gold bars strapped to the targets.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.015922546386719,
      "timestamp": "2025-12-24T20:48:17.891379"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18211",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 116528,
      "natural_ratio": 0.8890380859375,
      "max_context_tokens": 131072,
      "context_length": 232971,
      "question_length": 46,
      "prediction": "Crain's hope when he built his house was to build it for his wife Rene and their children, but tragically, he never had any children.",
      "reference": "He wanted to fill it with children with his wife.",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 3.8044066429138184,
      "timestamp": "2025-12-24T20:48:21.696554"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13637",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 112135,
      "natural_ratio": 0.8555221557617188,
      "max_context_tokens": 131072,
      "context_length": 224154,
      "question_length": 76,
      "prediction": "GRAHAM和CLIVE",
      "reference": "Graeme.",
      "metrics": {
        "f1": 0.5882352941176471
      },
      "elapsed_time": 3.708746910095215,
      "timestamp": "2025-12-24T20:48:25.405990"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13930",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 116665,
      "natural_ratio": 0.8900833129882812,
      "max_context_tokens": 131072,
      "context_length": 233233,
      "question_length": 57,
      "prediction": "Lynn",
      "reference": "Lynn, a client's daughter",
      "metrics": {
        "f1": 0.3
      },
      "elapsed_time": 0.20778250694274902,
      "timestamp": "2025-12-24T20:48:25.614451"
    },
    {
      "sample_id": "squad_56e8e4930b45c0140094cd3d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 259,
      "natural_ratio": 0.00197601318359375,
      "max_context_tokens": 131072,
      "context_length": 411,
      "question_length": 67,
      "prediction": "Purbeck marble",
      "reference": "Purbeck",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.06473350524902344,
      "timestamp": "2025-12-24T20:48:25.679715"
    },
    {
      "sample_id": "squad_56df6c9e56340a1900b29b00",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 271,
      "natural_ratio": 0.00206756591796875,
      "max_context_tokens": 131072,
      "context_length": 464,
      "question_length": 39,
      "prediction": "Chesapeake Energy Arena",
      "reference": "Chesapeake Energy Arena",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0741879940032959,
      "timestamp": "2025-12-24T20:48:25.754064"
    },
    {
      "sample_id": "squad_57333f7dd058e614000b57c8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 348,
      "natural_ratio": 0.002655029296875,
      "max_context_tokens": 131072,
      "context_length": 573,
      "question_length": 83,
      "prediction": "Paul Krugman",
      "reference": "Paul Krugman",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06371784210205078,
      "timestamp": "2025-12-24T20:48:25.817934"
    },
    {
      "sample_id": "squad_56f9fa1e8f12f31900630038",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 309,
      "natural_ratio": 0.00235748291015625,
      "max_context_tokens": 131072,
      "context_length": 529,
      "question_length": 50,
      "prediction": "Laas Geel complex",
      "reference": "the Laas Geel complex",
      "metrics": {
        "f1": 0.9166666666666666
      },
      "elapsed_time": 0.0735161304473877,
      "timestamp": "2025-12-24T20:48:25.891607"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32283",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 44763,
      "natural_ratio": 0.34151458740234375,
      "max_context_tokens": 131072,
      "context_length": 89437,
      "question_length": 49,
      "prediction": "这是挑战或决斗的象征",
      "reference": "A fight to the death.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 1.2971868515014648,
      "timestamp": "2025-12-24T20:48:27.189114"
    },
    {
      "sample_id": "squad_5728cfb7ff5b5019007da70c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 699,
      "natural_ratio": 0.00533294677734375,
      "max_context_tokens": 131072,
      "context_length": 1300,
      "question_length": 58,
      "prediction": "英国参与成立了临时政府。",
      "reference": "interim government",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.09363818168640137,
      "timestamp": "2025-12-24T20:48:27.282957"
    },
    {
      "sample_id": "squad_570b03246b8089140040f6d4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 367,
      "natural_ratio": 0.00279998779296875,
      "max_context_tokens": 131072,
      "context_length": 637,
      "question_length": 58,
      "prediction": "The Northern College of Music",
      "reference": "the Northern College of Music",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07625293731689453,
      "timestamp": "2025-12-24T20:48:27.359373"
    },
    {
      "sample_id": "squad_572796b2f1498d1400e8fcea",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 331,
      "natural_ratio": 0.00252532958984375,
      "max_context_tokens": 131072,
      "context_length": 543,
      "question_length": 80,
      "prediction": "100,000 voters",
      "reference": "100,000",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.10508298873901367,
      "timestamp": "2025-12-24T20:48:27.464621"
    },
    {
      "sample_id": "squad_572e8843cb0c0d14000f125d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 373,
      "natural_ratio": 0.00284576416015625,
      "max_context_tokens": 131072,
      "context_length": 658,
      "question_length": 49,
      "prediction": "14 August",
      "reference": "14 August",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05513787269592285,
      "timestamp": "2025-12-24T20:48:27.519919"
    },
    {
      "sample_id": "squad_5726684d708984140094c506",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 346,
      "natural_ratio": 0.0026397705078125,
      "max_context_tokens": 131072,
      "context_length": 581,
      "question_length": 71,
      "prediction": "150,000",
      "reference": "150,000",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09477925300598145,
      "timestamp": "2025-12-24T20:48:27.614891"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19319",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115651,
      "natural_ratio": 0.8823471069335938,
      "max_context_tokens": 131072,
      "context_length": 231219,
      "question_length": 43,
      "prediction": "根据文档内容，John被召唤到他老板的办公室是因为John的客户Kirkland获得了其他法律代表，不再需要John的服务。",
      "reference": "He was about to be fired due to his frequent cancellation of meetings with important clients.",
      "metrics": {
        "f1": 0.2285714285714286
      },
      "elapsed_time": 0.4591846466064453,
      "timestamp": "2025-12-24T20:48:28.074452"
    },
    {
      "sample_id": "squad_571cf2c5dd7acb1400e4c1bf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 616,
      "natural_ratio": 0.00469970703125,
      "max_context_tokens": 131072,
      "context_length": 1137,
      "question_length": 56,
      "prediction": "Veronal",
      "reference": "Veronal",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0452885627746582,
      "timestamp": "2025-12-24T20:48:28.120274"
    },
    {
      "sample_id": "squad_5726ecd2dd62a815002e956c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 513,
      "natural_ratio": 0.00391387939453125,
      "max_context_tokens": 131072,
      "context_length": 950,
      "question_length": 37,
      "prediction": "学生被教导记住汉字的意义的本土韩语发音和基于汉字的汉语发音（即每个汉字的形声字发音）。",
      "reference": "native Korean pronunciation for the hanja's meaning",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.30089545249938965,
      "timestamp": "2025-12-24T20:48:28.421347"
    },
    {
      "sample_id": "squad_570d0581b3d812140066d39f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 358,
      "natural_ratio": 0.0027313232421875,
      "max_context_tokens": 131072,
      "context_length": 615,
      "question_length": 62,
      "prediction": "Mac，Apple的LaserWriter打印机，以及Mac-specific软件如Boston Software的MacPublisher和Aldus PageMaker。",
      "reference": "Mac, Apple's LaserWriter printer, and Mac-specific software like Boston Software's MacPublisher",
      "metrics": {
        "f1": 0.7368421052631579
      },
      "elapsed_time": 0.2852649688720703,
      "timestamp": "2025-12-24T20:48:28.706792"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1954",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 110128,
      "natural_ratio": 0.8402099609375,
      "max_context_tokens": 131072,
      "context_length": 220171,
      "question_length": 46,
      "prediction": "通过食物任务和与俄罗斯游击队的交易获得。",
      "reference": "By raiding local farms.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.841007709503174,
      "timestamp": "2025-12-24T20:48:33.548133"
    },
    {
      "sample_id": "squad_572a0f656aef05140015521c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 433,
      "natural_ratio": 0.00330352783203125,
      "max_context_tokens": 131072,
      "context_length": 774,
      "question_length": 53,
      "prediction": "imperfect",
      "reference": "imperfective",
      "metrics": {
        "f1": 0.9411764705882353
      },
      "elapsed_time": 0.051378488540649414,
      "timestamp": "2025-12-24T20:48:33.600056"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8609",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 94939,
      "natural_ratio": 0.7243270874023438,
      "max_context_tokens": 131072,
      "context_length": 189753,
      "question_length": 86,
      "prediction": "Lerner",
      "reference": "Lerner",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 4.0880820751190186,
      "timestamp": "2025-12-24T20:48:37.688492"
    },
    {
      "sample_id": "squad_56db792ce7c41114004b517f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 277,
      "natural_ratio": 0.00211334228515625,
      "max_context_tokens": 131072,
      "context_length": 458,
      "question_length": 57,
      "prediction": "foreign correspondents in Beijing",
      "reference": "foreign correspondents",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.07682561874389648,
      "timestamp": "2025-12-24T20:48:37.765505"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22589",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93475,
      "natural_ratio": 0.7131576538085938,
      "max_context_tokens": 131072,
      "context_length": 186884,
      "question_length": 27,
      "prediction": "Pauline resides in a house with Malcolm and their guests, including Claude and Ingrid.",
      "reference": "Long Island",
      "metrics": {
        "f1": 0.6206896551724138
      },
      "elapsed_time": 3.018845558166504,
      "timestamp": "2025-12-24T20:48:40.784749"
    },
    {
      "sample_id": "squad_573421ecd058e614000b69b6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 410,
      "natural_ratio": 0.0031280517578125,
      "max_context_tokens": 131072,
      "context_length": 709,
      "question_length": 71,
      "prediction": "X-rays, CAT scans, PET scans or NMR",
      "reference": "X-rays, CAT scans, PET scans or NMR",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.1394803524017334,
      "timestamp": "2025-12-24T20:48:40.924425"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1491",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 91185,
      "natural_ratio": 0.6956863403320312,
      "max_context_tokens": 131072,
      "context_length": 182284,
      "question_length": 47,
      "prediction": "在枫树上的一窝里",
      "reference": "A MAPLE TREE",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.914879322052002,
      "timestamp": "2025-12-24T20:48:43.839632"
    },
    {
      "sample_id": "squad_56f726613d8e2e1400e373a5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 223,
      "natural_ratio": 0.00170135498046875,
      "max_context_tokens": 131072,
      "context_length": 332,
      "question_length": 75,
      "prediction": "当他们的迁徙运动结束时",
      "reference": "When their migratory movements ended",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.0966348648071289,
      "timestamp": "2025-12-24T20:48:43.936460"
    },
    {
      "sample_id": "squad_572f7a7704bcaa1900d769dc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 599,
      "natural_ratio": 0.00457000732421875,
      "max_context_tokens": 131072,
      "context_length": 1093,
      "question_length": 65,
      "prediction": "越来越晦涩的写作内容",
      "reference": "increasingly esoteric writing",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.08584046363830566,
      "timestamp": "2025-12-24T20:48:44.022464"
    },
    {
      "sample_id": "squad_57313d9505b4da19006bcf21",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 353,
      "natural_ratio": 0.00269317626953125,
      "max_context_tokens": 131072,
      "context_length": 618,
      "question_length": 49,
      "prediction": "通过菊花链连接，并且数据输入端连接到微处理器。",
      "reference": "in a daisy chain",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.16376924514770508,
      "timestamp": "2025-12-24T20:48:44.186394"
    },
    {
      "sample_id": "squad_56e105afcd28a01900c6744c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 196,
      "natural_ratio": 0.001495361328125,
      "max_context_tokens": 131072,
      "context_length": 303,
      "question_length": 50,
      "prediction": "Chinese DFH-3 geostationary communications satellite",
      "reference": "the Chinese DFH-3 geostationary communications satellite",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.1301741600036621,
      "timestamp": "2025-12-24T20:48:44.316730"
    },
    {
      "sample_id": "squad_5727871b708984140094e000",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 568,
      "natural_ratio": 0.00433349609375,
      "max_context_tokens": 131072,
      "context_length": 1047,
      "question_length": 50,
      "prediction": "作为劳动者",
      "reference": "laborers",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.05513143539428711,
      "timestamp": "2025-12-24T20:48:44.372015"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13074",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 16681,
      "natural_ratio": 0.12726593017578125,
      "max_context_tokens": 131072,
      "context_length": 33278,
      "question_length": 45,
      "prediction": "音乐停止，舞会参与者停止跳舞，整个场景陷入了一种不安的静止状态。",
      "reference": "A guest in blood-splattered robes and a mask, with signs of the Red Death appeared.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.5307235717773438,
      "timestamp": "2025-12-24T20:48:44.902967"
    },
    {
      "sample_id": "squad_56dcfef266d3e219004dab95",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 178,
      "natural_ratio": 0.0013580322265625,
      "max_context_tokens": 131072,
      "context_length": 259,
      "question_length": 58,
      "prediction": "中央为高原地形",
      "reference": "plateau",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06041455268859863,
      "timestamp": "2025-12-24T20:48:44.963550"
    },
    {
      "sample_id": "squad_57268b01708984140094c98e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 334,
      "natural_ratio": 0.0025482177734375,
      "max_context_tokens": 131072,
      "context_length": 573,
      "question_length": 55,
      "prediction": "Euro 2004",
      "reference": "Euro 2004",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08420038223266602,
      "timestamp": "2025-12-24T20:48:45.047907"
    },
    {
      "sample_id": "squad_573041b7947a6a140053d358",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 322,
      "natural_ratio": 0.0024566650390625,
      "max_context_tokens": 131072,
      "context_length": 558,
      "question_length": 46,
      "prediction": "11",
      "reference": "eleven",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04332590103149414,
      "timestamp": "2025-12-24T20:48:45.091379"
    },
    {
      "sample_id": "squad_56e03cc27aa994140058e3f8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 176,
      "natural_ratio": 0.0013427734375,
      "max_context_tokens": 131072,
      "context_length": 268,
      "question_length": 44,
      "prediction": "Cartooning",
      "reference": "Cartooning",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04966568946838379,
      "timestamp": "2025-12-24T20:48:45.141198"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7304",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 79150,
      "natural_ratio": 0.6038665771484375,
      "max_context_tokens": 131072,
      "context_length": 158213,
      "question_length": 47,
      "prediction": "因为这是他最后的机会，可以赚到足够的钱让自己和女儿有一个新的开始。",
      "reference": "So that he can have enough money to take his daughter to California",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.9830141067504883,
      "timestamp": "2025-12-24T20:48:49.124570"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17636",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 108668,
      "natural_ratio": 0.829071044921875,
      "max_context_tokens": 131072,
      "context_length": 217262,
      "question_length": 34,
      "prediction": "Character Marker",
      "reference": "The spectacles.",
      "metrics": {
        "f1": 0.631578947368421
      },
      "elapsed_time": 3.495741367340088,
      "timestamp": "2025-12-24T20:48:52.620660"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7892",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 66060,
      "natural_ratio": 0.503997802734375,
      "max_context_tokens": 131072,
      "context_length": 132052,
      "question_length": 29,
      "prediction": "不存在第八个类别，提供的内容只有七个部分：A. Introductory Note, B. Exclusive Rights in Copyrighted Works, C. Fair Use, D. Reproduction by Libraries and Archives, E. Liability for Infringement, F. Guidelines for Off-air Recording of Broadcast Programming for Educational Purposes。",
      "reference": "architectural works",
      "metrics": {
        "f1": 0.45161290322580644
      },
      "elapsed_time": 2.210745096206665,
      "timestamp": "2025-12-24T20:48:54.832151"
    },
    {
      "sample_id": "squad_5725dab089a1e219009abfcc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 430,
      "natural_ratio": 0.0032806396484375,
      "max_context_tokens": 131072,
      "context_length": 746,
      "question_length": 75,
      "prediction": "Premier League",
      "reference": "Premier League",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.060882568359375,
      "timestamp": "2025-12-24T20:48:54.893243"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28937",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 109126,
      "natural_ratio": 0.8325653076171875,
      "max_context_tokens": 131072,
      "context_length": 218168,
      "question_length": 44,
      "prediction": "Honey Whitlock is kidnapped and taken to Baltimore, where she is held hostage and forced to participate in the making of a film by the terrorist group known as \"The Sprocket Holes\". She eventually escapes and becomes a symbol of resistance against mainstream cinema.",
      "reference": "She is kidnapped by Cecil B. Demented.",
      "metrics": {
        "f1": 0.7906976744186047
      },
      "elapsed_time": 4.184330940246582,
      "timestamp": "2025-12-24T20:48:59.077972"
    },
    {
      "sample_id": "squad_570e66e60dc6ce190020501d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 433,
      "natural_ratio": 0.00330352783203125,
      "max_context_tokens": 131072,
      "context_length": 784,
      "question_length": 43,
      "prediction": "Sanskrit grammatical tradition, Vyākaraṇa，始于晚期吠陀时期，在Pāṇini的Aṣṭādhyāyī（约公元前5世纪）中达到顶峰。",
      "reference": "Vyākaraṇa",
      "metrics": {
        "f1": 0.19354838709677416
      },
      "elapsed_time": 0.4852609634399414,
      "timestamp": "2025-12-24T20:48:59.563800"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14124",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 94177,
      "natural_ratio": 0.7185134887695312,
      "max_context_tokens": 131072,
      "context_length": 188264,
      "question_length": 50,
      "prediction": "James Howlett没有对他的父亲的杀人凶手采取任何行动，因为文中并未提及他父亲被杀的情节。",
      "reference": "Kills him.  ",
      "metrics": {
        "f1": 0.20408163265306123
      },
      "elapsed_time": 3.4025917053222656,
      "timestamp": "2025-12-24T20:49:02.966792"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18651",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 109036,
      "natural_ratio": 0.831878662109375,
      "max_context_tokens": 131072,
      "context_length": 217986,
      "question_length": 46,
      "prediction": "Doris",
      "reference": "Natalie Belasario",
      "metrics": {
        "f1": 0.5000000000000001
      },
      "elapsed_time": 2.692782402038574,
      "timestamp": "2025-12-24T20:49:05.659921"
    },
    {
      "sample_id": "squad_57275cfcf1498d1400e8f6ef",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 388,
      "natural_ratio": 0.002960205078125,
      "max_context_tokens": 131072,
      "context_length": 653,
      "question_length": 83,
      "prediction": "短",
      "reference": "short life expectancy",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.03921055793762207,
      "timestamp": "2025-12-24T20:49:05.699676"
    },
    {
      "sample_id": "squad_570a3a0e6d058f1900182d06",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 432,
      "natural_ratio": 0.0032958984375,
      "max_context_tokens": 131072,
      "context_length": 789,
      "question_length": 36,
      "prediction": "William IV",
      "reference": "William IV",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04569506645202637,
      "timestamp": "2025-12-24T20:49:05.745529"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19024",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 117804,
      "natural_ratio": 0.898773193359375,
      "max_context_tokens": 131072,
      "context_length": 235515,
      "question_length": 54,
      "prediction": "Woody prevented the Del Fuegos from chasing him by slamming a dart into Jack's shoulder, causing Jack to boil red and focus on the dart instead of pursuing Woody.",
      "reference": "Woody cut the fuel lines on their motorcycles.",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 4.229392766952515,
      "timestamp": "2025-12-24T20:49:09.975269"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5366",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 88405,
      "natural_ratio": 0.6744766235351562,
      "max_context_tokens": 131072,
      "context_length": 176716,
      "question_length": 54,
      "prediction": "美国新泽西州",
      "reference": "Canada",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.7506203651428223,
      "timestamp": "2025-12-24T20:49:12.726611"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18394",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 54346,
      "natural_ratio": 0.4146270751953125,
      "max_context_tokens": 131072,
      "context_length": 108571,
      "question_length": 82,
      "prediction": "The robots developed by the United Nations to fight the Soviets were called claws.",
      "reference": "Claws",
      "metrics": {
        "f1": 0.11494252873563218
      },
      "elapsed_time": 1.549367904663086,
      "timestamp": "2025-12-24T20:49:14.276294"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25669",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 51741,
      "natural_ratio": 0.39475250244140625,
      "max_context_tokens": 131072,
      "context_length": 103388,
      "question_length": 55,
      "prediction": "Bartleby consistently responds with \"I would prefer not to\" when asked to work or move.",
      "reference": "\"I would prefer not to.\"",
      "metrics": {
        "f1": 0.7692307692307693
      },
      "elapsed_time": 1.549182415008545,
      "timestamp": "2025-12-24T20:49:15.825876"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26483",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 66432,
      "natural_ratio": 0.5068359375,
      "max_context_tokens": 131072,
      "context_length": 132748,
      "question_length": 77,
      "prediction": "Grant acquired the Velociraptor larynx by blowing through a model resonating chamber, mimicking the \"egg\" cry he heard the raptors give.",
      "reference": "It was replicated by Billy with a 3D printer.",
      "metrics": {
        "f1": 0.7555555555555556
      },
      "elapsed_time": 2.817246913909912,
      "timestamp": "2025-12-24T20:49:18.643498"
    },
    {
      "sample_id": "squad_572699885951b619008f778a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 384,
      "natural_ratio": 0.0029296875,
      "max_context_tokens": 131072,
      "context_length": 660,
      "question_length": 69,
      "prediction": "The Arthashastra provides an account of the science of politics for a wise ruler.",
      "reference": "The Arthashastra",
      "metrics": {
        "f1": 0.32989690721649484
      },
      "elapsed_time": 0.2089247703552246,
      "timestamp": "2025-12-24T20:49:18.852628"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29967",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113600,
      "natural_ratio": 0.86669921875,
      "max_context_tokens": 131072,
      "context_length": 227114,
      "question_length": 47,
      "prediction": "纳米武器",
      "reference": "Nanotech base weapon.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.243772029876709,
      "timestamp": "2025-12-24T20:49:23.096727"
    },
    {
      "sample_id": "squad_5730299db2c2fd14005689a8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 358,
      "natural_ratio": 0.0027313232421875,
      "max_context_tokens": 131072,
      "context_length": 634,
      "question_length": 42,
      "prediction": "5",
      "reference": "five slaves",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04069042205810547,
      "timestamp": "2025-12-24T20:49:23.137955"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29172",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 75258,
      "natural_ratio": 0.5741729736328125,
      "max_context_tokens": 131072,
      "context_length": 150428,
      "question_length": 48,
      "prediction": "Barton Fink has written a play called \"Bare Ruined Choirs\".",
      "reference": "Bare Ruined Choirs",
      "metrics": {
        "f1": 0.4675324675324675
      },
      "elapsed_time": 3.3954873085021973,
      "timestamp": "2025-12-24T20:49:26.533797"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29455",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 45580,
      "natural_ratio": 0.347747802734375,
      "max_context_tokens": 131072,
      "context_length": 91086,
      "question_length": 35,
      "prediction": "Beaucaire intends to marry Lady Mary Carlisle.",
      "reference": "The woman that King Louis XV had chosen for him in the first place.",
      "metrics": {
        "f1": 0.75
      },
      "elapsed_time": 0.22222590446472168,
      "timestamp": "2025-12-24T20:49:26.756371"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9617",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 118768,
      "natural_ratio": 0.9061279296875,
      "max_context_tokens": 131072,
      "context_length": 237440,
      "question_length": 57,
      "prediction": "RPG",
      "reference": "The weapon that could destroy surrogates.",
      "metrics": {
        "f1": 0.2857142857142857
      },
      "elapsed_time": 4.036792039871216,
      "timestamp": "2025-12-24T20:49:30.793557"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14083",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 91012,
      "natural_ratio": 0.694366455078125,
      "max_context_tokens": 131072,
      "context_length": 181959,
      "question_length": 26,
      "prediction": "Kirsty",
      "reference": "Kirsty",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 2.877040386199951,
      "timestamp": "2025-12-24T20:49:33.671359"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23087",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 104656,
      "natural_ratio": 0.7984619140625,
      "max_context_tokens": 131072,
      "context_length": 209218,
      "question_length": 55,
      "prediction": "Anaheim Amigos",
      "reference": "The Boston Celtics.",
      "metrics": {
        "f1": 0.6363636363636365
      },
      "elapsed_time": 4.150201082229614,
      "timestamp": "2025-12-24T20:49:37.821939"
    },
    {
      "sample_id": "squad_572ec021c246551400ce4600",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 405,
      "natural_ratio": 0.00308990478515625,
      "max_context_tokens": 131072,
      "context_length": 697,
      "question_length": 74,
      "prediction": "否",
      "reference": "They met no serious resistance, and wielded little violence against the monarchists.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.0411829948425293,
      "timestamp": "2025-12-24T20:49:37.863314"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10076",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 94828,
      "natural_ratio": 0.723480224609375,
      "max_context_tokens": 131072,
      "context_length": 189560,
      "question_length": 56,
      "prediction": "Rory",
      "reference": "Brian.",
      "metrics": {
        "f1": 0.2222222222222222
      },
      "elapsed_time": 3.122079610824585,
      "timestamp": "2025-12-24T20:49:40.985797"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7362",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 31097,
      "natural_ratio": 0.23725128173828125,
      "max_context_tokens": 131072,
      "context_length": 62058,
      "question_length": 96,
      "prediction": "他希望将尸体暂时安放在一个地窖的 vault 中，两周后再进行最终安葬。",
      "reference": "Roderick insists they inter her in the family tomb for two weeks.",
      "metrics": {
        "f1": 0.14814814814814817
      },
      "elapsed_time": 0.9024953842163086,
      "timestamp": "2025-12-24T20:49:41.888583"
    },
    {
      "sample_id": "squad_57268eea708984140094ca16",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 525,
      "natural_ratio": 0.00400543212890625,
      "max_context_tokens": 131072,
      "context_length": 950,
      "question_length": 61,
      "prediction": "Olympiacos",
      "reference": "Olympiacos",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06761789321899414,
      "timestamp": "2025-12-24T20:49:41.956378"
    },
    {
      "sample_id": "squad_56e1063ecd28a01900c67457",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 417,
      "natural_ratio": 0.00318145751953125,
      "max_context_tokens": 131072,
      "context_length": 751,
      "question_length": 44,
      "prediction": "太平洋（Pacific ocean）",
      "reference": "Pacific",
      "metrics": {
        "f1": 0.56
      },
      "elapsed_time": 0.07538151741027832,
      "timestamp": "2025-12-24T20:49:42.031916"
    },
    {
      "sample_id": "squad_57310e2b05b4da19006bcd34",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 646,
      "natural_ratio": 0.0049285888671875,
      "max_context_tokens": 131072,
      "context_length": 1216,
      "question_length": 37,
      "prediction": "The \"Big Four\" created the Treaty of Versailles, which includes France, Italy, United Kingdom, and the United States.",
      "reference": "Big Three",
      "metrics": {
        "f1": 0.48484848484848486
      },
      "elapsed_time": 0.2882659435272217,
      "timestamp": "2025-12-24T20:49:42.320345"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7347",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 84711,
      "natural_ratio": 0.6462936401367188,
      "max_context_tokens": 131072,
      "context_length": 169359,
      "question_length": 23,
      "prediction": "文本中并未提到有UFO着陆的场景。",
      "reference": "Mojave Deser",
      "metrics": {
        "f1": 0.07407407407407408
      },
      "elapsed_time": 3.545433521270752,
      "timestamp": "2025-12-24T20:49:45.866130"
    },
    {
      "sample_id": "squad_572ef6bfcb0c0d14000f16a6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 337,
      "natural_ratio": 0.00257110595703125,
      "max_context_tokens": 131072,
      "context_length": 563,
      "question_length": 71,
      "prediction": "Material transport elevators",
      "reference": "Material transport elevators",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07138514518737793,
      "timestamp": "2025-12-24T20:49:45.937748"
    },
    {
      "sample_id": "squad_571cf8175efbb31900334e68",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 646,
      "natural_ratio": 0.0049285888671875,
      "max_context_tokens": 131072,
      "context_length": 1212,
      "question_length": 41,
      "prediction": "Merck",
      "reference": "Merck",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.053060054779052734,
      "timestamp": "2025-12-24T20:49:45.991038"
    },
    {
      "sample_id": "squad_572a0e601d04691400779713",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 300,
      "natural_ratio": 0.002288818359375,
      "max_context_tokens": 131072,
      "context_length": 503,
      "question_length": 58,
      "prediction": "1483",
      "reference": "(1483)",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.06831932067871094,
      "timestamp": "2025-12-24T20:49:46.059589"
    },
    {
      "sample_id": "squad_5728d3393acd2414000dff75",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 355,
      "natural_ratio": 0.00270843505859375,
      "max_context_tokens": 131072,
      "context_length": 610,
      "question_length": 61,
      "prediction": "希腊传统中提到的烟雾和嚼冬青叶的现象似乎被最近的研究所证实。",
      "reference": "existence of vapours and chewing of laurel-leaves",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.22972917556762695,
      "timestamp": "2025-12-24T20:49:46.289546"
    },
    {
      "sample_id": "squad_56ccf53362d2951400fa64ff",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 811,
      "natural_ratio": 0.00618743896484375,
      "max_context_tokens": 131072,
      "context_length": 1528,
      "question_length": 54,
      "prediction": "一个 licensed border market",
      "reference": "a licensed border market",
      "metrics": {
        "f1": 0.9375
      },
      "elapsed_time": 0.07137751579284668,
      "timestamp": "2025-12-24T20:49:46.361118"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26210",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122910,
      "natural_ratio": 0.9377288818359375,
      "max_context_tokens": 131072,
      "context_length": 245733,
      "question_length": 47,
      "prediction": "无法确定具体年龄，文本中没有提到飞碟的具体制造时间。",
      "reference": "over 100,000 years old.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.260341167449951,
      "timestamp": "2025-12-24T20:49:49.621796"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_3760",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 123124,
      "natural_ratio": 0.939361572265625,
      "max_context_tokens": 131072,
      "context_length": 246144,
      "question_length": 65,
      "prediction": "Parnassus gained a bad cold.",
      "reference": "Immortality",
      "metrics": {
        "f1": 0.4166666666666667
      },
      "elapsed_time": 4.091855049133301,
      "timestamp": "2025-12-24T20:49:53.714393"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8748",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 89183,
      "natural_ratio": 0.6804122924804688,
      "max_context_tokens": 131072,
      "context_length": 178274,
      "question_length": 52,
      "prediction": "木鼠",
      "reference": "woodchuck",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.830012559890747,
      "timestamp": "2025-12-24T20:49:56.545135"
    },
    {
      "sample_id": "squad_57304b80069b531400832009",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 475,
      "natural_ratio": 0.00362396240234375,
      "max_context_tokens": 131072,
      "context_length": 851,
      "question_length": 60,
      "prediction": "Charleston's Neck area",
      "reference": "the city's Neck area",
      "metrics": {
        "f1": 0.8461538461538461
      },
      "elapsed_time": 0.10833740234375,
      "timestamp": "2025-12-24T20:49:56.653705"
    },
    {
      "sample_id": "squad_5731d9f3e17f3d1400422499",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 307,
      "natural_ratio": 0.00234222412109375,
      "max_context_tokens": 131072,
      "context_length": 549,
      "question_length": 26,
      "prediction": "1960年4月21日",
      "reference": "April 21, 1960",
      "metrics": {
        "f1": 0.4761904761904762
      },
      "elapsed_time": 0.12587881088256836,
      "timestamp": "2025-12-24T20:49:56.779775"
    },
    {
      "sample_id": "squad_570c488cb3d812140066d079",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 338,
      "natural_ratio": 0.0025787353515625,
      "max_context_tokens": 131072,
      "context_length": 592,
      "question_length": 45,
      "prediction": "late 1890s",
      "reference": "late 1890s",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09444642066955566,
      "timestamp": "2025-12-24T20:49:56.874385"
    },
    {
      "sample_id": "squad_56e14927e3433e1400422d5a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 707,
      "natural_ratio": 0.00539398193359375,
      "max_context_tokens": 131072,
      "context_length": 1334,
      "question_length": 40,
      "prediction": "Lon Chaney",
      "reference": "Lon Chaney",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05995631217956543,
      "timestamp": "2025-12-24T20:49:56.934489"
    },
    {
      "sample_id": "squad_57267331708984140094c68b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 370,
      "natural_ratio": 0.0028228759765625,
      "max_context_tokens": 131072,
      "context_length": 672,
      "question_length": 29,
      "prediction": "1.4百万",
      "reference": "1.4 million",
      "metrics": {
        "f1": 0.42857142857142855
      },
      "elapsed_time": 0.06398510932922363,
      "timestamp": "2025-12-24T20:49:56.998634"
    },
    {
      "sample_id": "squad_572fc92904bcaa1900d76d30",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 510,
      "natural_ratio": 0.0038909912109375,
      "max_context_tokens": 131072,
      "context_length": 923,
      "question_length": 58,
      "prediction": "Sir Dave Richards，1999年4月",
      "reference": "The current chairman is Sir Dave Richards, who was appointed in April 199",
      "metrics": {
        "f1": 0.6486486486486486
      },
      "elapsed_time": 0.13461804389953613,
      "timestamp": "2025-12-24T20:49:57.133408"
    },
    {
      "sample_id": "squad_571a30bb10f8ca1400304f53",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 494,
      "natural_ratio": 0.0037689208984375,
      "max_context_tokens": 131072,
      "context_length": 897,
      "question_length": 51,
      "prediction": "Bainbridge 和 Vashon 岛屿",
      "reference": "Bainbridge and Vashon",
      "metrics": {
        "f1": 0.896551724137931
      },
      "elapsed_time": 0.1345221996307373,
      "timestamp": "2025-12-24T20:49:57.268088"
    },
    {
      "sample_id": "squad_570da0f416d0071400510c17",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 454,
      "natural_ratio": 0.0034637451171875,
      "max_context_tokens": 131072,
      "context_length": 821,
      "question_length": 47,
      "prediction": "missiles",
      "reference": "missiles",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04470968246459961,
      "timestamp": "2025-12-24T20:49:57.312945"
    },
    {
      "sample_id": "squad_5705f85275f01819005e77e8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 285,
      "natural_ratio": 0.00217437744140625,
      "max_context_tokens": 131072,
      "context_length": 487,
      "question_length": 43,
      "prediction": "随着秋季白昼变短，鸟类返回到温暖的地区。",
      "reference": "autumn",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.16001510620117188,
      "timestamp": "2025-12-24T20:49:57.473109"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13541",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 120233,
      "natural_ratio": 0.9173049926757812,
      "max_context_tokens": 131072,
      "context_length": 240404,
      "question_length": 22,
      "prediction": "Mongo lives with Mary.",
      "reference": "With Precious' grandmother.",
      "metrics": {
        "f1": 0.7777777777777778
      },
      "elapsed_time": 3.8615312576293945,
      "timestamp": "2025-12-24T20:50:01.334995"
    },
    {
      "sample_id": "squad_5706910552bb891400689a66",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 160,
      "natural_ratio": 0.001220703125,
      "max_context_tokens": 131072,
      "context_length": 227,
      "question_length": 54,
      "prediction": "蜡嘴雀属的物种",
      "reference": "the waxwings Bombycilla",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.08518838882446289,
      "timestamp": "2025-12-24T20:50:01.420727"
    },
    {
      "sample_id": "squad_571dfb4e55697319006390de",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 537,
      "natural_ratio": 0.00409698486328125,
      "max_context_tokens": 131072,
      "context_length": 987,
      "question_length": 48,
      "prediction": "plantation records, court records, land deeds和其他来源的帮助。",
      "reference": "Colonial records of French and Spanish slave ships and sales, and plantation records in all the former colonies",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.15746188163757324,
      "timestamp": "2025-12-24T20:50:01.578417"
    },
    {
      "sample_id": "squad_56ce55feaab44d1400b886ce",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 144,
      "natural_ratio": 0.0010986328125,
      "max_context_tokens": 131072,
      "context_length": 215,
      "question_length": 34,
      "prediction": "Solar energy comes from the Sun.",
      "reference": "the Sun",
      "metrics": {
        "f1": 0.358974358974359
      },
      "elapsed_time": 0.09025287628173828,
      "timestamp": "2025-12-24T20:50:01.668845"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15225",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 124005,
      "natural_ratio": 0.9460830688476562,
      "max_context_tokens": 131072,
      "context_length": 247925,
      "question_length": 45,
      "prediction": "因为安娜知道关于猎杀者的一些信息，而其他人则没有。",
      "reference": "She is always unarmed, there is no fun in hunting her.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.011860132217407,
      "timestamp": "2025-12-24T20:50:05.681055"
    },
    {
      "sample_id": "squad_56f7529fa6d7ea1400e171af",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 146,
      "natural_ratio": 0.0011138916015625,
      "max_context_tokens": 131072,
      "context_length": 161,
      "question_length": 91,
      "prediction": "Shawn Vancour",
      "reference": "Shawn Vancour",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07451152801513672,
      "timestamp": "2025-12-24T20:50:05.756105"
    },
    {
      "sample_id": "squad_572fb0a5b2c2fd140056835d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 574,
      "natural_ratio": 0.0043792724609375,
      "max_context_tokens": 131072,
      "context_length": 1053,
      "question_length": 55,
      "prediction": "自杀和酗酒",
      "reference": "suicides and drunkenness",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06600165367126465,
      "timestamp": "2025-12-24T20:50:05.822274"
    },
    {
      "sample_id": "squad_5733c1a94776f419006611a8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 538,
      "natural_ratio": 0.0041046142578125,
      "max_context_tokens": 131072,
      "context_length": 971,
      "question_length": 66,
      "prediction": "Midwest Fencing Conference",
      "reference": "Midwest Fencing Conference",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07865357398986816,
      "timestamp": "2025-12-24T20:50:05.901084"
    },
    {
      "sample_id": "squad_57303233a23a5019007fcf58",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 387,
      "natural_ratio": 0.00295257568359375,
      "max_context_tokens": 131072,
      "context_length": 681,
      "question_length": 54,
      "prediction": "Whitehead considered himself a rank amateur as a philosopher.",
      "reference": "rank amateur",
      "metrics": {
        "f1": 0.4
      },
      "elapsed_time": 0.13351655006408691,
      "timestamp": "2025-12-24T20:50:06.034756"
    },
    {
      "sample_id": "squad_5726b60ef1498d1400e8e874",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 597,
      "natural_ratio": 0.00455474853515625,
      "max_context_tokens": 131072,
      "context_length": 1100,
      "question_length": 55,
      "prediction": "所有臣民",
      "reference": "all subjects",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.05529665946960449,
      "timestamp": "2025-12-24T20:50:06.090202"
    },
    {
      "sample_id": "squad_570e0a430b85d914000d7c89",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 316,
      "natural_ratio": 0.002410888671875,
      "max_context_tokens": 131072,
      "context_length": 550,
      "question_length": 43,
      "prediction": "非本地物种的无意引入",
      "reference": "non-native species",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.0803232192993164,
      "timestamp": "2025-12-24T20:50:06.170675"
    },
    {
      "sample_id": "squad_572ef20ecb0c0d14000f1696",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 306,
      "natural_ratio": 0.0023345947265625,
      "max_context_tokens": 131072,
      "context_length": 512,
      "question_length": 60,
      "prediction": "2007",
      "reference": "2007",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06320738792419434,
      "timestamp": "2025-12-24T20:50:06.234030"
    },
    {
      "sample_id": "squad_56df49d18bc80c19004e4a0a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 376,
      "natural_ratio": 0.00286865234375,
      "max_context_tokens": 131072,
      "context_length": 663,
      "question_length": 50,
      "prediction": "早期人类通过 scavenging 获得肉食，即他们利用被 predator 杀死或自然死亡的大型动物的尸体。",
      "reference": "scavenging",
      "metrics": {
        "f1": 0.33333333333333337
      },
      "elapsed_time": 0.3125438690185547,
      "timestamp": "2025-12-24T20:50:06.546722"
    },
    {
      "sample_id": "squad_5726706bdd62a815002e84dc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 530,
      "natural_ratio": 0.0040435791015625,
      "max_context_tokens": 131072,
      "context_length": 942,
      "question_length": 79,
      "prediction": "Lady Flora Hastings",
      "reference": "one of her mother's ladies-in-waiting, Lady Flora Hastings",
      "metrics": {
        "f1": 0.8235294117647058
      },
      "elapsed_time": 0.06523776054382324,
      "timestamp": "2025-12-24T20:50:06.612116"
    },
    {
      "sample_id": "squad_57290eaa3f37b31900477fe3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 624,
      "natural_ratio": 0.0047607421875,
      "max_context_tokens": 131072,
      "context_length": 1139,
      "question_length": 70,
      "prediction": "Kerry fired campaign manager Jim Jordan。",
      "reference": "he fired campaign manager Jim Jordan",
      "metrics": {
        "f1": 0.8749999999999999
      },
      "elapsed_time": 0.10832834243774414,
      "timestamp": "2025-12-24T20:50:06.720602"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20826",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 72361,
      "natural_ratio": 0.5520706176757812,
      "max_context_tokens": 131072,
      "context_length": 144654,
      "question_length": 28,
      "prediction": "Suzie",
      "reference": "Suzie",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 2.783482074737549,
      "timestamp": "2025-12-24T20:50:09.504423"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31100",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 49053,
      "natural_ratio": 0.37424468994140625,
      "max_context_tokens": 131072,
      "context_length": 98001,
      "question_length": 66,
      "prediction": "他递给女士一张纸条，上面写着“关于生死攸关的事务”。",
      "reference": "a composing draught for her sister",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 1.4088904857635498,
      "timestamp": "2025-12-24T20:50:10.913711"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8465",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122567,
      "natural_ratio": 0.9351119995117188,
      "max_context_tokens": 131072,
      "context_length": 245061,
      "question_length": 34,
      "prediction": "Bloom和Stephen想要找回Stephen，并且面对Diamond Dog。同时，Penelope被要求支付赎金。",
      "reference": "To fake the deaths of Bloom and Stephen",
      "metrics": {
        "f1": 0.5306122448979592
      },
      "elapsed_time": 4.129548788070679,
      "timestamp": "2025-12-24T20:50:15.043658"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8131",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 86624,
      "natural_ratio": 0.660888671875,
      "max_context_tokens": 131072,
      "context_length": 173146,
      "question_length": 63,
      "prediction": "Martha shows Sarah a large walk-in closet and tells her to share all of the clothes if they fit. They walk into a small bedroom where Martha explains that they sleep there. Martha also mentions that Sarah can take any bed that's open or sleep downstairs. Before leaving, Martha tells Sarah that they don't eat until the evening and that her body doesn't need much food. She also mentions that Sarah can take care of the babies if she wants to, but it's not required.",
      "reference": "She gives her a drugged drink.",
      "metrics": {
        "f1": 0.6511627906976745
      },
      "elapsed_time": 1.123199701309204,
      "timestamp": "2025-12-24T20:50:16.167602"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7454",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 79403,
      "natural_ratio": 0.6057968139648438,
      "max_context_tokens": 131072,
      "context_length": 158713,
      "question_length": 53,
      "prediction": "The Family",
      "reference": "The Family",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 2.6778604984283447,
      "timestamp": "2025-12-24T20:50:18.845810"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_3782",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 123111,
      "natural_ratio": 0.9392623901367188,
      "max_context_tokens": 131072,
      "context_length": 246144,
      "question_length": 39,
      "prediction": "Parnassus exchanges a fake tube for the real one that Tony has.",
      "reference": "A trance that powers the dream world",
      "metrics": {
        "f1": 0.6857142857142857
      },
      "elapsed_time": 0.35411596298217773,
      "timestamp": "2025-12-24T20:50:19.200302"
    },
    {
      "sample_id": "squad_56e78cf937bdd419002c4114",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 172,
      "natural_ratio": 0.001312255859375,
      "max_context_tokens": 131072,
      "context_length": 286,
      "question_length": 18,
      "prediction": "1963年",
      "reference": "1963",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07377934455871582,
      "timestamp": "2025-12-24T20:50:19.274613"
    },
    {
      "sample_id": "squad_57304d5c069b53140083202a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 329,
      "natural_ratio": 0.00251007080078125,
      "max_context_tokens": 131072,
      "context_length": 574,
      "question_length": 44,
      "prediction": "守门员",
      "reference": "goalkeeper",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.05507516860961914,
      "timestamp": "2025-12-24T20:50:19.329844"
    },
    {
      "sample_id": "squad_57316c1d05b4da19006bd127",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 365,
      "natural_ratio": 0.00278472900390625,
      "max_context_tokens": 131072,
      "context_length": 626,
      "question_length": 64,
      "prediction": "至少使用了100万块玻璃和石质拼砖。",
      "reference": "100 million",
      "metrics": {
        "f1": 0.15999999999999998
      },
      "elapsed_time": 0.17445802688598633,
      "timestamp": "2025-12-24T20:50:19.504466"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24108",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 15390,
      "natural_ratio": 0.1174163818359375,
      "max_context_tokens": 131072,
      "context_length": 30679,
      "question_length": 61,
      "prediction": "他们意识到彼此为了对方而牺牲了自己的珍贵物品，虽然感到难过，但也为对方的爱意感动。",
      "reference": "How priceless their love is.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.5384526252746582,
      "timestamp": "2025-12-24T20:50:20.043130"
    },
    {
      "sample_id": "squad_56d26abd59d6e41400145fa2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 487,
      "natural_ratio": 0.00371551513671875,
      "max_context_tokens": 131072,
      "context_length": 865,
      "question_length": 69,
      "prediction": "Abhidharma",
      "reference": "Abhidharma",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05574822425842285,
      "timestamp": "2025-12-24T20:50:20.099058"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25129",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 76986,
      "natural_ratio": 0.5873565673828125,
      "max_context_tokens": 131072,
      "context_length": 153878,
      "question_length": 54,
      "prediction": "Guido is initially shocked and then accepts it, as he believes it was necessary for his father's revenge.",
      "reference": "He is appalled",
      "metrics": {
        "f1": 0.5294117647058824
      },
      "elapsed_time": 2.72221040725708,
      "timestamp": "2025-12-24T20:50:22.821583"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23147",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 88177,
      "natural_ratio": 0.6727371215820312,
      "max_context_tokens": 131072,
      "context_length": 176258,
      "question_length": 56,
      "prediction": "Lindsay Lohan",
      "reference": "Audrina Patridge.",
      "metrics": {
        "f1": 0.45454545454545453
      },
      "elapsed_time": 2.3783020973205566,
      "timestamp": "2025-12-24T20:50:25.200293"
    },
    {
      "sample_id": "squad_57263f2889a1e219009ac5c2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 474,
      "natural_ratio": 0.0036163330078125,
      "max_context_tokens": 131072,
      "context_length": 836,
      "question_length": 73,
      "prediction": "均匀分布在整个灯泡的内表面。",
      "reference": "across the entire surface of the envelope",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.11982965469360352,
      "timestamp": "2025-12-24T20:50:25.320316"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2686",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96475,
      "natural_ratio": 0.7360458374023438,
      "max_context_tokens": 131072,
      "context_length": 192850,
      "question_length": 61,
      "prediction": "根据剧本描述，在那场比赛之后，罢工结束，所有球员都回到了正式的球队中。",
      "reference": "They went back to their regular lives.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.5993785858154297,
      "timestamp": "2025-12-24T20:50:28.920039"
    },
    {
      "sample_id": "squad_5726c4fbdd62a815002e8fe5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 660,
      "natural_ratio": 0.005035400390625,
      "max_context_tokens": 131072,
      "context_length": 1248,
      "question_length": 32,
      "prediction": "Paolo Carlini",
      "reference": "Paolo Carlini",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08420681953430176,
      "timestamp": "2025-12-24T20:50:29.004438"
    },
    {
      "sample_id": "squad_572a1dd76aef0514001552b2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 375,
      "natural_ratio": 0.00286102294921875,
      "max_context_tokens": 131072,
      "context_length": 672,
      "question_length": 38,
      "prediction": "互补色",
      "reference": "complementary",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04486083984375,
      "timestamp": "2025-12-24T20:50:29.049463"
    },
    {
      "sample_id": "squad_572ba77b34ae481900deaec8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 577,
      "natural_ratio": 0.00440216064453125,
      "max_context_tokens": 131072,
      "context_length": 1057,
      "question_length": 57,
      "prediction": "腐败是教育领域的一个重大障碍，阻碍了非洲实现全民基础教育。",
      "reference": "charitable funding",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.18457674980163574,
      "timestamp": "2025-12-24T20:50:29.234201"
    },
    {
      "sample_id": "squad_56e82e1900c9c71400d77600",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 538,
      "natural_ratio": 0.0041046142578125,
      "max_context_tokens": 131072,
      "context_length": 960,
      "question_length": 77,
      "prediction": "literate and upper class",
      "reference": "upper class",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.0750417709350586,
      "timestamp": "2025-12-24T20:50:29.309398"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20560",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122711,
      "natural_ratio": 0.9362106323242188,
      "max_context_tokens": 131072,
      "context_length": 245309,
      "question_length": 74,
      "prediction": "文中并没有提到General说Bart有什么优势，且文中的主角是Glass而不是Bart，因此无法从给定的故事中得出答案。请提供正确的信息或重新检查问题。",
      "reference": "His having been a soldier",
      "metrics": {
        "f1": 0.2285714285714286
      },
      "elapsed_time": 3.7931127548217773,
      "timestamp": "2025-12-24T20:50:33.102913"
    },
    {
      "sample_id": "squad_5730b2b08ab72b1400f9c6b7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 338,
      "natural_ratio": 0.0025787353515625,
      "max_context_tokens": 131072,
      "context_length": 579,
      "question_length": 58,
      "prediction": "Samoa",
      "reference": "Samoa",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05210256576538086,
      "timestamp": "2025-12-24T20:50:33.155531"
    },
    {
      "sample_id": "squad_570c3599ec8fbc190045bdb5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 515,
      "natural_ratio": 0.00392913818359375,
      "max_context_tokens": 131072,
      "context_length": 959,
      "question_length": 32,
      "prediction": "FBI agents",
      "reference": "FBI agents",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05712151527404785,
      "timestamp": "2025-12-24T20:50:33.212789"
    },
    {
      "sample_id": "squad_5706df8f90286e26004fc721",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 316,
      "natural_ratio": 0.002410888671875,
      "max_context_tokens": 131072,
      "context_length": 542,
      "question_length": 51,
      "prediction": "Ruth Hoskins taught the game of Monopoly to a group of local Quakers.",
      "reference": "a group of local Quakers",
      "metrics": {
        "f1": 0.5161290322580645
      },
      "elapsed_time": 0.20586156845092773,
      "timestamp": "2025-12-24T20:50:33.418796"
    },
    {
      "sample_id": "squad_57261c1f38643c19005ad013",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 411,
      "natural_ratio": 0.00313568115234375,
      "max_context_tokens": 131072,
      "context_length": 719,
      "question_length": 64,
      "prediction": "India",
      "reference": "India",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03490400314331055,
      "timestamp": "2025-12-24T20:50:33.453829"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20830",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 72358,
      "natural_ratio": 0.5520477294921875,
      "max_context_tokens": 131072,
      "context_length": 144654,
      "question_length": 22,
      "prediction": "Gideon的丈夫Harry去世了。",
      "reference": "Harry",
      "metrics": {
        "f1": 0.4347826086956522
      },
      "elapsed_time": 0.2162470817565918,
      "timestamp": "2025-12-24T20:50:33.670350"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23983",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 112621,
      "natural_ratio": 0.8592300415039062,
      "max_context_tokens": 131072,
      "context_length": 225158,
      "question_length": 44,
      "prediction": "Gregor leads Seamus and Deidre to a back entrance to the chateau.",
      "reference": "Post Office",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 3.8989334106445312,
      "timestamp": "2025-12-24T20:50:37.569669"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30850",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 51803,
      "natural_ratio": 0.39522552490234375,
      "max_context_tokens": 131072,
      "context_length": 103479,
      "question_length": 87,
      "prediction": "洞穴中",
      "reference": "A cave",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 1.6917691230773926,
      "timestamp": "2025-12-24T20:50:39.262117"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21688",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 50229,
      "natural_ratio": 0.38321685791015625,
      "max_context_tokens": 131072,
      "context_length": 100386,
      "question_length": 33,
      "prediction": "因为Pluto认出了老鼠味，误以为遇到了猎物。",
      "reference": "The narrator is drunk.",
      "metrics": {
        "f1": 0.17142857142857143
      },
      "elapsed_time": 1.4056265354156494,
      "timestamp": "2025-12-24T20:50:40.668134"
    },
    {
      "sample_id": "squad_57323fc60fdd8d15006c687d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 298,
      "natural_ratio": 0.0022735595703125,
      "max_context_tokens": 131072,
      "context_length": 508,
      "question_length": 49,
      "prediction": "11 mi (18 km)的地下隧道",
      "reference": "underground tunnels",
      "metrics": {
        "f1": 0.08333333333333334
      },
      "elapsed_time": 0.13720345497131348,
      "timestamp": "2025-12-24T20:50:40.805504"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21958",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 54139,
      "natural_ratio": 0.41304779052734375,
      "max_context_tokens": 131072,
      "context_length": 108198,
      "question_length": 41,
      "prediction": "Shakespeare",
      "reference": "Shakespeare.",
      "metrics": {
        "f1": 0.9333333333333333
      },
      "elapsed_time": 1.4755516052246094,
      "timestamp": "2025-12-24T20:50:42.281370"
    },
    {
      "sample_id": "squad_56e8311437bdd419002c4495",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 321,
      "natural_ratio": 0.00244903564453125,
      "max_context_tokens": 131072,
      "context_length": 558,
      "question_length": 45,
      "prediction": "Gallo-Romance",
      "reference": "Gallo-Romance",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07762646675109863,
      "timestamp": "2025-12-24T20:50:42.359184"
    },
    {
      "sample_id": "squad_5727d29d2ca10214002d9762",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 822,
      "natural_ratio": 0.0062713623046875,
      "max_context_tokens": 131072,
      "context_length": 1541,
      "question_length": 63,
      "prediction": "个体（Individuals）",
      "reference": "individuals",
      "metrics": {
        "f1": 0.846153846153846
      },
      "elapsed_time": 0.0835108757019043,
      "timestamp": "2025-12-24T20:50:42.442943"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31637",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 121345,
      "natural_ratio": 0.9257888793945312,
      "max_context_tokens": 131072,
      "context_length": 242606,
      "question_length": 45,
      "prediction": "Elise用相机拍了Josh的照片，揭示了他真实的样子，这激起了Josh的愤怒。",
      "reference": "She took a picture of him",
      "metrics": {
        "f1": 0.22727272727272724
      },
      "elapsed_time": 3.391883373260498,
      "timestamp": "2025-12-24T20:50:45.835227"
    },
    {
      "sample_id": "squad_57319559a5e9cc1400cdc109",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 477,
      "natural_ratio": 0.00363922119140625,
      "max_context_tokens": 131072,
      "context_length": 846,
      "question_length": 68,
      "prediction": "Caltech",
      "reference": "Caltech",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.050898075103759766,
      "timestamp": "2025-12-24T20:50:45.886707"
    },
    {
      "sample_id": "squad_57260a8fec44d21400f3d82d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 572,
      "natural_ratio": 0.004364013671875,
      "max_context_tokens": 131072,
      "context_length": 1058,
      "question_length": 46,
      "prediction": "美国",
      "reference": "U.S. forces",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04092264175415039,
      "timestamp": "2025-12-24T20:50:45.927831"
    },
    {
      "sample_id": "squad_572823e42ca10214002d9ebd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 429,
      "natural_ratio": 0.00327301025390625,
      "max_context_tokens": 131072,
      "context_length": 782,
      "question_length": 37,
      "prediction": "1961年",
      "reference": "1961",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07873964309692383,
      "timestamp": "2025-12-24T20:50:46.006806"
    },
    {
      "sample_id": "squad_57269f5add62a815002e8b46",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 352,
      "natural_ratio": 0.002685546875,
      "max_context_tokens": 131072,
      "context_length": 578,
      "question_length": 87,
      "prediction": "Brick Gothic",
      "reference": "\"Backsteingotik\"",
      "metrics": {
        "f1": 0.6363636363636365
      },
      "elapsed_time": 0.05557394027709961,
      "timestamp": "2025-12-24T20:50:46.062540"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23225",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 70149,
      "natural_ratio": 0.5351943969726562,
      "max_context_tokens": 131072,
      "context_length": 140206,
      "question_length": 52,
      "prediction": "Spencer did not choose a bridegroom for Gloriana in the text provided. The passage focuses on stories featuring characters like Una, St. George, and others, but does not mention Spencer choosing a groom for Gloriana.",
      "reference": "Arther",
      "metrics": {
        "f1": 0.3448275862068966
      },
      "elapsed_time": 2.4589040279388428,
      "timestamp": "2025-12-24T20:50:48.521789"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29492",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 53665,
      "natural_ratio": 0.40943145751953125,
      "max_context_tokens": 131072,
      "context_length": 107242,
      "question_length": 49,
      "prediction": "Sol已经去世，并且Max需要给Rav Cohen提供那个数字。",
      "reference": "Sol died of a stroke",
      "metrics": {
        "f1": 0.33333333333333337
      },
      "elapsed_time": 2.2490408420562744,
      "timestamp": "2025-12-24T20:50:50.771160"
    },
    {
      "sample_id": "squad_56e3c1098c00841900fbaee7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 263,
      "natural_ratio": 0.00200653076171875,
      "max_context_tokens": 131072,
      "context_length": 431,
      "question_length": 55,
      "prediction": "1637",
      "reference": "1637",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06676673889160156,
      "timestamp": "2025-12-24T20:50:50.838111"
    },
    {
      "sample_id": "squad_571129e6b654c5140001fbfd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 845,
      "natural_ratio": 0.00644683837890625,
      "max_context_tokens": 131072,
      "context_length": 1572,
      "question_length": 79,
      "prediction": "导致接触针弯曲，并且使卡带的ROM板向卡带内部施压。",
      "reference": "bent the contact pins slightly",
      "metrics": {
        "f1": 0.05128205128205127
      },
      "elapsed_time": 0.22012066841125488,
      "timestamp": "2025-12-24T20:50:51.058382"
    },
    {
      "sample_id": "squad_57336f6e4776f41900660acd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 592,
      "natural_ratio": 0.0045166015625,
      "max_context_tokens": 131072,
      "context_length": 1084,
      "question_length": 61,
      "prediction": "Over 100",
      "reference": "Over 100",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07868432998657227,
      "timestamp": "2025-12-24T20:50:51.137192"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16199",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96821,
      "natural_ratio": 0.7386856079101562,
      "max_context_tokens": 131072,
      "context_length": 193565,
      "question_length": 37,
      "prediction": "五万骑兵军队被摧毁。",
      "reference": "Turanian.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.248096466064453,
      "timestamp": "2025-12-24T20:50:54.385623"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8671",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93907,
      "natural_ratio": 0.7164535522460938,
      "max_context_tokens": 131072,
      "context_length": 187723,
      "question_length": 51,
      "prediction": "Count Octave continued to live with remorse and eventually died in Italy, according to the narrative.",
      "reference": "He aged rapidly and died shortly after. ",
      "metrics": {
        "f1": 0.7894736842105262
      },
      "elapsed_time": 3.1824872493743896,
      "timestamp": "2025-12-24T20:50:57.568483"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_12969",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 82375,
      "natural_ratio": 0.6284713745117188,
      "max_context_tokens": 131072,
      "context_length": 164642,
      "question_length": 68,
      "prediction": "Anya踢了Rasputin的脸，导致他手中的 reliquary 落地并破碎，从而释放了其中的邪恶力量。",
      "reference": "She smashes Rasputin's reliquary.",
      "metrics": {
        "f1": 0.4727272727272727
      },
      "elapsed_time": 3.1095550060272217,
      "timestamp": "2025-12-24T20:51:00.678389"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_27176",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 124084,
      "natural_ratio": 0.946685791015625,
      "max_context_tokens": 131072,
      "context_length": 248076,
      "question_length": 52,
      "prediction": "他们都不喜欢命运、灵魂伴侣和真爱这些概念。",
      "reference": "Parking lots",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.201406955718994,
      "timestamp": "2025-12-24T20:51:04.880140"
    },
    {
      "sample_id": "squad_5730175f04bcaa1900d7716d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 449,
      "natural_ratio": 0.00342559814453125,
      "max_context_tokens": 131072,
      "context_length": 794,
      "question_length": 64,
      "prediction": "11,000-16,000",
      "reference": "11,000–16,000",
      "metrics": {
        "f1": 0.8000000000000002
      },
      "elapsed_time": 0.1631453037261963,
      "timestamp": "2025-12-24T20:51:05.043871"
    },
    {
      "sample_id": "squad_572eb28c03f9891900756979",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 374,
      "natural_ratio": 0.0028533935546875,
      "max_context_tokens": 131072,
      "context_length": 639,
      "question_length": 70,
      "prediction": "土地购买、拨款发放、恢复工作、保护区建立",
      "reference": "land purchases, grant making, restoration, and establishment of reserves",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.14650893211364746,
      "timestamp": "2025-12-24T20:51:05.190631"
    },
    {
      "sample_id": "squad_56dd34f19a695914005b955c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 271,
      "natural_ratio": 0.00206756591796875,
      "max_context_tokens": 131072,
      "context_length": 390,
      "question_length": 112,
      "prediction": "总理",
      "reference": "prime minister",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.031078577041625977,
      "timestamp": "2025-12-24T20:51:05.221859"
    },
    {
      "sample_id": "squad_57341a094776f41900661876",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 364,
      "natural_ratio": 0.002777099609375,
      "max_context_tokens": 131072,
      "context_length": 582,
      "question_length": 106,
      "prediction": "8.3%",
      "reference": "8.3%",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06412386894226074,
      "timestamp": "2025-12-24T20:51:05.286116"
    },
    {
      "sample_id": "squad_57313c1205b4da19006bcf06",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 403,
      "natural_ratio": 0.00307464599609375,
      "max_context_tokens": 131072,
      "context_length": 709,
      "question_length": 58,
      "prediction": "4世纪BC",
      "reference": "the 4th-century BC",
      "metrics": {
        "f1": 0.35294117647058826
      },
      "elapsed_time": 0.05450582504272461,
      "timestamp": "2025-12-24T20:51:05.340758"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7314",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 79161,
      "natural_ratio": 0.6039505004882812,
      "max_context_tokens": 131072,
      "context_length": 158213,
      "question_length": 69,
      "prediction": "乔治的妻子玛尔塔与女儿克里斯汀关系恶化，克里斯汀恨乔治，觉得他背叛了她。",
      "reference": "takes custody of her",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.3921346664428711,
      "timestamp": "2025-12-24T20:51:05.733284"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11164",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105878,
      "natural_ratio": 0.8077850341796875,
      "max_context_tokens": 131072,
      "context_length": 211676,
      "question_length": 41,
      "prediction": "他们没有在旧磨坊拍摄，而是改用了其他元素，如消防站和老消防车。",
      "reference": "Waterford Vermont",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.7985785007476807,
      "timestamp": "2025-12-24T20:51:09.532249"
    },
    {
      "sample_id": "squad_57319d4eb9d445190005e40b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 448,
      "natural_ratio": 0.00341796875,
      "max_context_tokens": 131072,
      "context_length": 789,
      "question_length": 67,
      "prediction": "The European University Institute focuses on the social sciences.",
      "reference": "social sciences",
      "metrics": {
        "f1": 0.375
      },
      "elapsed_time": 0.13007903099060059,
      "timestamp": "2025-12-24T20:51:09.662886"
    },
    {
      "sample_id": "squad_5731d461e17f3d1400422469",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 389,
      "natural_ratio": 0.00296783447265625,
      "max_context_tokens": 131072,
      "context_length": 678,
      "question_length": 60,
      "prediction": "Social Gospel Movement",
      "reference": "the Social Gospel Movement",
      "metrics": {
        "f1": 0.9655172413793104
      },
      "elapsed_time": 0.05558919906616211,
      "timestamp": "2025-12-24T20:51:09.718648"
    },
    {
      "sample_id": "squad_57343074d058e614000b6acc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 363,
      "natural_ratio": 0.00276947021484375,
      "max_context_tokens": 131072,
      "context_length": 615,
      "question_length": 71,
      "prediction": "Infectious Disease",
      "reference": "field of Infectious Disease",
      "metrics": {
        "f1": 0.9600000000000001
      },
      "elapsed_time": 0.06361579895019531,
      "timestamp": "2025-12-24T20:51:09.782417"
    },
    {
      "sample_id": "squad_5730a3b12461fd1900a9cf23",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 452,
      "natural_ratio": 0.003448486328125,
      "max_context_tokens": 131072,
      "context_length": 808,
      "question_length": 57,
      "prediction": "totally unintelligible",
      "reference": "unintelligible",
      "metrics": {
        "f1": 0.7777777777777778
      },
      "elapsed_time": 0.07420587539672852,
      "timestamp": "2025-12-24T20:51:09.856752"
    },
    {
      "sample_id": "squad_56df4aa88bc80c19004e4a1c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 343,
      "natural_ratio": 0.00261688232421875,
      "max_context_tokens": 131072,
      "context_length": 607,
      "question_length": 40,
      "prediction": "2000年",
      "reference": "2000",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.0740199089050293,
      "timestamp": "2025-12-24T20:51:09.930910"
    },
    {
      "sample_id": "squad_57324734e99e3014001e662f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 745,
      "natural_ratio": 0.00568389892578125,
      "max_context_tokens": 131072,
      "context_length": 1402,
      "question_length": 49,
      "prediction": "Robert Michel作为少数党领袖时，通常充当共和党总统的“发言人”。",
      "reference": "point man\" for Republican presidents",
      "metrics": {
        "f1": 0.3773584905660377
      },
      "elapsed_time": 0.20119214057922363,
      "timestamp": "2025-12-24T20:51:10.132257"
    },
    {
      "sample_id": "squad_572684165951b619008f7532",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 342,
      "natural_ratio": 0.0026092529296875,
      "max_context_tokens": 131072,
      "context_length": 581,
      "question_length": 63,
      "prediction": "Plant-derived pesticides（植物源性杀虫剂）",
      "reference": "Plant-derived pesticides, or \"botanicals",
      "metrics": {
        "f1": 0.6829268292682927
      },
      "elapsed_time": 0.134749174118042,
      "timestamp": "2025-12-24T20:51:10.267154"
    },
    {
      "sample_id": "squad_572756af708984140094dc67",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 349,
      "natural_ratio": 0.00266265869140625,
      "max_context_tokens": 131072,
      "context_length": 589,
      "question_length": 69,
      "prediction": "boll weevil",
      "reference": "boll weevil",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06379914283752441,
      "timestamp": "2025-12-24T20:51:10.331089"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7010",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 25796,
      "natural_ratio": 0.196807861328125,
      "max_context_tokens": 131072,
      "context_length": 51524,
      "question_length": 28,
      "prediction": "Claudius的死亡",
      "reference": "Claudius",
      "metrics": {
        "f1": 0.8421052631578948
      },
      "elapsed_time": 0.6276087760925293,
      "timestamp": "2025-12-24T20:51:10.958919"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15906",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 54109,
      "natural_ratio": 0.41281890869140625,
      "max_context_tokens": 131072,
      "context_length": 108147,
      "question_length": 32,
      "prediction": "早上，船长独自一人在甲板上，感到与船的陌生和孤独，同时也在思考即将到来的漫长旅程。他注意到一些细微的变化，比如船上的静谧和周围环境的宁静，这让他意识到自己肩负的重大责任。他还回忆起前一晚发生的事件，并开始考虑如何处理船上的情况。",
      "reference": "The other captain comes looking for him and the narrator is able to lie.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.0770556926727295,
      "timestamp": "2025-12-24T20:51:13.036279"
    },
    {
      "sample_id": "squad_57303e52b2c2fd1400568ad6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 450,
      "natural_ratio": 0.0034332275390625,
      "max_context_tokens": 131072,
      "context_length": 812,
      "question_length": 49,
      "prediction": "英国、爱尔兰、以及英伦诸岛的所有州和领土。",
      "reference": "all of the states and territories of the British Isles",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1673896312713623,
      "timestamp": "2025-12-24T20:51:13.203874"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25268",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 110071,
      "natural_ratio": 0.8397750854492188,
      "max_context_tokens": 131072,
      "context_length": 220056,
      "question_length": 46,
      "prediction": "vault",
      "reference": "The vault.",
      "metrics": {
        "f1": 0.7142857142857143
      },
      "elapsed_time": 4.104003429412842,
      "timestamp": "2025-12-24T20:51:17.308210"
    },
    {
      "sample_id": "squad_57341210d058e614000b68e5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 348,
      "natural_ratio": 0.002655029296875,
      "max_context_tokens": 131072,
      "context_length": 609,
      "question_length": 48,
      "prediction": "超过一个世纪",
      "reference": "over a century",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06020784378051758,
      "timestamp": "2025-12-24T20:51:17.368933"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4147",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 23122,
      "natural_ratio": 0.1764068603515625,
      "max_context_tokens": 131072,
      "context_length": 46159,
      "question_length": 46,
      "prediction": "文本中并未提到三名宇航员的飞船信息，无法作答。",
      "reference": "Sunbird",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.6557512283325195,
      "timestamp": "2025-12-24T20:51:18.024968"
    },
    {
      "sample_id": "squad_570cda09b3d812140066d2c6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 420,
      "natural_ratio": 0.003204345703125,
      "max_context_tokens": 131072,
      "context_length": 725,
      "question_length": 75,
      "prediction": "2世纪",
      "reference": "2nd",
      "metrics": {
        "f1": 0.3333333333333333
      },
      "elapsed_time": 0.046966552734375,
      "timestamp": "2025-12-24T20:51:18.072118"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29215",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105289,
      "natural_ratio": 0.8032913208007812,
      "max_context_tokens": 131072,
      "context_length": 210488,
      "question_length": 51,
      "prediction": "因为Vitti的精神状况需要专业的治疗和监督，而Ben Sobel被指定为他的监护人。",
      "reference": "For further therapy.",
      "metrics": {
        "f1": 0.1702127659574468
      },
      "elapsed_time": 3.689772367477417,
      "timestamp": "2025-12-24T20:51:21.762277"
    },
    {
      "sample_id": "squad_5732554de17f3d1400422897",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 326,
      "natural_ratio": 0.0024871826171875,
      "max_context_tokens": 131072,
      "context_length": 577,
      "question_length": 35,
      "prediction": "The Bronx's highest point is in the northwest corner, west of Van Cortlandt Park and in the Chapel Farm area near the Riverdale Country School.",
      "reference": "in the northwest corner, west of Van Cortlandt Park and in the Chapel Farm area near the Riverdale Country School",
      "metrics": {
        "f1": 0.8828125
      },
      "elapsed_time": 0.34400343894958496,
      "timestamp": "2025-12-24T20:51:22.106516"
    },
    {
      "sample_id": "squad_5729296d6aef051400154afc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 399,
      "natural_ratio": 0.00304412841796875,
      "max_context_tokens": 131072,
      "context_length": 685,
      "question_length": 73,
      "prediction": "智能测试场景",
      "reference": "intelligent test scenarios",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.056058645248413086,
      "timestamp": "2025-12-24T20:51:22.162765"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13087",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 16679,
      "natural_ratio": 0.12725067138671875,
      "max_context_tokens": 131072,
      "context_length": 33278,
      "question_length": 41,
      "prediction": "Thus, with a singular shriek, the last of the lights perished, and\nthe darkness, with an evil enjoyment, closed over their eyes.",
      "reference": "That the red Death keep everyone in fear of drying.",
      "metrics": {
        "f1": 0.7906976744186047
      },
      "elapsed_time": 0.35608792304992676,
      "timestamp": "2025-12-24T20:51:22.519110"
    },
    {
      "sample_id": "squad_56e0a80a7aa994140058e695",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 579,
      "natural_ratio": 0.00441741943359375,
      "max_context_tokens": 131072,
      "context_length": 1028,
      "question_length": 90,
      "prediction": "Rossiyskaya Gazeta",
      "reference": "Rossiyskaya Gazeta",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0997629165649414,
      "timestamp": "2025-12-24T20:51:22.619041"
    },
    {
      "sample_id": "squad_5731dd950fdd8d15006c65b1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 332,
      "natural_ratio": 0.002532958984375,
      "max_context_tokens": 131072,
      "context_length": 577,
      "question_length": 47,
      "prediction": "六所",
      "reference": "six",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04351305961608887,
      "timestamp": "2025-12-24T20:51:22.662687"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7585",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 81799,
      "natural_ratio": 0.6240768432617188,
      "max_context_tokens": 131072,
      "context_length": 163541,
      "question_length": 18,
      "prediction": "被Frank用刀刺伤侧腹后死亡。",
      "reference": "Frank accidentally stabbed her.",
      "metrics": {
        "f1": 0.30303030303030304
      },
      "elapsed_time": 2.5395758152008057,
      "timestamp": "2025-12-24T20:51:25.202606"
    },
    {
      "sample_id": "squad_572b4a57be1ee31400cb830b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 505,
      "natural_ratio": 0.00385284423828125,
      "max_context_tokens": 131072,
      "context_length": 933,
      "question_length": 37,
      "prediction": "葡萄牙人",
      "reference": "Portuguese",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.050108909606933594,
      "timestamp": "2025-12-24T20:51:25.252885"
    },
    {
      "sample_id": "squad_57064fb952bb8914006899d8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 450,
      "natural_ratio": 0.0034332275390625,
      "max_context_tokens": 131072,
      "context_length": 834,
      "question_length": 27,
      "prediction": "Qur'an permitted sexual relations between a male master and his female slave outside of marriage.",
      "reference": "sexual relations between a male master and his female slave",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.20503807067871094,
      "timestamp": "2025-12-24T20:51:25.458085"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26928",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 67426,
      "natural_ratio": 0.5144195556640625,
      "max_context_tokens": 131072,
      "context_length": 134786,
      "question_length": 27,
      "prediction": "因为贝基在美少女大赛中获胜，而 Amber 认为她应该赢。",
      "reference": "Her dance costume is missing",
      "metrics": {
        "f1": 0.23809523809523808
      },
      "elapsed_time": 3.1678123474121094,
      "timestamp": "2025-12-24T20:51:28.626268"
    },
    {
      "sample_id": "squad_5727a5f04b864d190016395d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 509,
      "natural_ratio": 0.00388336181640625,
      "max_context_tokens": 131072,
      "context_length": 906,
      "question_length": 72,
      "prediction": "New York City",
      "reference": "New York City",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06368017196655273,
      "timestamp": "2025-12-24T20:51:28.690116"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30836",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 51780,
      "natural_ratio": 0.395050048828125,
      "max_context_tokens": 131072,
      "context_length": 103479,
      "question_length": 42,
      "prediction": "Aeneas was heading to Italy before the storm.",
      "reference": "Italy",
      "metrics": {
        "f1": 0.2222222222222222
      },
      "elapsed_time": 0.2108299732208252,
      "timestamp": "2025-12-24T20:51:28.901207"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1016",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 102138,
      "natural_ratio": 0.7792510986328125,
      "max_context_tokens": 131072,
      "context_length": 204189,
      "question_length": 47,
      "prediction": "Landy",
      "reference": "Paz",
      "metrics": {
        "f1": 0.25
      },
      "elapsed_time": 3.1755006313323975,
      "timestamp": "2025-12-24T20:51:32.077016"
    },
    {
      "sample_id": "squad_573113b905b4da19006bcd4e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 433,
      "natural_ratio": 0.00330352783203125,
      "max_context_tokens": 131072,
      "context_length": 785,
      "question_length": 42,
      "prediction": "late Jurassic period",
      "reference": "late Jurassic period",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.059835195541381836,
      "timestamp": "2025-12-24T20:51:32.137004"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16696",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 106078,
      "natural_ratio": 0.8093109130859375,
      "max_context_tokens": 131072,
      "context_length": 212070,
      "question_length": 47,
      "prediction": "Papa Midnite tells Constantine that Mammon, the son of Satan, is using the Spear of Destiny to cross over to Earth and create his own Hell. Constantine learns that Mammon is helping Gabriel, and that Gabriel has pierced dead air with the Spear, indicating that Angela has vanished. Constantine also learns that he needs to save Isabel and that Gabriel is still a threat.",
      "reference": "That all of Hell is waiting for Constantine to pass away.",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 4.227287769317627,
      "timestamp": "2025-12-24T20:51:36.364624"
    },
    {
      "sample_id": "squad_570aa9676d058f1900183052",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 402,
      "natural_ratio": 0.0030670166015625,
      "max_context_tokens": 131072,
      "context_length": 701,
      "question_length": 64,
      "prediction": "总统",
      "reference": "president",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.03881382942199707,
      "timestamp": "2025-12-24T20:51:36.403955"
    },
    {
      "sample_id": "squad_57260ae238643c19005acf7a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 647,
      "natural_ratio": 0.00493621826171875,
      "max_context_tokens": 131072,
      "context_length": 1177,
      "question_length": 77,
      "prediction": "照明的运营成本计算时需要考虑的因素包括：照明需求、灯泡的成本及其更换的人工成本（考虑到有效的灯泡寿命）、用于照明的电力成本、灯泡操作对供暖和空调系统的影响。",
      "reference": "illumination requirements, cost of the lamp and labor cost to replace lamps (taking into account effective lamp lifetime), cost of electricity used",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.47531914710998535,
      "timestamp": "2025-12-24T20:51:36.879480"
    },
    {
      "sample_id": "squad_5727db044b864d1900163ea7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 359,
      "natural_ratio": 0.00273895263671875,
      "max_context_tokens": 131072,
      "context_length": 642,
      "question_length": 36,
      "prediction": "Tom Nettles是南方 Baptist 神经学学院的历史神学教授。",
      "reference": "historical theology",
      "metrics": {
        "f1": 0.4210526315789474
      },
      "elapsed_time": 0.20324921607971191,
      "timestamp": "2025-12-24T20:51:37.082889"
    },
    {
      "sample_id": "squad_57287a762ca10214002da3b4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 341,
      "natural_ratio": 0.00260162353515625,
      "max_context_tokens": 131072,
      "context_length": 551,
      "question_length": 92,
      "prediction": "八个",
      "reference": "Eight",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.043744802474975586,
      "timestamp": "2025-12-24T20:51:37.126776"
    },
    {
      "sample_id": "squad_572b7afb34ae481900deae41",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 461,
      "natural_ratio": 0.00351715087890625,
      "max_context_tokens": 131072,
      "context_length": 816,
      "question_length": 66,
      "prediction": "理性思想",
      "reference": "rational thought",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04294109344482422,
      "timestamp": "2025-12-24T20:51:37.169849"
    },
    {
      "sample_id": "squad_57302da5b2c2fd1400568a0d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 384,
      "natural_ratio": 0.0029296875,
      "max_context_tokens": 131072,
      "context_length": 667,
      "question_length": 62,
      "prediction": "未具体提及在战斗中加入轻步兵和骑兵的其他类型 infantry，根据描述，主要提到的是轻步兵由最年轻和社会地位较低的人组成。",
      "reference": "heavy infantry",
      "metrics": {
        "f1": 0.25806451612903225
      },
      "elapsed_time": 0.3719320297241211,
      "timestamp": "2025-12-24T20:51:37.541924"
    },
    {
      "sample_id": "squad_572e81e0dfa6aa1500f8d05d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 516,
      "natural_ratio": 0.003936767578125,
      "max_context_tokens": 131072,
      "context_length": 918,
      "question_length": 74,
      "prediction": "Gaddafi鼓励其他贝都因儿童为自己的身份感到骄傲。",
      "reference": "Bullied for being a Bedouin, he was proud of his identity and encouraged pride in other Bedouin children.",
      "metrics": {
        "f1": 0.21276595744680854
      },
      "elapsed_time": 0.16398286819458008,
      "timestamp": "2025-12-24T20:51:37.706047"
    },
    {
      "sample_id": "squad_572fb38ea23a5019007fc8cb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 492,
      "natural_ratio": 0.003753662109375,
      "max_context_tokens": 131072,
      "context_length": 907,
      "question_length": 37,
      "prediction": "卫星接收器，尤其是工作在约12 GHz频率的接收器。",
      "reference": "satellite receivers",
      "metrics": {
        "f1": 0.06060606060606061
      },
      "elapsed_time": 0.1906425952911377,
      "timestamp": "2025-12-24T20:51:37.896835"
    },
    {
      "sample_id": "squad_5735c081e853931400426b43",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 413,
      "natural_ratio": 0.00315093994140625,
      "max_context_tokens": 131072,
      "context_length": 754,
      "question_length": 32,
      "prediction": "\"the stone house of arms and ammunition\"",
      "reference": "the stone house of arms and ammunition",
      "metrics": {
        "f1": 0.9743589743589743
      },
      "elapsed_time": 0.11436891555786133,
      "timestamp": "2025-12-24T20:51:38.011351"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20359",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115668,
      "natural_ratio": 0.882476806640625,
      "max_context_tokens": 131072,
      "context_length": 231257,
      "question_length": 40,
      "prediction": "Alex最终杀死了试图占有项链的Victor，并且Jason在与鲨鱼搏斗后也受伤了，但项链被留在了船上。",
      "reference": "Alex realizes that it could be used against him as evidence and he throws it in the ocean",
      "metrics": {
        "f1": 0.3692307692307692
      },
      "elapsed_time": 4.2111287117004395,
      "timestamp": "2025-12-24T20:51:42.222888"
    },
    {
      "sample_id": "squad_5727de03ff5b5019007d9717",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 294,
      "natural_ratio": 0.0022430419921875,
      "max_context_tokens": 131072,
      "context_length": 504,
      "question_length": 45,
      "prediction": "因奴隶制和宣教问题。",
      "reference": "over slavery and missions",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1057591438293457,
      "timestamp": "2025-12-24T20:51:42.329163"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2168",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 103316,
      "natural_ratio": 0.788238525390625,
      "max_context_tokens": 131072,
      "context_length": 206512,
      "question_length": 80,
      "prediction": " trumpet",
      "reference": "Trumpet",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.0804507732391357,
      "timestamp": "2025-12-24T20:51:45.409956"
    },
    {
      "sample_id": "squad_572e81e0dfa6aa1500f8d05e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 526,
      "natural_ratio": 0.0040130615234375,
      "max_context_tokens": 131072,
      "context_length": 918,
      "question_length": 95,
      "prediction": "在萨哈的中学期间，卡扎菲很受欢迎；一些在那里结交的朋友后来在他的政府中担任重要职务，最著名的是他最好的朋友阿卜杜勒·萨拉姆·贾勒杜德。",
      "reference": "Gaddafi was popular at school; some friends made there received significant jobs in his later administration, most notably his best friend Abdul Salam Jalloud.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.4864227771759033,
      "timestamp": "2025-12-24T20:51:45.896614"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25018",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 81139,
      "natural_ratio": 0.6190414428710938,
      "max_context_tokens": 131072,
      "context_length": 162192,
      "question_length": 46,
      "prediction": "拉丁语",
      "reference": "Spanish.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.474669933319092,
      "timestamp": "2025-12-24T20:51:49.371648"
    },
    {
      "sample_id": "squad_57303a4904bcaa1900d773d8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 314,
      "natural_ratio": 0.0023956298828125,
      "max_context_tokens": 131072,
      "context_length": 533,
      "question_length": 56,
      "prediction": "2005",
      "reference": "2005",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07098388671875,
      "timestamp": "2025-12-24T20:51:49.442852"
    },
    {
      "sample_id": "squad_5733af354776f4190066101e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 491,
      "natural_ratio": 0.00374603271484375,
      "max_context_tokens": 131072,
      "context_length": 879,
      "question_length": 63,
      "prediction": "humanities, social, and natural sciences",
      "reference": "humanities, social, and natural sciences",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.1065976619720459,
      "timestamp": "2025-12-24T20:51:49.549660"
    },
    {
      "sample_id": "squad_57061f8275f01819005e79ba",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 450,
      "natural_ratio": 0.0034332275390625,
      "max_context_tokens": 131072,
      "context_length": 810,
      "question_length": 51,
      "prediction": "1993年",
      "reference": "1993",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07600045204162598,
      "timestamp": "2025-12-24T20:51:49.625812"
    },
    {
      "sample_id": "squad_57315bfaa5e9cc1400cdbf01",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 508,
      "natural_ratio": 0.003875732421875,
      "max_context_tokens": 131072,
      "context_length": 902,
      "question_length": 74,
      "prediction": "法国大革命",
      "reference": "Bastille",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.05523276329040527,
      "timestamp": "2025-12-24T20:51:49.681183"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24781",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 15313,
      "natural_ratio": 0.11682891845703125,
      "max_context_tokens": 131072,
      "context_length": 30565,
      "question_length": 22,
      "prediction": "Kate is portrayed as waiting for her husband Tam, showing concern and worry as the story progresses.",
      "reference": "As someone to be feared.",
      "metrics": {
        "f1": 0.7222222222222222
      },
      "elapsed_time": 0.5106298923492432,
      "timestamp": "2025-12-24T20:51:50.192071"
    },
    {
      "sample_id": "squad_5726a6b4f1498d1400e8e623",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 354,
      "natural_ratio": 0.0027008056640625,
      "max_context_tokens": 131072,
      "context_length": 630,
      "question_length": 38,
      "prediction": "General Saw Maung",
      "reference": "General Saw Maung",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06647872924804688,
      "timestamp": "2025-12-24T20:51:50.258745"
    },
    {
      "sample_id": "squad_56de33fc4396321400ee2696",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 252,
      "natural_ratio": 0.001922607421875,
      "max_context_tokens": 131072,
      "context_length": 379,
      "question_length": 86,
      "prediction": "Montesquieu",
      "reference": "Montesquieu",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.060485124588012695,
      "timestamp": "2025-12-24T20:51:50.319364"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15159",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 51379,
      "natural_ratio": 0.39199066162109375,
      "max_context_tokens": 131072,
      "context_length": 102680,
      "question_length": 38,
      "prediction": "Lord Dorimont的仆人不小心拿走了那份未发表的帕拉代作品。",
      "reference": "Lady Augusta",
      "metrics": {
        "f1": 0.21052631578947367
      },
      "elapsed_time": 1.6050212383270264,
      "timestamp": "2025-12-24T20:51:51.924669"
    },
    {
      "sample_id": "squad_5732a702328d981900601fe0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 454,
      "natural_ratio": 0.0034637451171875,
      "max_context_tokens": 131072,
      "context_length": 793,
      "question_length": 76,
      "prediction": "Antarctica",
      "reference": "Antarctica",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05502605438232422,
      "timestamp": "2025-12-24T20:51:51.979833"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29144",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 74721,
      "natural_ratio": 0.5700759887695312,
      "max_context_tokens": 131072,
      "context_length": 149353,
      "question_length": 49,
      "prediction": "因为Dinant被Lamira拒绝了，而且她还嫁给了别人，这让他感到非常沮丧和愤怒。",
      "reference": "The woman he loves is marrying a rich, old man.",
      "metrics": {
        "f1": 0.2857142857142857
      },
      "elapsed_time": 2.8343427181243896,
      "timestamp": "2025-12-24T20:51:54.814566"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29649",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105859,
      "natural_ratio": 0.8076400756835938,
      "max_context_tokens": 131072,
      "context_length": 211651,
      "question_length": 27,
      "prediction": "Beowulf",
      "reference": "Beowulf. ",
      "metrics": {
        "f1": 0.9333333333333333
      },
      "elapsed_time": 3.878094434738159,
      "timestamp": "2025-12-24T20:51:58.693060"
    },
    {
      "sample_id": "squad_5725fbcbec44d21400f3d7c7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 740,
      "natural_ratio": 0.005645751953125,
      "max_context_tokens": 131072,
      "context_length": 1384,
      "question_length": 57,
      "prediction": "146 BC",
      "reference": "146 BC",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07603812217712402,
      "timestamp": "2025-12-24T20:51:58.769618"
    },
    {
      "sample_id": "squad_56e7287e37bdd419002c3dd3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 684,
      "natural_ratio": 0.005218505859375,
      "max_context_tokens": 131072,
      "context_length": 1217,
      "question_length": 111,
      "prediction": "1895",
      "reference": "1895",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06927275657653809,
      "timestamp": "2025-12-24T20:51:58.839023"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23123",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 88167,
      "natural_ratio": 0.6726608276367188,
      "max_context_tokens": 131072,
      "context_length": 176258,
      "question_length": 37,
      "prediction": "在Marc搬到Indian Hills学校后，他遇到了Rebecca Ahn。具体来说，是在Marc转学到Indian Hills高中后，通过朋友介绍认识的Rebecca。",
      "reference": "Indian Hills High School",
      "metrics": {
        "f1": 0.34615384615384615
      },
      "elapsed_time": 0.4885704517364502,
      "timestamp": "2025-12-24T20:51:59.327938"
    },
    {
      "sample_id": "squad_56e1963acd28a01900c679e9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 376,
      "natural_ratio": 0.00286865234375,
      "max_context_tokens": 131072,
      "context_length": 664,
      "question_length": 49,
      "prediction": "在激发态存在",
      "reference": "excited form",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06616377830505371,
      "timestamp": "2025-12-24T20:51:59.394239"
    },
    {
      "sample_id": "squad_57307b79069b531400832116",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 370,
      "natural_ratio": 0.0028228759765625,
      "max_context_tokens": 131072,
      "context_length": 627,
      "question_length": 74,
      "prediction": "他们通过在国外建立联系、募集资金和提高 Awareness，以及利用现有的商业网络进行贸易，为希腊的建立做出了贡献。",
      "reference": "important in establishing the fledgling state, raising funds and awareness abroad",
      "metrics": {
        "f1": 0.1971830985915493
      },
      "elapsed_time": 0.28453850746154785,
      "timestamp": "2025-12-24T20:51:59.678925"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11181",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105883,
      "natural_ratio": 0.8078231811523438,
      "max_context_tokens": 131072,
      "context_length": 211676,
      "question_length": 50,
      "prediction": "因为她认为那是对她的不尊重，她不想裸体出演。",
      "reference": "She wanted to get paid an extra $800,000",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.2921414375305176,
      "timestamp": "2025-12-24T20:51:59.971326"
    },
    {
      "sample_id": "squad_5732b191d6dcfa19001e8a7e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 533,
      "natural_ratio": 0.00406646728515625,
      "max_context_tokens": 131072,
      "context_length": 881,
      "question_length": 146,
      "prediction": "September 10, 2003",
      "reference": "September 10, 2003",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.12708187103271484,
      "timestamp": "2025-12-24T20:52:00.098907"
    },
    {
      "sample_id": "squad_56f799d2a6d7ea1400e17260",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 380,
      "natural_ratio": 0.002899169921875,
      "max_context_tokens": 131072,
      "context_length": 687,
      "question_length": 33,
      "prediction": "Glencoe 和 Sea Nymph",
      "reference": "Glencoe and Sea Nymph",
      "metrics": {
        "f1": 0.9285714285714286
      },
      "elapsed_time": 0.09398007392883301,
      "timestamp": "2025-12-24T20:52:00.193025"
    },
    {
      "sample_id": "squad_56e3c47839bdeb14003478fa",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 272,
      "natural_ratio": 0.0020751953125,
      "max_context_tokens": 131072,
      "context_length": 440,
      "question_length": 64,
      "prediction": "Age of Enlightenment",
      "reference": "Age of Enlightenment",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05042552947998047,
      "timestamp": "2025-12-24T20:52:00.243583"
    },
    {
      "sample_id": "squad_5728ec04ff5b5019007da956",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 475,
      "natural_ratio": 0.00362396240234375,
      "max_context_tokens": 131072,
      "context_length": 872,
      "question_length": 38,
      "prediction": "Emperor Kammu ruled in the early Heian period, specifically the late 8th and early 9th centuries.",
      "reference": "early Heian period",
      "metrics": {
        "f1": 0.3130434782608696
      },
      "elapsed_time": 0.27340245246887207,
      "timestamp": "2025-12-24T20:52:00.517116"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14413",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 77349,
      "natural_ratio": 0.5901260375976562,
      "max_context_tokens": 131072,
      "context_length": 154641,
      "question_length": 17,
      "prediction": "Beatrice is a character in the play who initially appears to disdain love and marriage, but ultimately falls in love with Benedick.",
      "reference": "Leonato's niece",
      "metrics": {
        "f1": 0.5555555555555556
      },
      "elapsed_time": 2.7258663177490234,
      "timestamp": "2025-12-24T20:52:03.243302"
    },
    {
      "sample_id": "squad_5728027a4b864d190016420a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 346,
      "natural_ratio": 0.0026397705078125,
      "max_context_tokens": 131072,
      "context_length": 597,
      "question_length": 56,
      "prediction": "numbers in the names of the encodings indicate the number of bits per code value (for UTF encodings) or the number of bytes per code value (for UCS encodings)",
      "reference": "the number of bits per code value (for UTF encodings) or the number of bytes per code value",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.3902311325073242,
      "timestamp": "2025-12-24T20:52:03.633717"
    },
    {
      "sample_id": "squad_572857772ca10214002da2a4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 328,
      "natural_ratio": 0.00250244140625,
      "max_context_tokens": 131072,
      "context_length": 587,
      "question_length": 30,
      "prediction": "在德主义的观点中，上帝是完全超越的：上帝存在，但不会干预世界，除了创造世界之外。在这种观点中，上帝不是拟人化的，并且不会回答祈祷或产生奇迹。",
      "reference": "God exists, but does not intervene in the world beyond what was necessary to create it",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.45363378524780273,
      "timestamp": "2025-12-24T20:52:04.087517"
    },
    {
      "sample_id": "squad_56e12dc2e3433e1400422c8c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 786,
      "natural_ratio": 0.0059967041015625,
      "max_context_tokens": 131072,
      "context_length": 1469,
      "question_length": 63,
      "prediction": "1965",
      "reference": "1965",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07033371925354004,
      "timestamp": "2025-12-24T20:52:04.157987"
    },
    {
      "sample_id": "squad_5731dd77e17f3d14004224c6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 777,
      "natural_ratio": 0.00592803955078125,
      "max_context_tokens": 131072,
      "context_length": 1452,
      "question_length": 63,
      "prediction": "bar-tailed godwit",
      "reference": "bar-tailed godwit",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08136844635009766,
      "timestamp": "2025-12-24T20:52:04.239551"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7628",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 77860,
      "natural_ratio": 0.594024658203125,
      "max_context_tokens": 131072,
      "context_length": 155635,
      "question_length": 45,
      "prediction": "Duncan",
      "reference": "Matthew Hale.",
      "metrics": {
        "f1": 0.14285714285714285
      },
      "elapsed_time": 3.361879825592041,
      "timestamp": "2025-12-24T20:52:07.601774"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17643",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 70379,
      "natural_ratio": 0.5369491577148438,
      "max_context_tokens": 131072,
      "context_length": 140636,
      "question_length": 83,
      "prediction": "Mark Renton does not go to prison after he and Spud rob the bookstore because the story does not mention them being caught or going to prison for this specific event.",
      "reference": "He entered drug rehab.",
      "metrics": {
        "f1": 0.7058823529411764
      },
      "elapsed_time": 2.4151601791381836,
      "timestamp": "2025-12-24T20:52:10.017305"
    },
    {
      "sample_id": "squad_5726a39d5951b619008f789d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 775,
      "natural_ratio": 0.00591278076171875,
      "max_context_tokens": 131072,
      "context_length": 1458,
      "question_length": 52,
      "prediction": "1812年",
      "reference": "1812",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.08556556701660156,
      "timestamp": "2025-12-24T20:52:10.103050"
    },
    {
      "sample_id": "squad_57302226947a6a140053d16a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 360,
      "natural_ratio": 0.00274658203125,
      "max_context_tokens": 131072,
      "context_length": 614,
      "question_length": 66,
      "prediction": "1903年",
      "reference": "1903",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07492899894714355,
      "timestamp": "2025-12-24T20:52:10.178116"
    },
    {
      "sample_id": "squad_572fb814a23a5019007fc8dd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 420,
      "natural_ratio": 0.003204345703125,
      "max_context_tokens": 131072,
      "context_length": 749,
      "question_length": 51,
      "prediction": "数据库管理员可以恢复被损坏的数据库。",
      "reference": "database administrator",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1176760196685791,
      "timestamp": "2025-12-24T20:52:10.295989"
    },
    {
      "sample_id": "squad_57336c1cd058e614000b5ac1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 428,
      "natural_ratio": 0.003265380859375,
      "max_context_tokens": 131072,
      "context_length": 776,
      "question_length": 41,
      "prediction": "French",
      "reference": "French",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04608488082885742,
      "timestamp": "2025-12-24T20:52:10.342228"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11954",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 63253,
      "natural_ratio": 0.48258209228515625,
      "max_context_tokens": 131072,
      "context_length": 126395,
      "question_length": 71,
      "prediction": "Talthybius told Hecuba that Cassandra was chosen as a prize for Agamemnon, to be his bride.",
      "reference": "Cassandra will become a concubine of Agamemnon",
      "metrics": {
        "f1": 0.8292682926829268
      },
      "elapsed_time": 2.2129247188568115,
      "timestamp": "2025-12-24T20:52:12.555522"
    },
    {
      "sample_id": "squad_56e7af3b37bdd419002c433a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 269,
      "natural_ratio": 0.00205230712890625,
      "max_context_tokens": 131072,
      "context_length": 441,
      "question_length": 57,
      "prediction": "独立日假期周末",
      "reference": "Independence Day",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06716442108154297,
      "timestamp": "2025-12-24T20:52:12.622866"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13470",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96974,
      "natural_ratio": 0.7398529052734375,
      "max_context_tokens": 131072,
      "context_length": 193877,
      "question_length": 31,
      "prediction": "Christiania",
      "reference": "Kristiania",
      "metrics": {
        "f1": 0.7999999999999999
      },
      "elapsed_time": 3.655646562576294,
      "timestamp": "2025-12-24T20:52:16.278867"
    },
    {
      "sample_id": "squad_57325d96b9d445190005eab7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 306,
      "natural_ratio": 0.0023345947265625,
      "max_context_tokens": 131072,
      "context_length": 508,
      "question_length": 65,
      "prediction": "8.2%",
      "reference": "8.2%",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07059431076049805,
      "timestamp": "2025-12-24T20:52:16.349629"
    },
    {
      "sample_id": "squad_5727740ff1498d1400e8f88d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 738,
      "natural_ratio": 0.0056304931640625,
      "max_context_tokens": 131072,
      "context_length": 1367,
      "question_length": 70,
      "prediction": "John Ray",
      "reference": "John Ray",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.049657583236694336,
      "timestamp": "2025-12-24T20:52:16.399420"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28168",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 53138,
      "natural_ratio": 0.4054107666015625,
      "max_context_tokens": 131072,
      "context_length": 106158,
      "question_length": 78,
      "prediction": "七年后",
      "reference": "Seven years.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 1.6715772151947021,
      "timestamp": "2025-12-24T20:52:18.071333"
    },
    {
      "sample_id": "squad_5726f9ba708984140094d772",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 316,
      "natural_ratio": 0.002410888671875,
      "max_context_tokens": 131072,
      "context_length": 525,
      "question_length": 67,
      "prediction": "在其他水源稀缺的季节性温带、高山和极地气候条件下，人类需要水时会从冰川获取水。",
      "reference": "when other sources may be scant",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.3059556484222412,
      "timestamp": "2025-12-24T20:52:18.377486"
    },
    {
      "sample_id": "squad_5727b28f2ca10214002d9432",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 551,
      "natural_ratio": 0.00420379638671875,
      "max_context_tokens": 131072,
      "context_length": 993,
      "question_length": 69,
      "prediction": "120",
      "reference": "120,000 characters",
      "metrics": {
        "f1": 0.4
      },
      "elapsed_time": 0.056427717208862305,
      "timestamp": "2025-12-24T20:52:18.434051"
    },
    {
      "sample_id": "squad_570b85a5ec8fbc190045ba5d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 397,
      "natural_ratio": 0.00302886962890625,
      "max_context_tokens": 131072,
      "context_length": 695,
      "question_length": 60,
      "prediction": "the spread of grasslands and the development of grazers have been strongly linked。",
      "reference": "development of grazers",
      "metrics": {
        "f1": 0.42857142857142855
      },
      "elapsed_time": 0.18365716934204102,
      "timestamp": "2025-12-24T20:52:18.617846"
    },
    {
      "sample_id": "squad_56cbd2f96d243a140015ed72",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 523,
      "natural_ratio": 0.00399017333984375,
      "max_context_tokens": 131072,
      "context_length": 952,
      "question_length": 55,
      "prediction": "1835年",
      "reference": "1835",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07533645629882812,
      "timestamp": "2025-12-24T20:52:18.693313"
    },
    {
      "sample_id": "squad_57314737497a881900248d3f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 511,
      "natural_ratio": 0.00389862060546875,
      "max_context_tokens": 131072,
      "context_length": 919,
      "question_length": 63,
      "prediction": "早期型号的F-16战斗机转换为QF-16配置。",
      "reference": "early model F-16 aircraft converted to QF-16 configuration",
      "metrics": {
        "f1": 0.25
      },
      "elapsed_time": 0.19479775428771973,
      "timestamp": "2025-12-24T20:52:18.888273"
    },
    {
      "sample_id": "squad_56d37f7059d6e41400146559",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 279,
      "natural_ratio": 0.00212860107421875,
      "max_context_tokens": 131072,
      "context_length": 467,
      "question_length": 51,
      "prediction": "2013年",
      "reference": "2013",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07298064231872559,
      "timestamp": "2025-12-24T20:52:18.961385"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5129",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 75154,
      "natural_ratio": 0.5733795166015625,
      "max_context_tokens": 131072,
      "context_length": 150201,
      "question_length": 67,
      "prediction": "Kobo",
      "reference": "Sugai",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.0788028240203857,
      "timestamp": "2025-12-24T20:52:22.040544"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2683",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96465,
      "natural_ratio": 0.7359695434570312,
      "max_context_tokens": 131072,
      "context_length": 192850,
      "question_length": 40,
      "prediction": "自尊心",
      "reference": "He has heart and Martel doesn't.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.19597601890563965,
      "timestamp": "2025-12-24T20:52:22.236866"
    },
    {
      "sample_id": "squad_57321d1eb9d445190005e81c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 328,
      "natural_ratio": 0.00250244140625,
      "max_context_tokens": 131072,
      "context_length": 562,
      "question_length": 54,
      "prediction": "3.16 million TEUs",
      "reference": "3.16 million",
      "metrics": {
        "f1": 0.8275862068965517
      },
      "elapsed_time": 0.10459470748901367,
      "timestamp": "2025-12-24T20:52:22.341705"
    },
    {
      "sample_id": "squad_5726768af1498d1400e8e082",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 293,
      "natural_ratio": 0.00223541259765625,
      "max_context_tokens": 131072,
      "context_length": 512,
      "question_length": 35,
      "prediction": "1855年4月",
      "reference": "April 1855",
      "metrics": {
        "f1": 0.4
      },
      "elapsed_time": 0.09593367576599121,
      "timestamp": "2025-12-24T20:52:22.437809"
    },
    {
      "sample_id": "squad_5705edea52bb8914006896d6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 208,
      "natural_ratio": 0.0015869140625,
      "max_context_tokens": 131072,
      "context_length": 300,
      "question_length": 77,
      "prediction": "World Health Organization",
      "reference": "The World Health Organization",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.050499916076660156,
      "timestamp": "2025-12-24T20:52:22.488454"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25562",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113661,
      "natural_ratio": 0.8671646118164062,
      "max_context_tokens": 131072,
      "context_length": 227197,
      "question_length": 86,
      "prediction": "减少恐惧和怜悯的能力，增强肌肉力量，提高感官能力，任何可以使更好的士兵的东西。",
      "reference": "Super-soldiers",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.174748659133911,
      "timestamp": "2025-12-24T20:52:26.665563"
    },
    {
      "sample_id": "squad_572816beff5b5019007d9ce8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 1093,
      "natural_ratio": 0.00833892822265625,
      "max_context_tokens": 131072,
      "context_length": 2064,
      "question_length": 83,
      "prediction": "Joseph P. Kerwin",
      "reference": "Joseph P. Kerwin",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09242391586303711,
      "timestamp": "2025-12-24T20:52:26.758544"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26009",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 71251,
      "natural_ratio": 0.5436019897460938,
      "max_context_tokens": 131072,
      "context_length": 142429,
      "question_length": 33,
      "prediction": "通过使用毒药和她的长袍与花环，将它们置于公主手中。",
      "reference": "With a knife.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.3222455978393555,
      "timestamp": "2025-12-24T20:52:29.081227"
    },
    {
      "sample_id": "squad_57336bbb4776f41900660a8d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 401,
      "natural_ratio": 0.00305938720703125,
      "max_context_tokens": 131072,
      "context_length": 691,
      "question_length": 72,
      "prediction": "$500 million",
      "reference": "$500 million",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0767204761505127,
      "timestamp": "2025-12-24T20:52:29.158118"
    },
    {
      "sample_id": "squad_56e6fd9cde9d371400068113",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 185,
      "natural_ratio": 0.00141143798828125,
      "max_context_tokens": 131072,
      "context_length": 290,
      "question_length": 41,
      "prediction": "New Zealand",
      "reference": "New Zealand",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03992486000061035,
      "timestamp": "2025-12-24T20:52:29.198181"
    },
    {
      "sample_id": "squad_57337520d058e614000b5b4c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 602,
      "natural_ratio": 0.0045928955078125,
      "max_context_tokens": 131072,
      "context_length": 1074,
      "question_length": 91,
      "prediction": "是serious违约风险。",
      "reference": "serious",
      "metrics": {
        "f1": 0.7000000000000001
      },
      "elapsed_time": 0.07511782646179199,
      "timestamp": "2025-12-24T20:52:29.273441"
    },
    {
      "sample_id": "squad_57340136d058e614000b6785",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 444,
      "natural_ratio": 0.003387451171875,
      "max_context_tokens": 131072,
      "context_length": 795,
      "question_length": 54,
      "prediction": "intent",
      "reference": "intent",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0341336727142334,
      "timestamp": "2025-12-24T20:52:29.307734"
    },
    {
      "sample_id": "squad_5727ce204b864d1900163d8a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 602,
      "natural_ratio": 0.0045928955078125,
      "max_context_tokens": 131072,
      "context_length": 1124,
      "question_length": 41,
      "prediction": "为了世俗化和专业化科学。",
      "reference": "as part of his efforts to secularise and professionalise science",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.0947105884552002,
      "timestamp": "2025-12-24T20:52:29.402580"
    },
    {
      "sample_id": "squad_57318f8ae6313a140071d09e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 296,
      "natural_ratio": 0.00225830078125,
      "max_context_tokens": 131072,
      "context_length": 508,
      "question_length": 45,
      "prediction": "Gaddafi的父親",
      "reference": "His father",
      "metrics": {
        "f1": 0.35294117647058826
      },
      "elapsed_time": 0.06274700164794922,
      "timestamp": "2025-12-24T20:52:29.465465"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15976",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 91697,
      "natural_ratio": 0.6995925903320312,
      "max_context_tokens": 131072,
      "context_length": 183312,
      "question_length": 42,
      "prediction": "通过梦境得知的。在他的梦中，他看到了即将到来的灾难和疯狂的人们。",
      "reference": "In his dreams",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.8105521202087402,
      "timestamp": "2025-12-24T20:52:32.276388"
    },
    {
      "sample_id": "squad_5705f3a375f01819005e77c5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 124,
      "natural_ratio": 0.000946044921875,
      "max_context_tokens": 131072,
      "context_length": 175,
      "question_length": 34,
      "prediction": " several years",
      "reference": "several years",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04430866241455078,
      "timestamp": "2025-12-24T20:52:32.320865"
    },
    {
      "sample_id": "squad_5726c8615951b619008f7dfb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 345,
      "natural_ratio": 0.00263214111328125,
      "max_context_tokens": 131072,
      "context_length": 584,
      "question_length": 66,
      "prediction": "Charlemagne",
      "reference": "Charlemagne",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05460381507873535,
      "timestamp": "2025-12-24T20:52:32.375611"
    },
    {
      "sample_id": "squad_56cdd4d762d2951400fa68cc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 425,
      "natural_ratio": 0.00324249267578125,
      "max_context_tokens": 131072,
      "context_length": 771,
      "question_length": 39,
      "prediction": "The Longmen Shan Fault System is situated in the eastern border of the Tibetan Plateau.",
      "reference": "the eastern border of the Tibetan Plateau",
      "metrics": {
        "f1": 0.640625
      },
      "elapsed_time": 0.20470356941223145,
      "timestamp": "2025-12-24T20:52:32.580453"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18882",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 13898,
      "natural_ratio": 0.1060333251953125,
      "max_context_tokens": 131072,
      "context_length": 27719,
      "question_length": 38,
      "prediction": "Timmy Tiptoes is a little fat comfortable grey squirrel.",
      "reference": "A squirrel",
      "metrics": {
        "f1": 0.6428571428571429
      },
      "elapsed_time": 0.41289710998535156,
      "timestamp": "2025-12-24T20:52:32.993655"
    },
    {
      "sample_id": "squad_5725c34889a1e219009abe0e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 321,
      "natural_ratio": 0.00244903564453125,
      "max_context_tokens": 131072,
      "context_length": 561,
      "question_length": 41,
      "prediction": "以色列",
      "reference": "Israel",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.03500676155090332,
      "timestamp": "2025-12-24T20:52:33.028870"
    },
    {
      "sample_id": "squad_572ef06703f9891900756af9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 489,
      "natural_ratio": 0.00373077392578125,
      "max_context_tokens": 131072,
      "context_length": 866,
      "question_length": 72,
      "prediction": "Monogamous marriages",
      "reference": "Arranged marriages",
      "metrics": {
        "f1": 0.8571428571428572
      },
      "elapsed_time": 0.06492948532104492,
      "timestamp": "2025-12-24T20:52:33.093944"
    },
    {
      "sample_id": "squad_573243d40fdd8d15006c68aa",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 370,
      "natural_ratio": 0.0028228759765625,
      "max_context_tokens": 131072,
      "context_length": 654,
      "question_length": 47,
      "prediction": "Bagratid Dynasty",
      "reference": "Bagratid Dynasty",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06365633010864258,
      "timestamp": "2025-12-24T20:52:33.157738"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13185",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 59546,
      "natural_ratio": 0.4542999267578125,
      "max_context_tokens": 131072,
      "context_length": 119028,
      "question_length": 24,
      "prediction": "Margit regrets being with Bengt Gauteson instead of Gudmund, as she feels trapped and unfulfilled in her current life.",
      "reference": "She regrets marrying  Bengt Gauteson",
      "metrics": {
        "f1": 0.7567567567567568
      },
      "elapsed_time": 2.0954174995422363,
      "timestamp": "2025-12-24T20:52:35.253484"
    },
    {
      "sample_id": "squad_572941266aef051400154c0a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 351,
      "natural_ratio": 0.00267791748046875,
      "max_context_tokens": 131072,
      "context_length": 563,
      "question_length": 99,
      "prediction": "国际能源署（International Energy Agency）",
      "reference": "International Energy Agency",
      "metrics": {
        "f1": 0.8852459016393442
      },
      "elapsed_time": 0.11172723770141602,
      "timestamp": "2025-12-24T20:52:35.365458"
    },
    {
      "sample_id": "squad_5731eb9bb9d445190005e69a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 384,
      "natural_ratio": 0.0029296875,
      "max_context_tokens": 131072,
      "context_length": 697,
      "question_length": 31,
      "prediction": "Raphael Lemkin",
      "reference": "Raphael Lemkin",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06578397750854492,
      "timestamp": "2025-12-24T20:52:35.431407"
    },
    {
      "sample_id": "squad_56de244f4396321400ee25f0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 392,
      "natural_ratio": 0.00299072265625,
      "max_context_tokens": 131072,
      "context_length": 683,
      "question_length": 62,
      "prediction": "三",
      "reference": "three",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04155778884887695,
      "timestamp": "2025-12-24T20:52:35.473132"
    },
    {
      "sample_id": "squad_572f9463947a6a140053ca88",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 422,
      "natural_ratio": 0.0032196044921875,
      "max_context_tokens": 131072,
      "context_length": 742,
      "question_length": 62,
      "prediction": "19%",
      "reference": "19%",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05477142333984375,
      "timestamp": "2025-12-24T20:52:35.528039"
    },
    {
      "sample_id": "squad_56bec3303aeaaa14008c9393",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 376,
      "natural_ratio": 0.00286865234375,
      "max_context_tokens": 131072,
      "context_length": 632,
      "question_length": 80,
      "prediction": "L.A. Reid",
      "reference": "L.A. Reid",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06397509574890137,
      "timestamp": "2025-12-24T20:52:35.592150"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24903",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 114975,
      "natural_ratio": 0.8771896362304688,
      "max_context_tokens": 131072,
      "context_length": 229858,
      "question_length": 53,
      "prediction": "Maya becomes a soldier-bee and helps defend the hive.",
      "reference": "teaching",
      "metrics": {
        "f1": 0.48275862068965514
      },
      "elapsed_time": 4.156004428863525,
      "timestamp": "2025-12-24T20:52:39.748520"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13488",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 80726,
      "natural_ratio": 0.6158905029296875,
      "max_context_tokens": 131072,
      "context_length": 161328,
      "question_length": 84,
      "prediction": "Julia regrets that she will make Paramore unhappy.",
      "reference": "Not having been brave enough to kill Charteris.",
      "metrics": {
        "f1": 0.7894736842105262
      },
      "elapsed_time": 2.806964874267578,
      "timestamp": "2025-12-24T20:52:42.556217"
    },
    {
      "sample_id": "squad_5727fb69ff5b5019007d99ec",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 377,
      "natural_ratio": 0.00287628173828125,
      "max_context_tokens": 131072,
      "context_length": 676,
      "question_length": 39,
      "prediction": "Prince Richard, Duke of Gloucester",
      "reference": "Prince Richard",
      "metrics": {
        "f1": 0.5833333333333334
      },
      "elapsed_time": 0.10577106475830078,
      "timestamp": "2025-12-24T20:52:42.662200"
    },
    {
      "sample_id": "squad_57346018879d6814001ca583",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 601,
      "natural_ratio": 0.00458526611328125,
      "max_context_tokens": 131072,
      "context_length": 1104,
      "question_length": 59,
      "prediction": "管理可能超过其栖息地承载能力的种群，从而威胁到其他物种的福祉，或在某些情况下损害人类健康或安全。",
      "reference": "managing populations",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.33670473098754883,
      "timestamp": "2025-12-24T20:52:42.999095"
    },
    {
      "sample_id": "squad_570bdec2ec8fbc190045bbae",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 423,
      "natural_ratio": 0.00322723388671875,
      "max_context_tokens": 131072,
      "context_length": 746,
      "question_length": 60,
      "prediction": "Contemporary biodiversity physics",
      "reference": "Contemporary biodiversity physics",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06481671333312988,
      "timestamp": "2025-12-24T20:52:43.064107"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26429",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 97592,
      "natural_ratio": 0.74456787109375,
      "max_context_tokens": 131072,
      "context_length": 195071,
      "question_length": 74,
      "prediction": "Luther问是否Stickell准备好了接受新的任务。",
      "reference": "A flight attendant.",
      "metrics": {
        "f1": 0.2777777777777778
      },
      "elapsed_time": 3.3089709281921387,
      "timestamp": "2025-12-24T20:52:46.373471"
    },
    {
      "sample_id": "squad_5732a8641d5d2e14009ff888",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 437,
      "natural_ratio": 0.00333404541015625,
      "max_context_tokens": 131072,
      "context_length": 705,
      "question_length": 130,
      "prediction": "Governments",
      "reference": "Governments",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04977130889892578,
      "timestamp": "2025-12-24T20:52:46.423453"
    },
    {
      "sample_id": "squad_572ee52903f9891900756ac9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 339,
      "natural_ratio": 0.00258636474609375,
      "max_context_tokens": 131072,
      "context_length": 570,
      "question_length": 69,
      "prediction": "饱和（Saturation）",
      "reference": "saturation",
      "metrics": {
        "f1": 0.8333333333333333
      },
      "elapsed_time": 0.07427358627319336,
      "timestamp": "2025-12-24T20:52:46.497906"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31130",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 70901,
      "natural_ratio": 0.5409317016601562,
      "max_context_tokens": 131072,
      "context_length": 141730,
      "question_length": 32,
      "prediction": "Sera is a prostitute.",
      "reference": "She is a prostitute",
      "metrics": {
        "f1": 0.9090909090909091
      },
      "elapsed_time": 3.3987112045288086,
      "timestamp": "2025-12-24T20:52:49.896991"
    },
    {
      "sample_id": "squad_57361c88012e2f140011a1a9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 477,
      "natural_ratio": 0.00363922119140625,
      "max_context_tokens": 131072,
      "context_length": 875,
      "question_length": 40,
      "prediction": "超过700百万美元",
      "reference": "$700 million",
      "metrics": {
        "f1": 0.23529411764705882
      },
      "elapsed_time": 0.09045529365539551,
      "timestamp": "2025-12-24T20:52:49.987625"
    }
  ]
}