{
  "metadata": {
    "experiment_type": "natural_length",
    "model": "qwen2.5-7b",
    "dataset": "mixed",
    "task_type": "reading_comprehension",
    "num_samples": 1000,
    "timestamp": "2026-01-05T14:15:41.221910"
  },
  "results": [
    {
      "sample_id": "narrativeqa_narrativeqa_18672",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 109051,
      "natural_ratio": 0.8319931030273438,
      "max_context_tokens": 131072,
      "context_length": 217986,
      "question_length": 77,
      "prediction": "Nicky \"Fingers\" Bonnano",
      "reference": "Nickey Fingers",
      "metrics": {
        "f1": 0.846153846153846
      },
      "elapsed_time": 3.2082765102386475,
      "timestamp": "2026-01-05T13:59:18.855117"
    },
    {
      "sample_id": "squad_572bc0c3111d821400f38f70",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 669,
      "natural_ratio": 0.00510406494140625,
      "max_context_tokens": 131072,
      "context_length": 1235,
      "question_length": 64,
      "prediction": "part of the process",
      "reference": "philosophy of the process of education or the philosophy of the discipline of education",
      "metrics": {
        "f1": 0.7857142857142858
      },
      "elapsed_time": 0.07616853713989258,
      "timestamp": "2026-01-05T13:59:18.931840"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24993",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 81153,
      "natural_ratio": 0.6191482543945312,
      "max_context_tokens": 131072,
      "context_length": 162192,
      "question_length": 75,
      "prediction": "Three ships, a thousand men",
      "reference": "17 ships and 1500 men",
      "metrics": {
        "f1": 0.689655172413793
      },
      "elapsed_time": 3.4722275733947754,
      "timestamp": "2026-01-05T13:59:22.404420"
    },
    {
      "sample_id": "squad_57270b64708984140094d905",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 505,
      "natural_ratio": 0.00385284423828125,
      "max_context_tokens": 131072,
      "context_length": 881,
      "question_length": 89,
      "prediction": "nutrition literacy",
      "reference": "nutrition literacy",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.051860809326171875,
      "timestamp": "2026-01-05T13:59:22.456452"
    },
    {
      "sample_id": "squad_572b3656be1ee31400cb82a5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 394,
      "natural_ratio": 0.0030059814453125,
      "max_context_tokens": 131072,
      "context_length": 704,
      "question_length": 44,
      "prediction": "Hume said that people's ideas come from their impressions, which correspond roughly with sensations.",
      "reference": "their \"impressions\"",
      "metrics": {
        "f1": 0.6285714285714286
      },
      "elapsed_time": 0.21019697189331055,
      "timestamp": "2026-01-05T13:59:22.666809"
    },
    {
      "sample_id": "squad_5728d223ff5b5019007da750",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 346,
      "natural_ratio": 0.0026397705078125,
      "max_context_tokens": 131072,
      "context_length": 587,
      "question_length": 65,
      "prediction": "12",
      "reference": "12",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04542994499206543,
      "timestamp": "2026-01-05T13:59:22.712371"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8685",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 99028,
      "natural_ratio": 0.755523681640625,
      "max_context_tokens": 131072,
      "context_length": 197983,
      "question_length": 34,
      "prediction": "In the woods.",
      "reference": "Reims",
      "metrics": {
        "f1": 0.37499999999999994
      },
      "elapsed_time": 3.458158016204834,
      "timestamp": "2026-01-05T13:59:26.170890"
    },
    {
      "sample_id": "squad_56d12cc017492d1400aabb58",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 241,
      "natural_ratio": 0.00183868408203125,
      "max_context_tokens": 131072,
      "context_length": 388,
      "question_length": 55,
      "prediction": "2007",
      "reference": "2007",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07135295867919922,
      "timestamp": "2026-01-05T13:59:26.242414"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_27810",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96960,
      "natural_ratio": 0.73974609375,
      "max_context_tokens": 131072,
      "context_length": 193837,
      "question_length": 43,
      "prediction": "a case of morphine syringes",
      "reference": "A scalpel.",
      "metrics": {
        "f1": 0.5217391304347827
      },
      "elapsed_time": 4.735579490661621,
      "timestamp": "2026-01-05T13:59:30.978400"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20952",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 109513,
      "natural_ratio": 0.8355178833007812,
      "max_context_tokens": 131072,
      "context_length": 218948,
      "question_length": 38,
      "prediction": "Nina Sayers is a ballet dancer.",
      "reference": "She is a professional ballet dancer.",
      "metrics": {
        "f1": 0.8387096774193549
      },
      "elapsed_time": 2.7735278606414795,
      "timestamp": "2026-01-05T13:59:33.752330"
    },
    {
      "sample_id": "squad_572ea4b0c246551400ce445a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 385,
      "natural_ratio": 0.00293731689453125,
      "max_context_tokens": 131072,
      "context_length": 671,
      "question_length": 59,
      "prediction": "stratosphere",
      "reference": "the stratosphere near the pole.",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.0629429817199707,
      "timestamp": "2026-01-05T13:59:33.816249"
    },
    {
      "sample_id": "squad_573041b7947a6a140053d356",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 331,
      "natural_ratio": 0.00252532958984375,
      "max_context_tokens": 131072,
      "context_length": 558,
      "question_length": 65,
      "prediction": "goalkeeper",
      "reference": "goalkeeper",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0464930534362793,
      "timestamp": "2026-01-05T13:59:33.862889"
    },
    {
      "sample_id": "squad_5726da61f1498d1400e8ed3b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 340,
      "natural_ratio": 0.002593994140625,
      "max_context_tokens": 131072,
      "context_length": 567,
      "question_length": 73,
      "prediction": "Tha Feelstyle",
      "reference": "Tha Feelstyle",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0658421516418457,
      "timestamp": "2026-01-05T13:59:33.928871"
    },
    {
      "sample_id": "squad_57108ff2a58dae1900cd6a2f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 538,
      "natural_ratio": 0.0041046142578125,
      "max_context_tokens": 131072,
      "context_length": 944,
      "question_length": 93,
      "prediction": "French and Latin",
      "reference": "French and Latin",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05676388740539551,
      "timestamp": "2026-01-05T13:59:33.985765"
    },
    {
      "sample_id": "squad_56df20e5c65bf219000b3f7b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 233,
      "natural_ratio": 0.00177764892578125,
      "max_context_tokens": 131072,
      "context_length": 395,
      "question_length": 31,
      "prediction": "Nazareth",
      "reference": "Nazareth",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.044890642166137695,
      "timestamp": "2026-01-05T13:59:34.030782"
    },
    {
      "sample_id": "squad_56f8cee09e9bad19000a0554",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 402,
      "natural_ratio": 0.0030670166015625,
      "max_context_tokens": 131072,
      "context_length": 701,
      "question_length": 64,
      "prediction": "prefrontal cortex",
      "reference": "prefrontal cortex",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0658259391784668,
      "timestamp": "2026-01-05T13:59:34.096734"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32717",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 77887,
      "natural_ratio": 0.5942306518554688,
      "max_context_tokens": 131072,
      "context_length": 155695,
      "question_length": 39,
      "prediction": "senior",
      "reference": "senior",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.2499332427978516,
      "timestamp": "2026-01-05T13:59:37.347014"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26567",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122894,
      "natural_ratio": 0.9376068115234375,
      "max_context_tokens": 131072,
      "context_length": 245670,
      "question_length": 78,
      "prediction": "A giant mutated squid-like monster",
      "reference": "Volcanoes",
      "metrics": {
        "f1": 0.48
      },
      "elapsed_time": 3.522120714187622,
      "timestamp": "2026-01-05T13:59:40.869558"
    },
    {
      "sample_id": "squad_5731cbc10fdd8d15006c653b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 543,
      "natural_ratio": 0.00414276123046875,
      "max_context_tokens": 131072,
      "context_length": 1010,
      "question_length": 36,
      "prediction": "The earliest priesthoods were the flamines, dedicated to Jupiter, Mars and Quirinus.",
      "reference": "flamines",
      "metrics": {
        "f1": 0.17391304347826084
      },
      "elapsed_time": 0.22699332237243652,
      "timestamp": "2026-01-05T13:59:41.097139"
    },
    {
      "sample_id": "squad_56d5fb031c85041400946e8e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 396,
      "natural_ratio": 0.003021240234375,
      "max_context_tokens": 131072,
      "context_length": 712,
      "question_length": 41,
      "prediction": "obligate carnivores",
      "reference": "obligate",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.07878589630126953,
      "timestamp": "2026-01-05T13:59:41.176139"
    },
    {
      "sample_id": "squad_572f7d20947a6a140053c9b8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 625,
      "natural_ratio": 0.00476837158203125,
      "max_context_tokens": 131072,
      "context_length": 1171,
      "question_length": 39,
      "prediction": "1991",
      "reference": "1991",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07189774513244629,
      "timestamp": "2026-01-05T13:59:41.248239"
    },
    {
      "sample_id": "squad_5727a5812ca10214002d92c0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 353,
      "natural_ratio": 0.00269317626953125,
      "max_context_tokens": 131072,
      "context_length": 614,
      "question_length": 52,
      "prediction": "Money and gifts-in-kind, membership dues",
      "reference": "Money and gifts",
      "metrics": {
        "f1": 0.5454545454545454
      },
      "elapsed_time": 0.10885310173034668,
      "timestamp": "2026-01-05T13:59:41.357298"
    },
    {
      "sample_id": "squad_5729117aaf94a219006aa01e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 577,
      "natural_ratio": 0.00440216064453125,
      "max_context_tokens": 131072,
      "context_length": 1090,
      "question_length": 24,
      "prediction": "William C. Boyd was an anthropologist who defined race in terms of genetic differences between populations.",
      "reference": "mid-20th-century anthropologist",
      "metrics": {
        "f1": 0.8372093023255814
      },
      "elapsed_time": 0.2253432273864746,
      "timestamp": "2026-01-05T13:59:41.582865"
    },
    {
      "sample_id": "squad_5730ed42497a881900248a69",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 302,
      "natural_ratio": 0.0023040771484375,
      "max_context_tokens": 131072,
      "context_length": 540,
      "question_length": 25,
      "prediction": "ostrich",
      "reference": "ostrich",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.056798458099365234,
      "timestamp": "2026-01-05T13:59:41.639848"
    },
    {
      "sample_id": "squad_57277fdcf1498d1400e8f9fa",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 326,
      "natural_ratio": 0.0024871826171875,
      "max_context_tokens": 131072,
      "context_length": 582,
      "question_length": 30,
      "prediction": "Roche moutonnée can range in length from less than a meter to several hundred meters long.",
      "reference": "less than a meter to several hundred meters long",
      "metrics": {
        "f1": 0.6956521739130436
      },
      "elapsed_time": 0.23882055282592773,
      "timestamp": "2026-01-05T13:59:41.878846"
    },
    {
      "sample_id": "squad_572a65ae7a1753140016af0d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 314,
      "natural_ratio": 0.0023956298828125,
      "max_context_tokens": 131072,
      "context_length": 545,
      "question_length": 43,
      "prediction": "there might be a delay in recovery",
      "reference": "there might be a delay in recovery.",
      "metrics": {
        "f1": 0.9714285714285714
      },
      "elapsed_time": 0.09635305404663086,
      "timestamp": "2026-01-05T13:59:41.975361"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23918",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 103137,
      "natural_ratio": 0.7868728637695312,
      "max_context_tokens": 131072,
      "context_length": 206196,
      "question_length": 38,
      "prediction": "Dorothy Angkatell",
      "reference": "Alfred Parsons",
      "metrics": {
        "f1": 0.6666666666666667
      },
      "elapsed_time": 3.3259193897247314,
      "timestamp": "2026-01-05T13:59:45.301676"
    },
    {
      "sample_id": "squad_57336f6e4776f41900660ace",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 620,
      "natural_ratio": 0.004730224609375,
      "max_context_tokens": 131072,
      "context_length": 1084,
      "question_length": 116,
      "prediction": "Bear Stearns",
      "reference": "Bear Stearns",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07634711265563965,
      "timestamp": "2026-01-05T13:59:45.378205"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_638",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 78933,
      "natural_ratio": 0.6022109985351562,
      "max_context_tokens": 131072,
      "context_length": 157786,
      "question_length": 40,
      "prediction": "Miss Ogle knows that Letitia Hardy is promised to Mr. Hardy's cousin, but Mr. Doricourt believes it to be Miss Kitty Willis.",
      "reference": "Doricourt",
      "metrics": {
        "f1": 0.13533834586466167
      },
      "elapsed_time": 2.997145175933838,
      "timestamp": "2026-01-05T13:59:48.375676"
    },
    {
      "sample_id": "squad_5726efa6dd62a815002e95bf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 459,
      "natural_ratio": 0.00350189208984375,
      "max_context_tokens": 131072,
      "context_length": 820,
      "question_length": 59,
      "prediction": "No",
      "reference": "do not contain programs of study or academic departments",
      "metrics": {
        "f1": 0.21052631578947367
      },
      "elapsed_time": 0.04015994071960449,
      "timestamp": "2026-01-05T13:59:48.416023"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9482",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 94745,
      "natural_ratio": 0.7228469848632812,
      "max_context_tokens": 131072,
      "context_length": 189380,
      "question_length": 71,
      "prediction": "Felix Bush wants to plan a funeral for himself with the help of the town's minister, but it turns out to be a funeral party in surprise to him.",
      "reference": "A funeral party",
      "metrics": {
        "f1": 0.2222222222222222
      },
      "elapsed_time": 3.1788222789764404,
      "timestamp": "2026-01-05T13:59:51.595254"
    },
    {
      "sample_id": "squad_56f8f1b49e9bad19000a074d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 431,
      "natural_ratio": 0.00328826904296875,
      "max_context_tokens": 131072,
      "context_length": 782,
      "question_length": 40,
      "prediction": "William Miller effectively signed the death warrant of the Age of Empires by advocating for changes in British policy towards the empires in the Near East, particularly the Ottoman Empire.",
      "reference": "signed the death warrant, so to speak, of the Age of Empires",
      "metrics": {
        "f1": 0.8095238095238096
      },
      "elapsed_time": 0.3798694610595703,
      "timestamp": "2026-01-05T13:59:51.975319"
    },
    {
      "sample_id": "squad_57275ec9dd62a815002e9bba",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 433,
      "natural_ratio": 0.00330352783203125,
      "max_context_tokens": 131072,
      "context_length": 764,
      "question_length": 63,
      "prediction": "80",
      "reference": "80",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.049459218978881836,
      "timestamp": "2026-01-05T13:59:52.024912"
    },
    {
      "sample_id": "squad_572409ef0ba9f01400d97b42",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 366,
      "natural_ratio": 0.0027923583984375,
      "max_context_tokens": 131072,
      "context_length": 627,
      "question_length": 65,
      "prediction": "Isle of Wight",
      "reference": "on the Isle of Wight",
      "metrics": {
        "f1": 0.9565217391304348
      },
      "elapsed_time": 0.07572054862976074,
      "timestamp": "2026-01-05T13:59:52.100773"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18667",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 109031,
      "natural_ratio": 0.8318405151367188,
      "max_context_tokens": 131072,
      "context_length": 217986,
      "question_length": 37,
      "prediction": "Shelly",
      "reference": "Shelly Kaplow",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.15283632278442383,
      "timestamp": "2026-01-05T13:59:52.253890"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23194",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 57538,
      "natural_ratio": 0.4389801025390625,
      "max_context_tokens": 131072,
      "context_length": 115003,
      "question_length": 33,
      "prediction": "The guests are at the house to learn the identity of the informant at their respective organizations who has been supplying their enemies with incriminating information.",
      "reference": "To enact revenge upon Mr Boddy after revealing him as the blackmailer.",
      "metrics": {
        "f1": 0.9361702127659574
      },
      "elapsed_time": 2.125310182571411,
      "timestamp": "2026-01-05T13:59:54.379869"
    },
    {
      "sample_id": "squad_57278aac708984140094e05b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 433,
      "natural_ratio": 0.00330352783203125,
      "max_context_tokens": 131072,
      "context_length": 780,
      "question_length": 47,
      "prediction": "trial courts of general jurisdiction",
      "reference": "trial courts of general jurisdiction",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08172106742858887,
      "timestamp": "2026-01-05T13:59:54.461789"
    },
    {
      "sample_id": "squad_57282ec23acd2414000df67b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 567,
      "natural_ratio": 0.00432586669921875,
      "max_context_tokens": 131072,
      "context_length": 1055,
      "question_length": 40,
      "prediction": "300 miles",
      "reference": "300",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.07076072692871094,
      "timestamp": "2026-01-05T13:59:54.532702"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25059",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 83271,
      "natural_ratio": 0.6353073120117188,
      "max_context_tokens": 131072,
      "context_length": 166442,
      "question_length": 60,
      "prediction": "warn people about the creatures",
      "reference": "Check for gremlins if things are going wrong",
      "metrics": {
        "f1": 0.7500000000000001
      },
      "elapsed_time": 2.8330090045928955,
      "timestamp": "2026-01-05T13:59:57.366048"
    },
    {
      "sample_id": "squad_572820842ca10214002d9e7e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 366,
      "natural_ratio": 0.0027923583984375,
      "max_context_tokens": 131072,
      "context_length": 625,
      "question_length": 67,
      "prediction": "no ex-ante time or size limit",
      "reference": "no ex-ante time or size limit",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.11020374298095703,
      "timestamp": "2026-01-05T13:59:57.476414"
    },
    {
      "sample_id": "squad_572948b7af94a219006aa25d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 410,
      "natural_ratio": 0.0031280517578125,
      "max_context_tokens": 131072,
      "context_length": 733,
      "question_length": 47,
      "prediction": "Bermudians turned to maritime trades.",
      "reference": "turn to maritime trades",
      "metrics": {
        "f1": 0.923076923076923
      },
      "elapsed_time": 0.11689949035644531,
      "timestamp": "2026-01-05T13:59:57.593472"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9572",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 103424,
      "natural_ratio": 0.7890625,
      "max_context_tokens": 131072,
      "context_length": 206786,
      "question_length": 22,
      "prediction": "Peter",
      "reference": "Sullivan.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.8123056888580322,
      "timestamp": "2026-01-05T14:00:00.406126"
    },
    {
      "sample_id": "squad_572eaf07dfa6aa1500f8d2ba",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 304,
      "natural_ratio": 0.0023193359375,
      "max_context_tokens": 131072,
      "context_length": 513,
      "question_length": 56,
      "prediction": "up to 500,000",
      "reference": "United States, which was already home to over 10,000 Salvadorans, making Salvadorans Americans the fourth-largest Hispanic and Latino American group",
      "metrics": {
        "f1": 0.411764705882353
      },
      "elapsed_time": 0.1313316822052002,
      "timestamp": "2026-01-05T14:00:00.537643"
    },
    {
      "sample_id": "squad_56fa08a8f34c681400b0bf7b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 565,
      "natural_ratio": 0.00431060791015625,
      "max_context_tokens": 131072,
      "context_length": 1029,
      "question_length": 62,
      "prediction": "Trust Territory of Somalia",
      "reference": "the Trust Territory of Somalia",
      "metrics": {
        "f1": 0.962962962962963
      },
      "elapsed_time": 0.07056570053100586,
      "timestamp": "2026-01-05T14:00:00.608359"
    },
    {
      "sample_id": "squad_5735e8d3012e2f140011a0d5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 337,
      "natural_ratio": 0.00257110595703125,
      "max_context_tokens": 131072,
      "context_length": 592,
      "question_length": 43,
      "prediction": "archaeologists and paleoanthropologists",
      "reference": "archaeologists and paleoanthropologists",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.11510705947875977,
      "timestamp": "2026-01-05T14:00:00.723609"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4207",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 112441,
      "natural_ratio": 0.8578567504882812,
      "max_context_tokens": 131072,
      "context_length": 224790,
      "question_length": 52,
      "prediction": "British",
      "reference": "Mycroft",
      "metrics": {
        "f1": 0.30769230769230765
      },
      "elapsed_time": 3.858017683029175,
      "timestamp": "2026-01-05T14:00:04.581996"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21951",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 54154,
      "natural_ratio": 0.4131622314453125,
      "max_context_tokens": 131072,
      "context_length": 108198,
      "question_length": 70,
      "prediction": "the Dark Lady",
      "reference": "The lady Queen Elisabeth",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 1.4941606521606445,
      "timestamp": "2026-01-05T14:00:06.076856"
    },
    {
      "sample_id": "squad_57280e204b864d1900164335",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 406,
      "natural_ratio": 0.0030975341796875,
      "max_context_tokens": 131072,
      "context_length": 667,
      "question_length": 105,
      "prediction": "34",
      "reference": "34",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0513460636138916,
      "timestamp": "2026-01-05T14:00:06.128374"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_27221",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 62382,
      "natural_ratio": 0.4759368896484375,
      "max_context_tokens": 131072,
      "context_length": 124676,
      "question_length": 48,
      "prediction": "Jessica doesn't bleed, suggesting she is a zombie and that the arrow hit an artery in her arm that has since been removed.",
      "reference": "She doesn't bleed.",
      "metrics": {
        "f1": 0.6486486486486487
      },
      "elapsed_time": 2.4182004928588867,
      "timestamp": "2026-01-05T14:00:08.546917"
    },
    {
      "sample_id": "squad_56de3a30cffd8e1900b4b69c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 439,
      "natural_ratio": 0.00334930419921875,
      "max_context_tokens": 131072,
      "context_length": 749,
      "question_length": 90,
      "prediction": "The Crusades",
      "reference": "the Crusades",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05914115905761719,
      "timestamp": "2026-01-05T14:00:08.606220"
    },
    {
      "sample_id": "squad_570716129e06ca38007e93df",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 373,
      "natural_ratio": 0.00284576416015625,
      "max_context_tokens": 131072,
      "context_length": 656,
      "question_length": 50,
      "prediction": "Durango",
      "reference": "Durango",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04526233673095703,
      "timestamp": "2026-01-05T14:00:08.651605"
    },
    {
      "sample_id": "squad_572827feff5b5019007d9e3e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 357,
      "natural_ratio": 0.00272369384765625,
      "max_context_tokens": 131072,
      "context_length": 636,
      "question_length": 39,
      "prediction": "torrential rain",
      "reference": "torrential rain",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.054413557052612305,
      "timestamp": "2026-01-05T14:00:08.706133"
    },
    {
      "sample_id": "squad_5726b0b6708984140094cdea",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 455,
      "natural_ratio": 0.00347137451171875,
      "max_context_tokens": 131072,
      "context_length": 808,
      "question_length": 63,
      "prediction": "scientific medicine and social medicine",
      "reference": "social medicine",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.07589602470397949,
      "timestamp": "2026-01-05T14:00:08.782145"
    },
    {
      "sample_id": "squad_570d3c44b3d812140066d5bd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 319,
      "natural_ratio": 0.00243377685546875,
      "max_context_tokens": 131072,
      "context_length": 559,
      "question_length": 40,
      "prediction": "Range and height of the target",
      "reference": "Range and height of the target",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08541059494018555,
      "timestamp": "2026-01-05T14:00:08.867680"
    },
    {
      "sample_id": "squad_5727beae2ca10214002d9548",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 303,
      "natural_ratio": 0.00231170654296875,
      "max_context_tokens": 131072,
      "context_length": 534,
      "question_length": 33,
      "prediction": "highways",
      "reference": "highways",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04351305961608887,
      "timestamp": "2026-01-05T14:00:08.911308"
    },
    {
      "sample_id": "squad_570e6b5f0b85d914000d7ec1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 316,
      "natural_ratio": 0.002410888671875,
      "max_context_tokens": 131072,
      "context_length": 543,
      "question_length": 50,
      "prediction": "Cfb",
      "reference": "Cfb",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04334664344787598,
      "timestamp": "2026-01-05T14:00:08.954767"
    },
    {
      "sample_id": "squad_570fa65280d9841400ab360b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 343,
      "natural_ratio": 0.00261688232421875,
      "max_context_tokens": 131072,
      "context_length": 598,
      "question_length": 49,
      "prediction": "The study found that women's genitals became aroused to both human and non-human stimuli from movies showing humans of both genders having sex (heterosexual and homosexual) and from videos showing non-human primates (bonobos) having sex.",
      "reference": "both human and nonhuman stimuli",
      "metrics": {
        "f1": 0.6341463414634146
      },
      "elapsed_time": 0.509251594543457,
      "timestamp": "2026-01-05T14:00:09.464157"
    },
    {
      "sample_id": "squad_5726783af1498d1400e8e0ca",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 318,
      "natural_ratio": 0.0024261474609375,
      "max_context_tokens": 131072,
      "context_length": 534,
      "question_length": 63,
      "prediction": "seven",
      "reference": "seven",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03540992736816406,
      "timestamp": "2026-01-05T14:00:09.499690"
    },
    {
      "sample_id": "squad_572e9f64cb0c0d14000f1389",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 456,
      "natural_ratio": 0.00347900390625,
      "max_context_tokens": 131072,
      "context_length": 756,
      "question_length": 116,
      "prediction": "Prussia and Frederick",
      "reference": "Frederick and Prussia",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0654458999633789,
      "timestamp": "2026-01-05T14:00:09.565251"
    },
    {
      "sample_id": "squad_56d4fa2e2ccc5a1400d833cc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 920,
      "natural_ratio": 0.00701904296875,
      "max_context_tokens": 131072,
      "context_length": 1728,
      "question_length": 73,
      "prediction": "I Was Here",
      "reference": "I Was Here",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06195878982543945,
      "timestamp": "2026-01-05T14:00:09.627330"
    },
    {
      "sample_id": "squad_572808403acd2414000df2cb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 379,
      "natural_ratio": 0.00289154052734375,
      "max_context_tokens": 131072,
      "context_length": 692,
      "question_length": 26,
      "prediction": "Good Pope John",
      "reference": "Good Pope John",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.054526329040527344,
      "timestamp": "2026-01-05T14:00:09.681975"
    },
    {
      "sample_id": "squad_56d66c621c8504140094710d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 232,
      "natural_ratio": 0.00177001953125,
      "max_context_tokens": 131072,
      "context_length": 349,
      "question_length": 76,
      "prediction": "arrest",
      "reference": "threat of arrest",
      "metrics": {
        "f1": 0.7142857142857143
      },
      "elapsed_time": 0.04049229621887207,
      "timestamp": "2026-01-05T14:00:09.722589"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2978",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 64161,
      "natural_ratio": 0.48950958251953125,
      "max_context_tokens": 131072,
      "context_length": 128239,
      "question_length": 44,
      "prediction": "他的女儿被杀，为了报仇，从香港来到洛杉矶。",
      "reference": "James Carter convinced him to come to L.A. with him when Lee took Carter to the airport.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.7354700565338135,
      "timestamp": "2026-01-05T14:00:12.458380"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14037",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93034,
      "natural_ratio": 0.7097930908203125,
      "max_context_tokens": 131072,
      "context_length": 185986,
      "question_length": 42,
      "prediction": "the Fairy Blackstick cast a spell on the lions, making them unable to harm Rosalba",
      "reference": "They are the same lions she grew up with as a child.",
      "metrics": {
        "f1": 0.8292682926829269
      },
      "elapsed_time": 3.3640036582946777,
      "timestamp": "2026-01-05T14:00:15.822749"
    },
    {
      "sample_id": "squad_570dec460dc6ce1900204d0f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 303,
      "natural_ratio": 0.00231170654296875,
      "max_context_tokens": 131072,
      "context_length": 518,
      "question_length": 48,
      "prediction": "pole antarctique",
      "reference": "pole antartike",
      "metrics": {
        "f1": 0.8333333333333333
      },
      "elapsed_time": 0.08182287216186523,
      "timestamp": "2026-01-05T14:00:15.904745"
    },
    {
      "sample_id": "squad_57276f39f1498d1400e8f809",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 304,
      "natural_ratio": 0.0023193359375,
      "max_context_tokens": 131072,
      "context_length": 514,
      "question_length": 54,
      "prediction": "West River",
      "reference": "West River",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04425168037414551,
      "timestamp": "2026-01-05T14:00:15.949128"
    },
    {
      "sample_id": "squad_5733f8f0d058e614000b66d0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 347,
      "natural_ratio": 0.00264739990234375,
      "max_context_tokens": 131072,
      "context_length": 608,
      "question_length": 47,
      "prediction": "Portuguese Escudo",
      "reference": "Portuguese Escudo",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07689237594604492,
      "timestamp": "2026-01-05T14:00:16.026178"
    },
    {
      "sample_id": "squad_57280ac93acd2414000df2ff",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 450,
      "natural_ratio": 0.0034332275390625,
      "max_context_tokens": 131072,
      "context_length": 799,
      "question_length": 62,
      "prediction": "80 GB",
      "reference": "80 GB",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0586087703704834,
      "timestamp": "2026-01-05T14:00:16.084922"
    },
    {
      "sample_id": "squad_5726240738643c19005ad078",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 392,
      "natural_ratio": 0.00299072265625,
      "max_context_tokens": 131072,
      "context_length": 693,
      "question_length": 51,
      "prediction": "Sir James Lancaster",
      "reference": "Sir James Lancaster",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05566835403442383,
      "timestamp": "2026-01-05T14:00:16.140711"
    },
    {
      "sample_id": "squad_5727b1c13acd2414000de9eb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 418,
      "natural_ratio": 0.0031890869140625,
      "max_context_tokens": 131072,
      "context_length": 786,
      "question_length": 11,
      "prediction": "Advaita",
      "reference": "Advaita",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.054894447326660156,
      "timestamp": "2026-01-05T14:00:16.195722"
    },
    {
      "sample_id": "squad_572fabd004bcaa1900d76baa",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 327,
      "natural_ratio": 0.00249481201171875,
      "max_context_tokens": 131072,
      "context_length": 554,
      "question_length": 61,
      "prediction": "4%",
      "reference": "4%",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04422783851623535,
      "timestamp": "2026-01-05T14:00:16.240068"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18864",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 89995,
      "natural_ratio": 0.6866073608398438,
      "max_context_tokens": 131072,
      "context_length": 179911,
      "question_length": 40,
      "prediction": "a conditional equation",
      "reference": "A problem.",
      "metrics": {
        "f1": 0.45454545454545453
      },
      "elapsed_time": 3.2661964893341064,
      "timestamp": "2026-01-05T14:00:19.506607"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9362",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 38305,
      "natural_ratio": 0.29224395751953125,
      "max_context_tokens": 131072,
      "context_length": 76509,
      "question_length": 62,
      "prediction": "He speculates that it is the source of energy for the wheel, as well as having medicinal properties.",
      "reference": "The crystal emits a form of radiation that destroys unhealty tissue, but does not harm what is healthy.",
      "metrics": {
        "f1": 0.909090909090909
      },
      "elapsed_time": 1.152963399887085,
      "timestamp": "2026-01-05T14:00:20.659886"
    },
    {
      "sample_id": "squad_570b0e40ec8fbc190045b813",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 365,
      "natural_ratio": 0.00278472900390625,
      "max_context_tokens": 131072,
      "context_length": 640,
      "question_length": 50,
      "prediction": "the Julian calendar as used prior to Pope Gregory XIII's time",
      "reference": "Julian calendar",
      "metrics": {
        "f1": 0.3947368421052631
      },
      "elapsed_time": 0.15210628509521484,
      "timestamp": "2026-01-05T14:00:20.812168"
    },
    {
      "sample_id": "squad_56d38ac959d6e414001466cf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 319,
      "natural_ratio": 0.00243377685546875,
      "max_context_tokens": 131072,
      "context_length": 540,
      "question_length": 58,
      "prediction": "tuberculosis",
      "reference": "tuberculosis",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05524563789367676,
      "timestamp": "2026-01-05T14:00:20.867547"
    },
    {
      "sample_id": "squad_570ce8b4fed7b91900d45ac6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 562,
      "natural_ratio": 0.0042877197265625,
      "max_context_tokens": 131072,
      "context_length": 1052,
      "question_length": 32,
      "prediction": "fiberglass covered in wood laminate",
      "reference": "fiberglass covered in wood laminate",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08654403686523438,
      "timestamp": "2026-01-05T14:00:20.954215"
    },
    {
      "sample_id": "squad_56df7f755ca0a614008f9b62",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 139,
      "natural_ratio": 0.00106048583984375,
      "max_context_tokens": 131072,
      "context_length": 168,
      "question_length": 70,
      "prediction": "Northern",
      "reference": "north",
      "metrics": {
        "f1": 0.7692307692307693
      },
      "elapsed_time": 0.035904645919799805,
      "timestamp": "2026-01-05T14:00:20.990240"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11868",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115074,
      "natural_ratio": 0.8779449462890625,
      "max_context_tokens": 131072,
      "context_length": 230075,
      "question_length": 33,
      "prediction": "Jane didn't tell the police that she was pregnant, instead she dropped out of school.",
      "reference": "She is pregnant",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 4.173385858535767,
      "timestamp": "2026-01-05T14:00:25.163968"
    },
    {
      "sample_id": "squad_5726ca635951b619008f7e35",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 687,
      "natural_ratio": 0.00524139404296875,
      "max_context_tokens": 131072,
      "context_length": 1274,
      "question_length": 60,
      "prediction": "HIV/AIDS",
      "reference": "HIV/AIDS",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06608986854553223,
      "timestamp": "2026-01-05T14:00:25.230578"
    },
    {
      "sample_id": "squad_57279e8c4b864d19001638ae",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 443,
      "natural_ratio": 0.00337982177734375,
      "max_context_tokens": 131072,
      "context_length": 769,
      "question_length": 78,
      "prediction": "New Haven Black Panther trials",
      "reference": "New Haven Black Panther trials",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07628154754638672,
      "timestamp": "2026-01-05T14:00:25.306985"
    },
    {
      "sample_id": "squad_572b9405be1ee31400cb8450",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 460,
      "natural_ratio": 0.003509521484375,
      "max_context_tokens": 131072,
      "context_length": 841,
      "question_length": 39,
      "prediction": "Thomas Alva Edison",
      "reference": "Thomas Alva Edison",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06583309173583984,
      "timestamp": "2026-01-05T14:00:25.372940"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24943",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 82462,
      "natural_ratio": 0.6291351318359375,
      "max_context_tokens": 131072,
      "context_length": 164852,
      "question_length": 32,
      "prediction": "Scott's sister",
      "reference": "She is her daughter.",
      "metrics": {
        "f1": 0.5714285714285715
      },
      "elapsed_time": 3.8927764892578125,
      "timestamp": "2026-01-05T14:00:29.266040"
    },
    {
      "sample_id": "squad_57332ba74776f41900660730",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 378,
      "natural_ratio": 0.0028839111328125,
      "max_context_tokens": 131072,
      "context_length": 601,
      "question_length": 116,
      "prediction": "Collateralized debt obligation",
      "reference": "collateralized debt obligation",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08050394058227539,
      "timestamp": "2026-01-05T14:00:29.346712"
    },
    {
      "sample_id": "squad_56dc7c9d14d3a41400c268ff",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 248,
      "natural_ratio": 0.00189208984375,
      "max_context_tokens": 131072,
      "context_length": 403,
      "question_length": 54,
      "prediction": "Margaret Thatcher",
      "reference": "Margaret Thatcher",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05176377296447754,
      "timestamp": "2026-01-05T14:00:29.398607"
    },
    {
      "sample_id": "squad_57071f039e06ca38007e9475",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 329,
      "natural_ratio": 0.00251007080078125,
      "max_context_tokens": 131072,
      "context_length": 583,
      "question_length": 35,
      "prediction": "1910",
      "reference": "1910",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06464266777038574,
      "timestamp": "2026-01-05T14:00:29.463380"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17766",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 86074,
      "natural_ratio": 0.6566925048828125,
      "max_context_tokens": 131072,
      "context_length": 172028,
      "question_length": 80,
      "prediction": "he sees them as a fresh new look",
      "reference": "He rejects them.",
      "metrics": {
        "f1": 0.5833333333333334
      },
      "elapsed_time": 4.394097805023193,
      "timestamp": "2026-01-05T14:00:33.857827"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21589",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 61500,
      "natural_ratio": 0.469207763671875,
      "max_context_tokens": 131072,
      "context_length": 122908,
      "question_length": 52,
      "prediction": "Faun",
      "reference": "A faun.",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 1.3607535362243652,
      "timestamp": "2026-01-05T14:00:35.218933"
    },
    {
      "sample_id": "squad_5731ab21b9d445190005e44f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 775,
      "natural_ratio": 0.00591278076171875,
      "max_context_tokens": 131072,
      "context_length": 1447,
      "question_length": 64,
      "prediction": "Ovid",
      "reference": "Ovid",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05539965629577637,
      "timestamp": "2026-01-05T14:00:35.274516"
    },
    {
      "sample_id": "squad_571a98164faf5e1900b8aafc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 465,
      "natural_ratio": 0.00354766845703125,
      "max_context_tokens": 131072,
      "context_length": 817,
      "question_length": 73,
      "prediction": "通过与白人结婚并融入白人社会",
      "reference": "Many of majority European ancestry and appearance \"married white\" and assimilated into white society",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.12714576721191406,
      "timestamp": "2026-01-05T14:00:35.401813"
    },
    {
      "sample_id": "squad_5727b8684b864d1900163b64",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 360,
      "natural_ratio": 0.00274658203125,
      "max_context_tokens": 131072,
      "context_length": 610,
      "question_length": 70,
      "prediction": "Anglo-Americans",
      "reference": "Anglo-American settlers",
      "metrics": {
        "f1": 0.923076923076923
      },
      "elapsed_time": 0.05420970916748047,
      "timestamp": "2026-01-05T14:00:35.456151"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19087",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 43014,
      "natural_ratio": 0.3281707763671875,
      "max_context_tokens": 131072,
      "context_length": 85941,
      "question_length": 47,
      "prediction": "An explosion killed everyone except for the robots.",
      "reference": "It exploded and everyone died. ",
      "metrics": {
        "f1": 0.8333333333333333
      },
      "elapsed_time": 1.1516432762145996,
      "timestamp": "2026-01-05T14:00:36.608112"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30131",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 88047,
      "natural_ratio": 0.6717453002929688,
      "max_context_tokens": 131072,
      "context_length": 175985,
      "question_length": 69,
      "prediction": "Julie catches him cheating on her with his secretary, Rita.",
      "reference": "He had two explosive fights with Julie.",
      "metrics": {
        "f1": 0.717948717948718
      },
      "elapsed_time": 2.209306478500366,
      "timestamp": "2026-01-05T14:00:38.817791"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21875",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 17273,
      "natural_ratio": 0.13178253173828125,
      "max_context_tokens": 131072,
      "context_length": 34471,
      "question_length": 36,
      "prediction": "Dr. Hitz",
      "reference": "Himself.",
      "metrics": {
        "f1": 0.375
      },
      "elapsed_time": 0.4271266460418701,
      "timestamp": "2026-01-05T14:00:39.245198"
    },
    {
      "sample_id": "squad_5728294a2ca10214002d9f99",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 388,
      "natural_ratio": 0.002960205078125,
      "max_context_tokens": 131072,
      "context_length": 678,
      "question_length": 59,
      "prediction": "2",
      "reference": "two",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.03577089309692383,
      "timestamp": "2026-01-05T14:00:39.281123"
    },
    {
      "sample_id": "squad_572efd2003f9891900756b24",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 611,
      "natural_ratio": 0.00466156005859375,
      "max_context_tokens": 131072,
      "context_length": 1133,
      "question_length": 50,
      "prediction": "The Tokugawa period was based on a strict class hierarchy.",
      "reference": "strict class hierarchy",
      "metrics": {
        "f1": 0.5499999999999999
      },
      "elapsed_time": 0.1618204116821289,
      "timestamp": "2026-01-05T14:00:39.443080"
    },
    {
      "sample_id": "squad_5726712af1498d1400e8dfc2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 538,
      "natural_ratio": 0.0041046142578125,
      "max_context_tokens": 131072,
      "context_length": 972,
      "question_length": 64,
      "prediction": "Its religious nature",
      "reference": "its religious nature",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05591845512390137,
      "timestamp": "2026-01-05T14:00:39.499132"
    },
    {
      "sample_id": "squad_570c2e35ec8fbc190045bd3f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 326,
      "natural_ratio": 0.0024871826171875,
      "max_context_tokens": 131072,
      "context_length": 569,
      "question_length": 44,
      "prediction": "60",
      "reference": "60",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04379916191101074,
      "timestamp": "2026-01-05T14:00:39.543049"
    },
    {
      "sample_id": "squad_5726be91dd62a815002e8f2d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 333,
      "natural_ratio": 0.00254058837890625,
      "max_context_tokens": 131072,
      "context_length": 565,
      "question_length": 62,
      "prediction": "Faleiva",
      "reference": "Faleiva",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05461311340332031,
      "timestamp": "2026-01-05T14:00:39.597784"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20349",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115683,
      "natural_ratio": 0.8825912475585938,
      "max_context_tokens": 131072,
      "context_length": 231257,
      "question_length": 69,
      "prediction": "Alex throws the necklace back inside the cabin.",
      "reference": "He throws it in the ocean",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 3.9822142124176025,
      "timestamp": "2026-01-05T14:00:43.580342"
    },
    {
      "sample_id": "squad_56ddb03066d3e219004dac9b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 330,
      "natural_ratio": 0.0025177001953125,
      "max_context_tokens": 131072,
      "context_length": 543,
      "question_length": 78,
      "prediction": "University of Technology",
      "reference": "University of Technology",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.060213565826416016,
      "timestamp": "2026-01-05T14:00:43.641072"
    },
    {
      "sample_id": "squad_5727bc74ff5b5019007d93ca",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 363,
      "natural_ratio": 0.00276947021484375,
      "max_context_tokens": 131072,
      "context_length": 649,
      "question_length": 37,
      "prediction": "Han Chinese",
      "reference": "Han Chinese",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04542732238769531,
      "timestamp": "2026-01-05T14:00:43.686630"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22566",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93492,
      "natural_ratio": 0.713287353515625,
      "max_context_tokens": 131072,
      "context_length": 186884,
      "question_length": 60,
      "prediction": "去酒店了",
      "reference": "Margot and Pauline flee to a motel.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.8660686016082764,
      "timestamp": "2026-01-05T14:00:46.553087"
    },
    {
      "sample_id": "squad_57267897dd62a815002e8638",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 624,
      "natural_ratio": 0.0047607421875,
      "max_context_tokens": 131072,
      "context_length": 1175,
      "question_length": 33,
      "prediction": "1902",
      "reference": "1902",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07604312896728516,
      "timestamp": "2026-01-05T14:00:46.629292"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14014",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 46560,
      "natural_ratio": 0.355224609375,
      "max_context_tokens": 131072,
      "context_length": 93035,
      "question_length": 46,
      "prediction": "Andrew Moreton called for an university in London in section 1.",
      "reference": "A hospital",
      "metrics": {
        "f1": 0.48275862068965514
      },
      "elapsed_time": 1.2420275211334229,
      "timestamp": "2026-01-05T14:00:47.871639"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20740",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93039,
      "natural_ratio": 0.7098312377929688,
      "max_context_tokens": 131072,
      "context_length": 185985,
      "question_length": 54,
      "prediction": "because there were fingerprints on his van",
      "reference": "To avoid negative attention if something happens Lee.",
      "metrics": {
        "f1": 0.7428571428571428
      },
      "elapsed_time": 4.131414890289307,
      "timestamp": "2026-01-05T14:00:52.003415"
    },
    {
      "sample_id": "squad_570a8df96d058f1900182f93",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 446,
      "natural_ratio": 0.0034027099609375,
      "max_context_tokens": 131072,
      "context_length": 744,
      "question_length": 109,
      "prediction": "1",
      "reference": "1",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.042755842208862305,
      "timestamp": "2026-01-05T14:00:52.046365"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8666",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93900,
      "natural_ratio": 0.716400146484375,
      "max_context_tokens": 131072,
      "context_length": 187723,
      "question_length": 38,
      "prediction": "Comte Octave",
      "reference": "Maurice",
      "metrics": {
        "f1": 0.5333333333333333
      },
      "elapsed_time": 3.0254812240600586,
      "timestamp": "2026-01-05T14:00:55.072214"
    },
    {
      "sample_id": "squad_570c30996b8089140040fbf4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 297,
      "natural_ratio": 0.00226593017578125,
      "max_context_tokens": 131072,
      "context_length": 515,
      "question_length": 40,
      "prediction": "Controlled Substances Act of 1970",
      "reference": "Controlled Substances Act of 1970",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.1384875774383545,
      "timestamp": "2026-01-05T14:00:55.210869"
    },
    {
      "sample_id": "squad_5728ec472ca10214002daa98",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 554,
      "natural_ratio": 0.0042266845703125,
      "max_context_tokens": 131072,
      "context_length": 1007,
      "question_length": 62,
      "prediction": "Shipping costs",
      "reference": "shipping costs",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.047229766845703125,
      "timestamp": "2026-01-05T14:00:55.258222"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24411",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 117540,
      "natural_ratio": 0.896759033203125,
      "max_context_tokens": 131072,
      "context_length": 234996,
      "question_length": 44,
      "prediction": "Smiley's men",
      "reference": "Prideaux.",
      "metrics": {
        "f1": 0.2222222222222222
      },
      "elapsed_time": 4.061279773712158,
      "timestamp": "2026-01-05T14:00:59.319857"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17573",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 71296,
      "natural_ratio": 0.5439453125,
      "max_context_tokens": 131072,
      "context_length": 142518,
      "question_length": 34,
      "prediction": "Septimius",
      "reference": "Lucius Septimius",
      "metrics": {
        "f1": 0.8235294117647058
      },
      "elapsed_time": 2.621926784515381,
      "timestamp": "2026-01-05T14:01:01.942502"
    },
    {
      "sample_id": "squad_5730f90c497a881900248ab5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 701,
      "natural_ratio": 0.00534820556640625,
      "max_context_tokens": 131072,
      "context_length": 1264,
      "question_length": 99,
      "prediction": "original innocence",
      "reference": "the Fathers saw foreshadowings of Mary's \"wondrous abundance of divine gifts and original innocence",
      "metrics": {
        "f1": 0.6060606060606061
      },
      "elapsed_time": 0.05704832077026367,
      "timestamp": "2026-01-05T14:01:01.999728"
    },
    {
      "sample_id": "squad_570aa9644103511400d598f0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 553,
      "natural_ratio": 0.00421905517578125,
      "max_context_tokens": 131072,
      "context_length": 995,
      "question_length": 72,
      "prediction": "gate-to-gate",
      "reference": "all phases",
      "metrics": {
        "f1": 0.30769230769230765
      },
      "elapsed_time": 0.06707477569580078,
      "timestamp": "2026-01-05T14:01:02.066932"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18848",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 90017,
      "natural_ratio": 0.6867752075195312,
      "max_context_tokens": 131072,
      "context_length": 179911,
      "question_length": 84,
      "prediction": "negative",
      "reference": "They are easily memorized",
      "metrics": {
        "f1": 0.38095238095238093
      },
      "elapsed_time": 0.16241192817687988,
      "timestamp": "2026-01-05T14:01:02.229612"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10079",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 94822,
      "natural_ratio": 0.7234344482421875,
      "max_context_tokens": 131072,
      "context_length": 189560,
      "question_length": 45,
      "prediction": "pregnant",
      "reference": "She is pregnant.",
      "metrics": {
        "f1": 0.7368421052631579
      },
      "elapsed_time": 3.1194393634796143,
      "timestamp": "2026-01-05T14:01:05.349387"
    },
    {
      "sample_id": "squad_5732696fe17f3d140042295f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 518,
      "natural_ratio": 0.0039520263671875,
      "max_context_tokens": 131072,
      "context_length": 931,
      "question_length": 65,
      "prediction": "The failure of negotiations prior to 1955 was due to Russia's refusal to permit inspections.",
      "reference": "refusal of the Russians to permit any sort of inspections",
      "metrics": {
        "f1": 0.7804878048780487
      },
      "elapsed_time": 0.26296401023864746,
      "timestamp": "2026-01-05T14:01:05.612529"
    },
    {
      "sample_id": "squad_56e16a26e3433e1400422ed7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 422,
      "natural_ratio": 0.0032196044921875,
      "max_context_tokens": 131072,
      "context_length": 776,
      "question_length": 28,
      "prediction": "Robert Boyle",
      "reference": "Robert Boyle",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.046907663345336914,
      "timestamp": "2026-01-05T14:01:05.659590"
    },
    {
      "sample_id": "squad_572b5afd34ae481900deadc0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 543,
      "natural_ratio": 0.00414276123046875,
      "max_context_tokens": 131072,
      "context_length": 979,
      "question_length": 68,
      "prediction": "20 to 1",
      "reference": "20 to 1",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08008813858032227,
      "timestamp": "2026-01-05T14:01:05.739815"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22194",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 116897,
      "natural_ratio": 0.8918533325195312,
      "max_context_tokens": 131072,
      "context_length": 233700,
      "question_length": 55,
      "prediction": "grandfather",
      "reference": "He is their grandfather",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 4.3334150314331055,
      "timestamp": "2026-01-05T14:01:10.073583"
    },
    {
      "sample_id": "squad_573247070fdd8d15006c68c7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 650,
      "natural_ratio": 0.0049591064453125,
      "max_context_tokens": 131072,
      "context_length": 1202,
      "question_length": 58,
      "prediction": "divine guidance",
      "reference": "divine guidance",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06327056884765625,
      "timestamp": "2026-01-05T14:01:10.137372"
    },
    {
      "sample_id": "squad_56df7edd56340a1900b29c2f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 424,
      "natural_ratio": 0.00323486328125,
      "max_context_tokens": 131072,
      "context_length": 751,
      "question_length": 57,
      "prediction": "English Civil War",
      "reference": "English Civil War",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05646538734436035,
      "timestamp": "2026-01-05T14:01:10.193966"
    },
    {
      "sample_id": "squad_5726b450708984140094ce60",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 658,
      "natural_ratio": 0.0050201416015625,
      "max_context_tokens": 131072,
      "context_length": 1229,
      "question_length": 47,
      "prediction": "2500 BC",
      "reference": "2500 BC",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08002328872680664,
      "timestamp": "2026-01-05T14:01:10.274111"
    },
    {
      "sample_id": "squad_572844f92ca10214002da213",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 668,
      "natural_ratio": 0.005096435546875,
      "max_context_tokens": 131072,
      "context_length": 1256,
      "question_length": 41,
      "prediction": "南部阿富汗",
      "reference": "southern Afghanistan",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04927992820739746,
      "timestamp": "2026-01-05T14:01:10.323517"
    },
    {
      "sample_id": "squad_572c9ab62babe914003c29a7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 610,
      "natural_ratio": 0.0046539306640625,
      "max_context_tokens": 131072,
      "context_length": 1107,
      "question_length": 73,
      "prediction": "23.8%",
      "reference": "23.8",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.07612204551696777,
      "timestamp": "2026-01-05T14:01:10.399759"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5294",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 102512,
      "natural_ratio": 0.7821044921875,
      "max_context_tokens": 131072,
      "context_length": 204946,
      "question_length": 39,
      "prediction": "Julian Grendel",
      "reference": "Julian Grendlel. ",
      "metrics": {
        "f1": 0.9565217391304348
      },
      "elapsed_time": 2.667630672454834,
      "timestamp": "2026-01-05T14:01:13.067763"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24864",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 84560,
      "natural_ratio": 0.6451416015625,
      "max_context_tokens": 131072,
      "context_length": 169053,
      "question_length": 27,
      "prediction": "Rene is of European nationality.",
      "reference": "French",
      "metrics": {
        "f1": 0.3809523809523809
      },
      "elapsed_time": 2.596924304962158,
      "timestamp": "2026-01-05T14:01:15.665073"
    },
    {
      "sample_id": "squad_571a076110f8ca1400304edd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 548,
      "natural_ratio": 0.004180908203125,
      "max_context_tokens": 131072,
      "context_length": 1015,
      "question_length": 41,
      "prediction": "$231 billion",
      "reference": "$231 billion",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08293724060058594,
      "timestamp": "2026-01-05T14:01:15.748173"
    },
    {
      "sample_id": "squad_572825df2ca10214002d9f08",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 485,
      "natural_ratio": 0.00370025634765625,
      "max_context_tokens": 131072,
      "context_length": 887,
      "question_length": 43,
      "prediction": "Jordan River",
      "reference": "Jordan River",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04637265205383301,
      "timestamp": "2026-01-05T14:01:15.794689"
    },
    {
      "sample_id": "squad_57261df0ec44d21400f3d909",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 477,
      "natural_ratio": 0.00363922119140625,
      "max_context_tokens": 131072,
      "context_length": 869,
      "question_length": 46,
      "prediction": "The Grand Service was originally made for the Prince of Wales, later George IV.",
      "reference": "Prince of Wales",
      "metrics": {
        "f1": 0.3191489361702127
      },
      "elapsed_time": 0.18887877464294434,
      "timestamp": "2026-01-05T14:01:15.983717"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26939",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 67445,
      "natural_ratio": 0.5145645141601562,
      "max_context_tokens": 131072,
      "context_length": 134786,
      "question_length": 65,
      "prediction": "Amber wins the title of Minnesota American Teen Princess by impressing the judges with her talent performance, which includes a tap dancing routine.",
      "reference": "all of the other girls get food poisoning from seafood",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 3.253600597381592,
      "timestamp": "2026-01-05T14:01:19.237715"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21550",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115289,
      "natural_ratio": 0.8795852661132812,
      "max_context_tokens": 131072,
      "context_length": 230452,
      "question_length": 87,
      "prediction": "It melts open",
      "reference": "It bends",
      "metrics": {
        "f1": 0.6666666666666665
      },
      "elapsed_time": 3.898874044418335,
      "timestamp": "2026-01-05T14:01:23.136942"
    },
    {
      "sample_id": "squad_570abc016d058f19001830b8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 331,
      "natural_ratio": 0.00252532958984375,
      "max_context_tokens": 131072,
      "context_length": 595,
      "question_length": 28,
      "prediction": "heavy aircraft with full loadouts",
      "reference": "deployment of heavy jets with full loadouts",
      "metrics": {
        "f1": 0.8333333333333333
      },
      "elapsed_time": 0.09198951721191406,
      "timestamp": "2026-01-05T14:01:23.229456"
    },
    {
      "sample_id": "squad_572952a31d046914007792a2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 460,
      "natural_ratio": 0.003509521484375,
      "max_context_tokens": 131072,
      "context_length": 849,
      "question_length": 32,
      "prediction": "behind the hypopharynx",
      "reference": "behind the hypopharynx",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09685611724853516,
      "timestamp": "2026-01-05T14:01:23.326442"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17005",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93371,
      "natural_ratio": 0.7123641967773438,
      "max_context_tokens": 131072,
      "context_length": 186676,
      "question_length": 26,
      "prediction": "Osip",
      "reference": "Osip",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.397451639175415,
      "timestamp": "2026-01-05T14:01:26.724262"
    },
    {
      "sample_id": "squad_5726c4905951b619008f7d9c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 503,
      "natural_ratio": 0.00383758544921875,
      "max_context_tokens": 131072,
      "context_length": 908,
      "question_length": 58,
      "prediction": "Queen + Wyclef Jean",
      "reference": "Wyclef Jean",
      "metrics": {
        "f1": 0.7333333333333334
      },
      "elapsed_time": 0.09550356864929199,
      "timestamp": "2026-01-05T14:01:26.819924"
    },
    {
      "sample_id": "squad_56cec79caab44d1400b88a36",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 546,
      "natural_ratio": 0.0041656494140625,
      "max_context_tokens": 131072,
      "context_length": 1023,
      "question_length": 30,
      "prediction": "3,000",
      "reference": "around 3,000",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.08021116256713867,
      "timestamp": "2026-01-05T14:01:26.900268"
    },
    {
      "sample_id": "squad_5726dad6708984140094d3ad",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 592,
      "natural_ratio": 0.0045166015625,
      "max_context_tokens": 131072,
      "context_length": 1068,
      "question_length": 76,
      "prediction": "two and a half months",
      "reference": "two and a half months",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07967686653137207,
      "timestamp": "2026-01-05T14:01:26.980086"
    },
    {
      "sample_id": "squad_57318751e6313a140071d03f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 340,
      "natural_ratio": 0.002593994140625,
      "max_context_tokens": 131072,
      "context_length": 554,
      "question_length": 87,
      "prediction": "Urban town life and medieval guilds",
      "reference": "urban town life and medieval guilds",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09936118125915527,
      "timestamp": "2026-01-05T14:01:27.079580"
    },
    {
      "sample_id": "squad_57269e385951b619008f77ee",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 468,
      "natural_ratio": 0.003570556640625,
      "max_context_tokens": 131072,
      "context_length": 867,
      "question_length": 30,
      "prediction": "steel",
      "reference": "steel",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03515458106994629,
      "timestamp": "2026-01-05T14:01:27.114859"
    },
    {
      "sample_id": "squad_5728c8083acd2414000dfe3f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 381,
      "natural_ratio": 0.00290679931640625,
      "max_context_tokens": 131072,
      "context_length": 650,
      "question_length": 72,
      "prediction": "correspondence",
      "reference": "truth as correspondence",
      "metrics": {
        "f1": 0.761904761904762
      },
      "elapsed_time": 0.05463075637817383,
      "timestamp": "2026-01-05T14:01:27.169614"
    },
    {
      "sample_id": "squad_57279338708984140094e153",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 469,
      "natural_ratio": 0.00357818603515625,
      "max_context_tokens": 131072,
      "context_length": 878,
      "question_length": 21,
      "prediction": "Pedobaptism is the practice of baptizing infants.",
      "reference": "infant baptism",
      "metrics": {
        "f1": 0.6896551724137931
      },
      "elapsed_time": 0.1476457118988037,
      "timestamp": "2026-01-05T14:01:27.317384"
    },
    {
      "sample_id": "squad_5719ca564faf5e1900b8a7f7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 466,
      "natural_ratio": 0.0035552978515625,
      "max_context_tokens": 131072,
      "context_length": 836,
      "question_length": 57,
      "prediction": "Anti-Chinese riots",
      "reference": "anti-Chinese riots",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06569480895996094,
      "timestamp": "2026-01-05T14:01:27.383200"
    },
    {
      "sample_id": "squad_57278d02f1498d1400e8fbc9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 330,
      "natural_ratio": 0.0025177001953125,
      "max_context_tokens": 131072,
      "context_length": 568,
      "question_length": 53,
      "prediction": "elegance",
      "reference": "elegance",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05451679229736328,
      "timestamp": "2026-01-05T14:01:27.437837"
    },
    {
      "sample_id": "squad_570d2ad5fed7b91900d45c8c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 732,
      "natural_ratio": 0.005584716796875,
      "max_context_tokens": 131072,
      "context_length": 1379,
      "question_length": 46,
      "prediction": "bulky",
      "reference": "relatively bulky and somewhat outdated",
      "metrics": {
        "f1": 0.4166666666666667
      },
      "elapsed_time": 0.04902362823486328,
      "timestamp": "2026-01-05T14:01:27.486981"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9641",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 118759,
      "natural_ratio": 0.9060592651367188,
      "max_context_tokens": 131072,
      "context_length": 237440,
      "question_length": 38,
      "prediction": "Tom finds Maggie in a bedroom, she is in a body that has been mistreated. Tom holds her.",
      "reference": "Embraces his wife.",
      "metrics": {
        "f1": 0.6875
      },
      "elapsed_time": 4.239866018295288,
      "timestamp": "2026-01-05T14:01:31.727162"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29036",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 35751,
      "natural_ratio": 0.27275848388671875,
      "max_context_tokens": 131072,
      "context_length": 71424,
      "question_length": 38,
      "prediction": "The Flood attacked the Forerunner stronghold world of Charum Hakkor, and the Magos of the ecumene attempted to activate the Halos there to destroy both the Flood and the Forerunners. The activation was successful, but the act of doing so apparently \"woke\" the Didact, who awoke on Charum Hakkor and began to rebuild the Forerunner fleet to retake the Halo Array. The Forerunners were unable to prevent this and were eventually forced to flee to their refuge world of Onyx. The Didact pursued them to Onyx, and the two engaged in a climactic battle that devastated the planet, but ended with the Didact being defeated and banished by his former pupil. With the aid of the Librarian, the Forerunners then retreated to the sanctuary world of the Mantle, where they would remain until the Human-Forerunner War.",
      "reference": "Interstellar war",
      "metrics": {
        "f1": 0.5263157894736842
      },
      "elapsed_time": 2.7469396591186523,
      "timestamp": "2026-01-05T14:01:34.474795"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22138",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 20390,
      "natural_ratio": 0.1555633544921875,
      "max_context_tokens": 131072,
      "context_length": 40681,
      "question_length": 60,
      "prediction": "他听到时钟滴答作响的声音，看到河水流动。",
      "reference": "Whispered voices and strange constellations",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.5627477169036865,
      "timestamp": "2026-01-05T14:01:35.037853"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11589",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101503,
      "natural_ratio": 0.7744064331054688,
      "max_context_tokens": 131072,
      "context_length": 202913,
      "question_length": 53,
      "prediction": "Alsace countryside",
      "reference": "A combat sports training facility near Strasbourg. ",
      "metrics": {
        "f1": 0.7878787878787878
      },
      "elapsed_time": 3.095737934112549,
      "timestamp": "2026-01-05T14:01:38.133972"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9438",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 61424,
      "natural_ratio": 0.4686279296875,
      "max_context_tokens": 131072,
      "context_length": 122773,
      "question_length": 36,
      "prediction": "Apollo carries the bow.",
      "reference": "A golden bow.",
      "metrics": {
        "f1": 0.6153846153846153
      },
      "elapsed_time": 1.8851356506347656,
      "timestamp": "2026-01-05T14:01:40.019411"
    },
    {
      "sample_id": "squad_5727131c5951b619008f8585",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 548,
      "natural_ratio": 0.004180908203125,
      "max_context_tokens": 131072,
      "context_length": 941,
      "question_length": 115,
      "prediction": "Mouse",
      "reference": "mice",
      "metrics": {
        "f1": 0.4444444444444445
      },
      "elapsed_time": 0.04159092903137207,
      "timestamp": "2026-01-05T14:01:40.061185"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24689",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 121052,
      "natural_ratio": 0.923553466796875,
      "max_context_tokens": 131072,
      "context_length": 242002,
      "question_length": 63,
      "prediction": "H stands for hoaxer.",
      "reference": "Hoaxer",
      "metrics": {
        "f1": 0.4615384615384615
      },
      "elapsed_time": 4.386166095733643,
      "timestamp": "2026-01-05T14:01:44.447751"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14502",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 62041,
      "natural_ratio": 0.47333526611328125,
      "max_context_tokens": 131072,
      "context_length": 123995,
      "question_length": 47,
      "prediction": "Because he was afraid Ruth's God would be angry if he married her.",
      "reference": "Because he was unwilling to jeopardize the inheritance of his own estate. ",
      "metrics": {
        "f1": 0.8636363636363636
      },
      "elapsed_time": 1.8249552249908447,
      "timestamp": "2026-01-05T14:01:46.273389"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26523",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 123245,
      "natural_ratio": 0.9402847290039062,
      "max_context_tokens": 131072,
      "context_length": 246399,
      "question_length": 51,
      "prediction": "His son is dead in Italy.",
      "reference": "The wildflowers.",
      "metrics": {
        "f1": 0.7692307692307693
      },
      "elapsed_time": 3.1222455501556396,
      "timestamp": "2026-01-05T14:01:49.395988"
    },
    {
      "sample_id": "squad_572683c7dd62a815002e87e2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 561,
      "natural_ratio": 0.00428009033203125,
      "max_context_tokens": 131072,
      "context_length": 1003,
      "question_length": 80,
      "prediction": "the Mongol invasions",
      "reference": "The Pagan Kingdom fell due to the Mongol invasions and several warring states emerged",
      "metrics": {
        "f1": 0.787878787878788
      },
      "elapsed_time": 0.10088133811950684,
      "timestamp": "2026-01-05T14:01:49.497416"
    },
    {
      "sample_id": "squad_56f8245fa6d7ea1400e17405",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 367,
      "natural_ratio": 0.00279998779296875,
      "max_context_tokens": 131072,
      "context_length": 655,
      "question_length": 40,
      "prediction": "1454",
      "reference": "1454",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06671714782714844,
      "timestamp": "2026-01-05T14:01:49.564275"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22979",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101280,
      "natural_ratio": 0.772705078125,
      "max_context_tokens": 131072,
      "context_length": 202501,
      "question_length": 20,
      "prediction": "Mental dies when Harry fires a pepper into his mouth.",
      "reference": "Harry and Lloyd give him rat poison pills believing that the pills are medication.",
      "metrics": {
        "f1": 0.8205128205128205
      },
      "elapsed_time": 3.5809168815612793,
      "timestamp": "2026-01-05T14:01:53.145544"
    },
    {
      "sample_id": "squad_5734580c879d6814001ca545",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 402,
      "natural_ratio": 0.0030670166015625,
      "max_context_tokens": 131072,
      "context_length": 689,
      "question_length": 76,
      "prediction": "four years",
      "reference": "four",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.05036020278930664,
      "timestamp": "2026-01-05T14:01:53.196062"
    },
    {
      "sample_id": "squad_5728d8be4b864d1900164f6d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 380,
      "natural_ratio": 0.002899169921875,
      "max_context_tokens": 131072,
      "context_length": 640,
      "question_length": 81,
      "prediction": "hunting and fishing",
      "reference": "Hunting and fishing",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06554913520812988,
      "timestamp": "2026-01-05T14:01:53.261739"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30972",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 57903,
      "natural_ratio": 0.44176483154296875,
      "max_context_tokens": 131072,
      "context_length": 115683,
      "question_length": 84,
      "prediction": "She plans to test whether he is as interested in her as she believes.",
      "reference": "To seduce him.",
      "metrics": {
        "f1": 0.6206896551724139
      },
      "elapsed_time": 1.6581072807312012,
      "timestamp": "2026-01-05T14:01:54.920172"
    },
    {
      "sample_id": "squad_56f7366c711bf01900a44a66",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 234,
      "natural_ratio": 0.0017852783203125,
      "max_context_tokens": 131072,
      "context_length": 333,
      "question_length": 95,
      "prediction": "1945",
      "reference": "1945",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06527423858642578,
      "timestamp": "2026-01-05T14:01:54.985640"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20599",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 104586,
      "natural_ratio": 0.7979278564453125,
      "max_context_tokens": 131072,
      "context_length": 209098,
      "question_length": 35,
      "prediction": "Dr. Evil和Fat Bastard",
      "reference": "Dr. Evil and Frau Farbissina",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 2.5740623474121094,
      "timestamp": "2026-01-05T14:01:57.560154"
    },
    {
      "sample_id": "squad_56e16caee3433e1400422f06",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 214,
      "natural_ratio": 0.0016326904296875,
      "max_context_tokens": 131072,
      "context_length": 338,
      "question_length": 50,
      "prediction": "The Germanic superstrate",
      "reference": "Germanic superstrate",
      "metrics": {
        "f1": 0.9090909090909091
      },
      "elapsed_time": 0.07663130760192871,
      "timestamp": "2026-01-05T14:01:57.636960"
    },
    {
      "sample_id": "squad_5728060d4b864d1900164266",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 573,
      "natural_ratio": 0.00437164306640625,
      "max_context_tokens": 131072,
      "context_length": 1015,
      "question_length": 92,
      "prediction": "took great offense",
      "reference": "took great offense",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06027388572692871,
      "timestamp": "2026-01-05T14:01:57.697389"
    },
    {
      "sample_id": "squad_57268e59708984140094c9f8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 311,
      "natural_ratio": 0.00237274169921875,
      "max_context_tokens": 131072,
      "context_length": 501,
      "question_length": 82,
      "prediction": "Yes",
      "reference": "neolithic age domestication of plants and animals and the use of polished stone tools dating to sometime between 10,000 and 6,000 BC has been discovered",
      "metrics": {
        "f1": 0.14285714285714285
      },
      "elapsed_time": 0.03429055213928223,
      "timestamp": "2026-01-05T14:01:57.731831"
    },
    {
      "sample_id": "squad_56f7d6d8aef2371900625c2c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 418,
      "natural_ratio": 0.0031890869140625,
      "max_context_tokens": 131072,
      "context_length": 743,
      "question_length": 53,
      "prediction": "odwieczna",
      "reference": "odwieczna",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06574702262878418,
      "timestamp": "2026-01-05T14:01:57.797718"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8668",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93901,
      "natural_ratio": 0.7164077758789062,
      "max_context_tokens": 131072,
      "context_length": 187723,
      "question_length": 40,
      "prediction": "Honorine left her husband and took up residence in a little house where she made artificial flowers.",
      "reference": "They grew up together because Honorine was adopted into the family. ",
      "metrics": {
        "f1": 0.9302325581395349
      },
      "elapsed_time": 0.3537321090698242,
      "timestamp": "2026-01-05T14:01:58.151761"
    },
    {
      "sample_id": "squad_57285ba53acd2414000df94a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 349,
      "natural_ratio": 0.00266265869140625,
      "max_context_tokens": 131072,
      "context_length": 594,
      "question_length": 65,
      "prediction": "Persian Achaemenid Empire",
      "reference": "Persian Achaemenid Empire",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09848141670227051,
      "timestamp": "2026-01-05T14:01:58.250386"
    },
    {
      "sample_id": "squad_57324d56b9d445190005e9ff",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 427,
      "natural_ratio": 0.00325775146484375,
      "max_context_tokens": 131072,
      "context_length": 745,
      "question_length": 69,
      "prediction": "Winston Churchill",
      "reference": "Churchill",
      "metrics": {
        "f1": 0.6923076923076924
      },
      "elapsed_time": 0.05570101737976074,
      "timestamp": "2026-01-05T14:01:58.306211"
    },
    {
      "sample_id": "squad_573031bb04bcaa1900d7733f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 458,
      "natural_ratio": 0.0034942626953125,
      "max_context_tokens": 131072,
      "context_length": 858,
      "question_length": 19,
      "prediction": "A collection of hooks to provide common functionality that can integrate with other apps, including search and sharing.",
      "reference": "collection of hooks to provide common functionality that can integrate with other apps",
      "metrics": {
        "f1": 0.8390243902439024
      },
      "elapsed_time": 0.2825484275817871,
      "timestamp": "2026-01-05T14:01:58.588956"
    },
    {
      "sample_id": "squad_572ee029cb0c0d14000f1656",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 543,
      "natural_ratio": 0.00414276123046875,
      "max_context_tokens": 131072,
      "context_length": 1025,
      "question_length": 21,
      "prediction": "Liang Ji was forced to commit suicide.",
      "reference": "forced to commit suicide",
      "metrics": {
        "f1": 0.7741935483870968
      },
      "elapsed_time": 0.12931036949157715,
      "timestamp": "2026-01-05T14:01:58.718450"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20553",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122701,
      "natural_ratio": 0.9361343383789062,
      "max_context_tokens": 131072,
      "context_length": 245309,
      "question_length": 53,
      "prediction": "Hugh Glass",
      "reference": "Joey Leubner",
      "metrics": {
        "f1": 0.3529411764705882
      },
      "elapsed_time": 3.4129247665405273,
      "timestamp": "2026-01-05T14:02:02.131736"
    },
    {
      "sample_id": "squad_572749b4708984140094dbc6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 663,
      "natural_ratio": 0.00505828857421875,
      "max_context_tokens": 131072,
      "context_length": 1195,
      "question_length": 92,
      "prediction": "four times",
      "reference": "four times",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.051436662673950195,
      "timestamp": "2026-01-05T14:02:02.183691"
    },
    {
      "sample_id": "squad_57265236708984140094c24d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 338,
      "natural_ratio": 0.0025787353515625,
      "max_context_tokens": 131072,
      "context_length": 546,
      "question_length": 90,
      "prediction": "abandoned, effectively doubling the nominal speed of many black-and-white negative films",
      "reference": "safety factors against under-exposure",
      "metrics": {
        "f1": 0.761904761904762
      },
      "elapsed_time": 0.18089866638183594,
      "timestamp": "2026-01-05T14:02:02.364728"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10561",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 114735,
      "natural_ratio": 0.8753585815429688,
      "max_context_tokens": 131072,
      "context_length": 229340,
      "question_length": 91,
      "prediction": "Mademoiselle La Rue is a French teacher to Charlotte Temple.",
      "reference": "She is her teacher",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 3.997055768966675,
      "timestamp": "2026-01-05T14:02:06.362128"
    },
    {
      "sample_id": "squad_5727f67d3acd2414000df0f0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 418,
      "natural_ratio": 0.0031890869140625,
      "max_context_tokens": 131072,
      "context_length": 754,
      "question_length": 42,
      "prediction": "Hussein Sirri Amer",
      "reference": "Hussein Sirri Amer",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09179019927978516,
      "timestamp": "2026-01-05T14:02:06.454444"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30134",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 88036,
      "natural_ratio": 0.671661376953125,
      "max_context_tokens": 131072,
      "context_length": 175985,
      "question_length": 47,
      "prediction": "Phil left her for another woman, and she started having an affair with the handyman Sykes.",
      "reference": "She rejected his advances",
      "metrics": {
        "f1": 0.6285714285714286
      },
      "elapsed_time": 0.33301496505737305,
      "timestamp": "2026-01-05T14:02:06.787774"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11014",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 32917,
      "natural_ratio": 0.25113677978515625,
      "max_context_tokens": 131072,
      "context_length": 65737,
      "question_length": 58,
      "prediction": "To Aurora",
      "reference": " How He Lied to Her Husband",
      "metrics": {
        "f1": 0.5714285714285715
      },
      "elapsed_time": 0.7730793952941895,
      "timestamp": "2026-01-05T14:02:07.561143"
    },
    {
      "sample_id": "squad_56dfe23b7aa994140058e217",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 206,
      "natural_ratio": 0.0015716552734375,
      "max_context_tokens": 131072,
      "context_length": 266,
      "question_length": 107,
      "prediction": "O'Neill's",
      "reference": "O'Neill's",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05389857292175293,
      "timestamp": "2026-01-05T14:02:07.615216"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_256",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 32304,
      "natural_ratio": 0.2464599609375,
      "max_context_tokens": 131072,
      "context_length": 64510,
      "question_length": 59,
      "prediction": "the offspring of one human ancestor",
      "reference": "One human family",
      "metrics": {
        "f1": 0.7142857142857143
      },
      "elapsed_time": 0.709608793258667,
      "timestamp": "2026-01-05T14:02:08.325067"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32129",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 110483,
      "natural_ratio": 0.8429183959960938,
      "max_context_tokens": 131072,
      "context_length": 220905,
      "question_length": 22,
      "prediction": "The German officer, Baron von Schoenvorts.",
      "reference": "The Germans",
      "metrics": {
        "f1": 0.7142857142857143
      },
      "elapsed_time": 3.8974194526672363,
      "timestamp": "2026-01-05T14:02:12.222850"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13624",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 112107,
      "natural_ratio": 0.8553085327148438,
      "max_context_tokens": 131072,
      "context_length": 224154,
      "question_length": 21,
      "prediction": "MOSES BEHE",
      "reference": "Zoil's female supervisor",
      "metrics": {
        "f1": 0.4545454545454545
      },
      "elapsed_time": 3.7004218101501465,
      "timestamp": "2026-01-05T14:02:15.924036"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26427",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 97591,
      "natural_ratio": 0.7445602416992188,
      "max_context_tokens": 131072,
      "context_length": 195071,
      "question_length": 72,
      "prediction": "helicopter",
      "reference": "A helicopter.",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 3.206455945968628,
      "timestamp": "2026-01-05T14:02:19.131271"
    },
    {
      "sample_id": "squad_5709880eed30961900e842a7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 393,
      "natural_ratio": 0.00299835205078125,
      "max_context_tokens": 131072,
      "context_length": 638,
      "question_length": 109,
      "prediction": "red",
      "reference": "reddish",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.040296077728271484,
      "timestamp": "2026-01-05T14:02:19.171722"
    },
    {
      "sample_id": "squad_5726e30ddd62a815002e93fc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 438,
      "natural_ratio": 0.0033416748046875,
      "max_context_tokens": 131072,
      "context_length": 783,
      "question_length": 54,
      "prediction": "Sahara Desert",
      "reference": "Sahara",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.06633615493774414,
      "timestamp": "2026-01-05T14:02:19.238179"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11528",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 112953,
      "natural_ratio": 0.8617630004882812,
      "max_context_tokens": 131072,
      "context_length": 225819,
      "question_length": 47,
      "prediction": "Raffles Haw despises McIntyre Senior, realizing that he is a man who stole from Raffles Haw.",
      "reference": "Raffles Haw thinks McIntyre Senior is a greedy beggar",
      "metrics": {
        "f1": 0.8571428571428572
      },
      "elapsed_time": 4.073731899261475,
      "timestamp": "2026-01-05T14:02:23.312258"
    },
    {
      "sample_id": "squad_571a9be84faf5e1900b8ab2e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 354,
      "natural_ratio": 0.0027008056640625,
      "max_context_tokens": 131072,
      "context_length": 586,
      "question_length": 82,
      "prediction": "the Fertile Crescent",
      "reference": "Fertile Crescent",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.08132195472717285,
      "timestamp": "2026-01-05T14:02:23.394098"
    },
    {
      "sample_id": "squad_570e22860dc6ce1900204de7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 380,
      "natural_ratio": 0.002899169921875,
      "max_context_tokens": 131072,
      "context_length": 679,
      "question_length": 41,
      "prediction": "无官方语言",
      "reference": "The nation has no official language",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.055579423904418945,
      "timestamp": "2026-01-05T14:02:23.449808"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9707",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 22689,
      "natural_ratio": 0.17310333251953125,
      "max_context_tokens": 131072,
      "context_length": 45323,
      "question_length": 16,
      "prediction": "Clemenza is a fictional character from the movie \"The Godfather Part II.\" His full name is Paul Clemenza, and he is portrayed by actor Richard Castellano. He serves as a mentor and friend to the protagonist, Michael Corleone, and is a high-ranking member of the Corleone crime family.",
      "reference": "A Detective that investigates Frye",
      "metrics": {
        "f1": 0.7142857142857143
      },
      "elapsed_time": 1.1444017887115479,
      "timestamp": "2026-01-05T14:02:24.594498"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28839",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 100823,
      "natural_ratio": 0.7692184448242188,
      "max_context_tokens": 131072,
      "context_length": 201565,
      "question_length": 42,
      "prediction": "Because she believes that marriage would make her financially dependent and she wants to remain independent.",
      "reference": "She wants to remain independent in every way.",
      "metrics": {
        "f1": 0.8292682926829268
      },
      "elapsed_time": 4.240448474884033,
      "timestamp": "2026-01-05T14:02:28.835354"
    },
    {
      "sample_id": "squad_56fad9cef34c681400b0c165",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 246,
      "natural_ratio": 0.0018768310546875,
      "max_context_tokens": 131072,
      "context_length": 386,
      "question_length": 66,
      "prediction": "20th",
      "reference": "20th",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05992436408996582,
      "timestamp": "2026-01-05T14:02:28.895438"
    },
    {
      "sample_id": "squad_5731b5b50fdd8d15006c6472",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 683,
      "natural_ratio": 0.00521087646484375,
      "max_context_tokens": 131072,
      "context_length": 1258,
      "question_length": 69,
      "prediction": "Quakers",
      "reference": "Quakers",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.050583839416503906,
      "timestamp": "2026-01-05T14:02:28.946153"
    },
    {
      "sample_id": "squad_572c9bb7f182dd1900d7c7f9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 453,
      "natural_ratio": 0.00345611572265625,
      "max_context_tokens": 131072,
      "context_length": 797,
      "question_length": 69,
      "prediction": "Herbert Hoover",
      "reference": "Herbert Hoover",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07276558876037598,
      "timestamp": "2026-01-05T14:02:29.019044"
    },
    {
      "sample_id": "squad_572a15493f37b319004786cc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 393,
      "natural_ratio": 0.00299835205078125,
      "max_context_tokens": 131072,
      "context_length": 689,
      "question_length": 58,
      "prediction": "the so-called Ghazi emirates",
      "reference": "Ghazi emirates",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.09654045104980469,
      "timestamp": "2026-01-05T14:02:29.115712"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15623",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 48723,
      "natural_ratio": 0.37172698974609375,
      "max_context_tokens": 131072,
      "context_length": 97357,
      "question_length": 50,
      "prediction": "1780",
      "reference": "1785",
      "metrics": {
        "f1": 0.75
      },
      "elapsed_time": 1.1484136581420898,
      "timestamp": "2026-01-05T14:02:30.264446"
    },
    {
      "sample_id": "squad_572ee81bdfa6aa1500f8d4b7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 290,
      "natural_ratio": 0.0022125244140625,
      "max_context_tokens": 131072,
      "context_length": 515,
      "question_length": 26,
      "prediction": "Dong Zhuo burned Luoyang to the ground.",
      "reference": "burned",
      "metrics": {
        "f1": 0.2666666666666667
      },
      "elapsed_time": 0.14955854415893555,
      "timestamp": "2026-01-05T14:02:30.414157"
    },
    {
      "sample_id": "squad_57324bd1b9d445190005e9db",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 738,
      "natural_ratio": 0.0056304931640625,
      "max_context_tokens": 131072,
      "context_length": 1383,
      "question_length": 54,
      "prediction": "Kingdom Halls",
      "reference": "Kingdom Halls",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0709073543548584,
      "timestamp": "2026-01-05T14:02:30.485189"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13861",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101481,
      "natural_ratio": 0.7742385864257812,
      "max_context_tokens": 131072,
      "context_length": 202883,
      "question_length": 39,
      "prediction": "Because the government censors were going to watch a movie.",
      "reference": "the mutagen experiment got out of control",
      "metrics": {
        "f1": 0.7428571428571428
      },
      "elapsed_time": 3.0802652835845947,
      "timestamp": "2026-01-05T14:02:33.565821"
    },
    {
      "sample_id": "squad_570cee7ffed7b91900d45aff",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 342,
      "natural_ratio": 0.0026092529296875,
      "max_context_tokens": 131072,
      "context_length": 606,
      "question_length": 39,
      "prediction": "the Ti and Ri plasmids",
      "reference": "The Ti and Ri plasmids",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.10476207733154297,
      "timestamp": "2026-01-05T14:02:33.670751"
    },
    {
      "sample_id": "squad_56ce7bf4aab44d1400b887f4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 573,
      "natural_ratio": 0.00437164306640625,
      "max_context_tokens": 131072,
      "context_length": 1040,
      "question_length": 66,
      "prediction": "Vinnie Chieco",
      "reference": "Vinnie Chieco",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08473062515258789,
      "timestamp": "2026-01-05T14:02:33.755622"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13096",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 16678,
      "natural_ratio": 0.1272430419921875,
      "max_context_tokens": 131072,
      "context_length": 33278,
      "question_length": 39,
      "prediction": "At first, Prince Prospero was shaken with fear or disgust when he saw the stranger, but he quickly turned his anger towards his own guests who were mocking the stranger. He demanded that they seize and unmask the stranger so they could hang him at sunrise from the battlements. When they failed to do so, Prospero himself tried to apprehend the stranger, which led to the stranger confronting him within the black chamber.",
      "reference": "He was angry and demanded to know his identity.",
      "metrics": {
        "f1": 0.7906976744186047
      },
      "elapsed_time": 1.204674243927002,
      "timestamp": "2026-01-05T14:02:34.960580"
    },
    {
      "sample_id": "squad_56f8a2749e9bad19000a0227",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 200,
      "natural_ratio": 0.00152587890625,
      "max_context_tokens": 131072,
      "context_length": 336,
      "question_length": 24,
      "prediction": "Lakes",
      "reference": "lakes",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04306173324584961,
      "timestamp": "2026-01-05T14:02:35.003808"
    },
    {
      "sample_id": "squad_57282ded2ca10214002da003",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 313,
      "natural_ratio": 0.00238800048828125,
      "max_context_tokens": 131072,
      "context_length": 548,
      "question_length": 39,
      "prediction": "Aristotle added historical empirical evidence to Plato's analysis.",
      "reference": "historical empirical evidence",
      "metrics": {
        "f1": 0.6105263157894737
      },
      "elapsed_time": 0.14588332176208496,
      "timestamp": "2026-01-05T14:02:35.149822"
    },
    {
      "sample_id": "squad_5728ac7b2ca10214002da596",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 335,
      "natural_ratio": 0.00255584716796875,
      "max_context_tokens": 131072,
      "context_length": 573,
      "question_length": 58,
      "prediction": "$40,042",
      "reference": "$40,042",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09604644775390625,
      "timestamp": "2026-01-05T14:02:35.245993"
    },
    {
      "sample_id": "squad_57341dc0d058e614000b696d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 590,
      "natural_ratio": 0.0045013427734375,
      "max_context_tokens": 131072,
      "context_length": 1098,
      "question_length": 43,
      "prediction": "Anaerobic bacteria species",
      "reference": "anaerobic bacteria",
      "metrics": {
        "f1": 0.8181818181818181
      },
      "elapsed_time": 0.0760657787322998,
      "timestamp": "2026-01-05T14:02:35.322179"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17617",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 108678,
      "natural_ratio": 0.8291473388671875,
      "max_context_tokens": 131072,
      "context_length": 217262,
      "question_length": 54,
      "prediction": "Electro-Magnetic Restorer, Illimitable Communicator, and Resting Bandana",
      "reference": "garment of protection, a record of events, spectacles that are a character marker",
      "metrics": {
        "f1": 0.7894736842105262
      },
      "elapsed_time": 3.6442925930023193,
      "timestamp": "2026-01-05T14:02:38.966822"
    },
    {
      "sample_id": "squad_570a83046d058f1900182ef6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 523,
      "natural_ratio": 0.00399017333984375,
      "max_context_tokens": 131072,
      "context_length": 931,
      "question_length": 75,
      "prediction": "Using the Runic alphabet",
      "reference": "runic system",
      "metrics": {
        "f1": 0.6923076923076923
      },
      "elapsed_time": 0.08440518379211426,
      "timestamp": "2026-01-05T14:02:39.051819"
    },
    {
      "sample_id": "squad_5729f6063f37b31900478612",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 401,
      "natural_ratio": 0.00305938720703125,
      "max_context_tokens": 131072,
      "context_length": 599,
      "question_length": 163,
      "prediction": "飓风/台风",
      "reference": "hurricane",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06678247451782227,
      "timestamp": "2026-01-05T14:02:39.118753"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5607",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 107124,
      "natural_ratio": 0.817291259765625,
      "max_context_tokens": 131072,
      "context_length": 214118,
      "question_length": 91,
      "prediction": "Parker",
      "reference": "Parker family",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 5.285743951797485,
      "timestamp": "2026-01-05T14:02:44.404832"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31000",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 73682,
      "natural_ratio": 0.5621490478515625,
      "max_context_tokens": 131072,
      "context_length": 147294,
      "question_length": 31,
      "prediction": "Nevada",
      "reference": "The Mojave Desert.",
      "metrics": {
        "f1": 0.4444444444444444
      },
      "elapsed_time": 3.3320603370666504,
      "timestamp": "2026-01-05T14:02:47.737674"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13491",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 80714,
      "natural_ratio": 0.6157989501953125,
      "max_context_tokens": 131072,
      "context_length": 161328,
      "question_length": 61,
      "prediction": "Grace finds out that Leonard has been philandering with other women, which is causing conflicts and jealousy among his potential partners.",
      "reference": "He is having an affair with Julia.",
      "metrics": {
        "f1": 0.7804878048780487
      },
      "elapsed_time": 2.947777509689331,
      "timestamp": "2026-01-05T14:02:50.685887"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28194",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 53140,
      "natural_ratio": 0.405426025390625,
      "max_context_tokens": 131072,
      "context_length": 106158,
      "question_length": 83,
      "prediction": "Lavarcham tells Conchubor that Deirdre is so fickle and difficult to please that it would be best to give up and let Deirdre be with Naisi, suggesting that Deirdre will bring destruction to those around her. She also hints that Deirdre's influence has already caused chaos, mentioning the deaths of Naisi's brothers and the destruction of Emain.",
      "reference": "She is old and ugly",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 2.502973794937134,
      "timestamp": "2026-01-05T14:02:53.189242"
    },
    {
      "sample_id": "squad_56f82667a6d7ea1400e1741d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 298,
      "natural_ratio": 0.0022735595703125,
      "max_context_tokens": 131072,
      "context_length": 513,
      "question_length": 44,
      "prediction": "1492",
      "reference": "1492",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06897473335266113,
      "timestamp": "2026-01-05T14:02:53.258405"
    },
    {
      "sample_id": "squad_5727c79f3acd2414000dec2d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 358,
      "natural_ratio": 0.0027313232421875,
      "max_context_tokens": 131072,
      "context_length": 609,
      "question_length": 68,
      "prediction": "two",
      "reference": "two",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03506112098693848,
      "timestamp": "2026-01-05T14:02:53.293599"
    },
    {
      "sample_id": "squad_56de71114396321400ee28fc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 272,
      "natural_ratio": 0.0020751953125,
      "max_context_tokens": 131072,
      "context_length": 418,
      "question_length": 86,
      "prediction": "Cathar Crusade",
      "reference": "The Cathar Crusade",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07399821281433105,
      "timestamp": "2026-01-05T14:02:53.367735"
    },
    {
      "sample_id": "squad_57300b16b2c2fd140056879d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 394,
      "natural_ratio": 0.0030059814453125,
      "max_context_tokens": 131072,
      "context_length": 702,
      "question_length": 47,
      "prediction": "Karim Khan",
      "reference": "Karim Khan",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.055693626403808594,
      "timestamp": "2026-01-05T14:02:53.423559"
    },
    {
      "sample_id": "squad_570aae5d4103511400d59921",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 488,
      "natural_ratio": 0.00372314453125,
      "max_context_tokens": 131072,
      "context_length": 877,
      "question_length": 60,
      "prediction": "Some of the forward rolling movement of the aircraft into vertical velocity",
      "reference": "some of the forward rolling movement of the aircraft into vertical velocity",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.1464829444885254,
      "timestamp": "2026-01-05T14:02:53.570178"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32136",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 76457,
      "natural_ratio": 0.5833206176757812,
      "max_context_tokens": 131072,
      "context_length": 152825,
      "question_length": 50,
      "prediction": "Miami Dolphins vs Philadelphia Eagles",
      "reference": "Miami Dolphins and Philadelphia Eagles",
      "metrics": {
        "f1": 0.962962962962963
      },
      "elapsed_time": 3.540480852127075,
      "timestamp": "2026-01-05T14:02:57.110984"
    },
    {
      "sample_id": "squad_56e79d2300c9c71400d773c0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 338,
      "natural_ratio": 0.0025787353515625,
      "max_context_tokens": 131072,
      "context_length": 566,
      "question_length": 70,
      "prediction": "2005",
      "reference": "2005",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07047176361083984,
      "timestamp": "2026-01-05T14:02:57.181634"
    },
    {
      "sample_id": "squad_57303815947a6a140053d2c9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 400,
      "natural_ratio": 0.0030517578125,
      "max_context_tokens": 131072,
      "context_length": 670,
      "question_length": 90,
      "prediction": "65",
      "reference": "65",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04593014717102051,
      "timestamp": "2026-01-05T14:02:57.227698"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31820",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113890,
      "natural_ratio": 0.8689117431640625,
      "max_context_tokens": 131072,
      "context_length": 227701,
      "question_length": 40,
      "prediction": "Jack Kennedy",
      "reference": "President Kennedy",
      "metrics": {
        "f1": 0.6
      },
      "elapsed_time": 5.684755325317383,
      "timestamp": "2026-01-05T14:03:02.912833"
    },
    {
      "sample_id": "squad_572fc958a23a5019007fc9d9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 594,
      "natural_ratio": 0.0045318603515625,
      "max_context_tokens": 131072,
      "context_length": 1084,
      "question_length": 64,
      "prediction": "condemned Japan",
      "reference": "condemned",
      "metrics": {
        "f1": 0.7499999999999999
      },
      "elapsed_time": 0.07767844200134277,
      "timestamp": "2026-01-05T14:03:02.991040"
    },
    {
      "sample_id": "squad_56e196cfcd28a01900c679f0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 154,
      "natural_ratio": 0.0011749267578125,
      "max_context_tokens": 131072,
      "context_length": 242,
      "question_length": 27,
      "prediction": "as a means of expelling reducing equivalents in biochemical reactions",
      "reference": "expelling reducing equivalents in biochemical reactions",
      "metrics": {
        "f1": 0.8870967741935484
      },
      "elapsed_time": 0.13063383102416992,
      "timestamp": "2026-01-05T14:03:03.121834"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19401",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 78939,
      "natural_ratio": 0.6022567749023438,
      "max_context_tokens": 131072,
      "context_length": 157782,
      "question_length": 56,
      "prediction": "Debo told Craig that he would kill him if he didn't move in with his uncle.",
      "reference": "Because the neighborhood bully, Deebo, would be getting out of jail soon",
      "metrics": {
        "f1": 0.8181818181818182
      },
      "elapsed_time": 2.4250314235687256,
      "timestamp": "2026-01-05T14:03:05.547232"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24251",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105035,
      "natural_ratio": 0.8013534545898438,
      "max_context_tokens": 131072,
      "context_length": 209999,
      "question_length": 32,
      "prediction": "Jason",
      "reference": "Jason",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.198242425918579,
      "timestamp": "2026-01-05T14:03:08.745928"
    },
    {
      "sample_id": "squad_56cf5187aab44d1400b88fc6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 308,
      "natural_ratio": 0.002349853515625,
      "max_context_tokens": 131072,
      "context_length": 466,
      "question_length": 110,
      "prediction": "The New York Times",
      "reference": "The New York Times",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07178068161010742,
      "timestamp": "2026-01-05T14:03:08.817871"
    },
    {
      "sample_id": "squad_57315d3b05b4da19006bd0e4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 580,
      "natural_ratio": 0.004425048828125,
      "max_context_tokens": 131072,
      "context_length": 1085,
      "question_length": 36,
      "prediction": "Hunan",
      "reference": "Hunan region where it was raised",
      "metrics": {
        "f1": 0.4444444444444445
      },
      "elapsed_time": 0.0528872013092041,
      "timestamp": "2026-01-05T14:03:08.870900"
    },
    {
      "sample_id": "squad_57313cc0a5e9cc1400cdbd7f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 416,
      "natural_ratio": 0.003173828125,
      "max_context_tokens": 131072,
      "context_length": 754,
      "question_length": 39,
      "prediction": "Red, green, and blue",
      "reference": "red, green and blue",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08717012405395508,
      "timestamp": "2026-01-05T14:03:08.958207"
    },
    {
      "sample_id": "squad_56db1b38e7c41114004b4d2d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 393,
      "natural_ratio": 0.00299835205078125,
      "max_context_tokens": 131072,
      "context_length": 692,
      "question_length": 55,
      "prediction": "Green Party officials",
      "reference": "Green Party officials.",
      "metrics": {
        "f1": 0.967741935483871
      },
      "elapsed_time": 0.054903268814086914,
      "timestamp": "2026-01-05T14:03:09.013252"
    },
    {
      "sample_id": "squad_57342720d058e614000b6a26",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 453,
      "natural_ratio": 0.00345611572265625,
      "max_context_tokens": 131072,
      "context_length": 845,
      "question_length": 22,
      "prediction": "Immunoassays are diagnostic tests that use the reaction between antibodies and antigens to detect and measure specific substances, such as infectious agents or proteins, using a signal that can be detected by instrumentation.",
      "reference": "Complex serological techniques",
      "metrics": {
        "f1": 0.7804878048780488
      },
      "elapsed_time": 0.43586301803588867,
      "timestamp": "2026-01-05T14:03:09.449286"
    },
    {
      "sample_id": "squad_5731906a497a881900249042",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 346,
      "natural_ratio": 0.0026397705078125,
      "max_context_tokens": 131072,
      "context_length": 606,
      "question_length": 47,
      "prediction": "Greek myths",
      "reference": "Greek",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.05321645736694336,
      "timestamp": "2026-01-05T14:03:09.502640"
    },
    {
      "sample_id": "squad_570a84d76d058f1900182f14",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 304,
      "natural_ratio": 0.0023193359375,
      "max_context_tokens": 131072,
      "context_length": 536,
      "question_length": 33,
      "prediction": "Scyld",
      "reference": "Scyld",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05469155311584473,
      "timestamp": "2026-01-05T14:03:09.557466"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_483",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 90366,
      "natural_ratio": 0.6894378662109375,
      "max_context_tokens": 131072,
      "context_length": 180667,
      "question_length": 26,
      "prediction": "Erik Mejer",
      "reference": "Hikers",
      "metrics": {
        "f1": 0.6153846153846153
      },
      "elapsed_time": 2.4835362434387207,
      "timestamp": "2026-01-05T14:03:12.041352"
    },
    {
      "sample_id": "squad_56df4fa496943c1400a5d36e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 300,
      "natural_ratio": 0.002288818359375,
      "max_context_tokens": 131072,
      "context_length": 511,
      "question_length": 49,
      "prediction": "Vladimir Lenin",
      "reference": "Vladimir Lenin",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07190442085266113,
      "timestamp": "2026-01-05T14:03:12.113491"
    },
    {
      "sample_id": "squad_570f92ba80d9841400ab35cd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 486,
      "natural_ratio": 0.0037078857421875,
      "max_context_tokens": 131072,
      "context_length": 860,
      "question_length": 72,
      "prediction": "private wealth",
      "reference": "private wealth",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.046950340270996094,
      "timestamp": "2026-01-05T14:03:12.160606"
    },
    {
      "sample_id": "squad_572843362ca10214002da1e5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 354,
      "natural_ratio": 0.0027008056640625,
      "max_context_tokens": 131072,
      "context_length": 589,
      "question_length": 80,
      "prediction": "Operation Infinite Reach",
      "reference": "Operation Infinite Reach",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05506563186645508,
      "timestamp": "2026-01-05T14:03:12.215801"
    },
    {
      "sample_id": "squad_5726434b271a42140099d800",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 363,
      "natural_ratio": 0.00276947021484375,
      "max_context_tokens": 131072,
      "context_length": 634,
      "question_length": 52,
      "prediction": "Two",
      "reference": "two",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.034271240234375,
      "timestamp": "2026-01-05T14:03:12.250197"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32150",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 76442,
      "natural_ratio": 0.5832061767578125,
      "max_context_tokens": 131072,
      "context_length": 152825,
      "question_length": 19,
      "prediction": "Ace Ventura is a pet detective.",
      "reference": "A Private Investigater in Miami Fl.",
      "metrics": {
        "f1": 0.7586206896551724
      },
      "elapsed_time": 0.21826601028442383,
      "timestamp": "2026-01-05T14:03:12.468756"
    },
    {
      "sample_id": "squad_570feb9480d9841400ab3732",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 327,
      "natural_ratio": 0.00249481201171875,
      "max_context_tokens": 131072,
      "context_length": 555,
      "question_length": 60,
      "prediction": "50,000",
      "reference": "50,000",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08499860763549805,
      "timestamp": "2026-01-05T14:03:12.553920"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_6482",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 121463,
      "natural_ratio": 0.9266891479492188,
      "max_context_tokens": 131072,
      "context_length": 242784,
      "question_length": 103,
      "prediction": "In the central chamber of Midian.",
      "reference": "A barn.",
      "metrics": {
        "f1": 0.5454545454545454
      },
      "elapsed_time": 5.0101423263549805,
      "timestamp": "2026-01-05T14:03:17.564407"
    },
    {
      "sample_id": "squad_56dda5109a695914005b957e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 180,
      "natural_ratio": 0.001373291015625,
      "max_context_tokens": 131072,
      "context_length": 262,
      "question_length": 59,
      "prediction": "Greek",
      "reference": "Greek",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03735613822937012,
      "timestamp": "2026-01-05T14:03:17.602294"
    },
    {
      "sample_id": "squad_5730146f947a6a140053d07c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 460,
      "natural_ratio": 0.003509521484375,
      "max_context_tokens": 131072,
      "context_length": 816,
      "question_length": 65,
      "prediction": "The Regency style",
      "reference": "The Regency style",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06644892692565918,
      "timestamp": "2026-01-05T14:03:17.668883"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_6310",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 76974,
      "natural_ratio": 0.5872650146484375,
      "max_context_tokens": 131072,
      "context_length": 153821,
      "question_length": 87,
      "prediction": "因为The Dude认为自己照顾了另一个Jeff Lebowski的轮椅nyder，现在需要得到相应的报酬。",
      "reference": "The Dude thinks he should because he was beat up by mistake.",
      "metrics": {
        "f1": 0.46875
      },
      "elapsed_time": 3.780928611755371,
      "timestamp": "2026-01-05T14:03:21.450197"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4882",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115813,
      "natural_ratio": 0.8835830688476562,
      "max_context_tokens": 131072,
      "context_length": 231539,
      "question_length": 48,
      "prediction": "Harley had been seized by folly; Harley had become addicted to visiting prostitutes.",
      "reference": "He lost both of his parents.",
      "metrics": {
        "f1": 0.7692307692307693
      },
      "elapsed_time": 4.567022323608398,
      "timestamp": "2026-01-05T14:03:26.017576"
    },
    {
      "sample_id": "squad_56cf61d3aab44d1400b891a3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 376,
      "natural_ratio": 0.00286865234375,
      "max_context_tokens": 131072,
      "context_length": 650,
      "question_length": 62,
      "prediction": "1817",
      "reference": "1817",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07456374168395996,
      "timestamp": "2026-01-05T14:03:26.092707"
    },
    {
      "sample_id": "squad_5726166d89a1e219009ac22c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 424,
      "natural_ratio": 0.00323486328125,
      "max_context_tokens": 131072,
      "context_length": 733,
      "question_length": 75,
      "prediction": "Hellenistic",
      "reference": "Hellenistic",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05645895004272461,
      "timestamp": "2026-01-05T14:03:26.149313"
    },
    {
      "sample_id": "squad_572ea7e9dfa6aa1500f8d268",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 585,
      "natural_ratio": 0.00446319580078125,
      "max_context_tokens": 131072,
      "context_length": 1021,
      "question_length": 109,
      "prediction": "United Nations Security Council",
      "reference": "United Nations Security Council",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0662984848022461,
      "timestamp": "2026-01-05T14:03:26.215747"
    },
    {
      "sample_id": "squad_5731408a05b4da19006bcf42",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 331,
      "natural_ratio": 0.00252532958984375,
      "max_context_tokens": 131072,
      "context_length": 574,
      "question_length": 48,
      "prediction": "Italy",
      "reference": "Rome",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.0339970588684082,
      "timestamp": "2026-01-05T14:03:26.249870"
    },
    {
      "sample_id": "squad_57342891d058e614000b6a5c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 451,
      "natural_ratio": 0.00344085693359375,
      "max_context_tokens": 131072,
      "context_length": 801,
      "question_length": 61,
      "prediction": "White House of the Confederacy",
      "reference": "White House of the Confederacy",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0859992504119873,
      "timestamp": "2026-01-05T14:03:26.336002"
    },
    {
      "sample_id": "squad_56d1314517492d1400aabbce",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 389,
      "natural_ratio": 0.00296783447265625,
      "max_context_tokens": 131072,
      "context_length": 658,
      "question_length": 81,
      "prediction": "battery life",
      "reference": "battery life",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04513072967529297,
      "timestamp": "2026-01-05T14:03:26.381267"
    },
    {
      "sample_id": "squad_572935dd3f37b3190047810b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 660,
      "natural_ratio": 0.005035400390625,
      "max_context_tokens": 131072,
      "context_length": 1241,
      "question_length": 39,
      "prediction": "AD 260",
      "reference": "AD 260",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08107709884643555,
      "timestamp": "2026-01-05T14:03:26.462477"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1029",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 118100,
      "natural_ratio": 0.901031494140625,
      "max_context_tokens": 131072,
      "context_length": 236034,
      "question_length": 127,
      "prediction": "30",
      "reference": "30 years",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 4.32315731048584,
      "timestamp": "2026-01-05T14:03:30.785953"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14270",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 52170,
      "natural_ratio": 0.3980255126953125,
      "max_context_tokens": 131072,
      "context_length": 104242,
      "question_length": 59,
      "prediction": "Aeschylus",
      "reference": "Aeschulys",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 1.6070737838745117,
      "timestamp": "2026-01-05T14:03:32.393737"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1683",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 91296,
      "natural_ratio": 0.696533203125,
      "max_context_tokens": 131072,
      "context_length": 182517,
      "question_length": 35,
      "prediction": "Carter will be going to South America.",
      "reference": "Jack and Anna",
      "metrics": {
        "f1": 0.32
      },
      "elapsed_time": 2.10479998588562,
      "timestamp": "2026-01-05T14:03:34.498954"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20043",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 102888,
      "natural_ratio": 0.78497314453125,
      "max_context_tokens": 131072,
      "context_length": 205659,
      "question_length": 77,
      "prediction": "Russian Embassy",
      "reference": "Russia",
      "metrics": {
        "f1": 0.5714285714285715
      },
      "elapsed_time": 3.373063325881958,
      "timestamp": "2026-01-05T14:03:37.872381"
    },
    {
      "sample_id": "squad_56df680f8bc80c19004e4bdd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 497,
      "natural_ratio": 0.00379180908203125,
      "max_context_tokens": 131072,
      "context_length": 907,
      "question_length": 47,
      "prediction": "in the developing world",
      "reference": "in the developing world",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07122468948364258,
      "timestamp": "2026-01-05T14:03:37.943768"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31368",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 84442,
      "natural_ratio": 0.6442413330078125,
      "max_context_tokens": 131072,
      "context_length": 168813,
      "question_length": 31,
      "prediction": "Mr. Higgins's daughter",
      "reference": "HER STEPSISTER",
      "metrics": {
        "f1": 0.608695652173913
      },
      "elapsed_time": 2.6703741550445557,
      "timestamp": "2026-01-05T14:03:40.614498"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26233",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122916,
      "natural_ratio": 0.937774658203125,
      "max_context_tokens": 131072,
      "context_length": 245733,
      "question_length": 59,
      "prediction": "It would spread, imitating humans it comes into contact with.",
      "reference": "All life on Earth will be assimilated.",
      "metrics": {
        "f1": 0.8108108108108107
      },
      "elapsed_time": 3.259402275085449,
      "timestamp": "2026-01-05T14:03:43.874266"
    },
    {
      "sample_id": "squad_56f9914a9b226e1400dd15ae",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 404,
      "natural_ratio": 0.003082275390625,
      "max_context_tokens": 131072,
      "context_length": 708,
      "question_length": 61,
      "prediction": "Anhui",
      "reference": "with Anhui",
      "metrics": {
        "f1": 0.7692307692307693
      },
      "elapsed_time": 0.06099748611450195,
      "timestamp": "2026-01-05T14:03:43.935817"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11641",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 50526,
      "natural_ratio": 0.3854827880859375,
      "max_context_tokens": 131072,
      "context_length": 100969,
      "question_length": 43,
      "prediction": "He believed Hague had used Kate, learning from him then moving on, which made him feel used and hurt.",
      "reference": "He had done a terrible harm",
      "metrics": {
        "f1": 0.7222222222222222
      },
      "elapsed_time": 1.5345396995544434,
      "timestamp": "2026-01-05T14:03:45.470700"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17570",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 71289,
      "natural_ratio": 0.5438919067382812,
      "max_context_tokens": 131072,
      "context_length": 142518,
      "question_length": 21,
      "prediction": "The setting is Alexandria, Egypt.",
      "reference": "Egypt in 48 BC",
      "metrics": {
        "f1": 0.5517241379310345
      },
      "elapsed_time": 0.2124793529510498,
      "timestamp": "2026-01-05T14:03:45.683543"
    },
    {
      "sample_id": "squad_572794a8708984140094e173",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 897,
      "natural_ratio": 0.00684356689453125,
      "max_context_tokens": 131072,
      "context_length": 1715,
      "question_length": 39,
      "prediction": "BBC和Sky Sports",
      "reference": "From 1988 to 1997, the BBC and Sky Sports had coverage of the FA Cup",
      "metrics": {
        "f1": 0.5555555555555556
      },
      "elapsed_time": 0.07633662223815918,
      "timestamp": "2026-01-05T14:03:45.760050"
    },
    {
      "sample_id": "squad_5727f7e32ca10214002d9a6d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 626,
      "natural_ratio": 0.0047760009765625,
      "max_context_tokens": 131072,
      "context_length": 1149,
      "question_length": 64,
      "prediction": "The French Revolution of 1789",
      "reference": "the French Revolution of 1789",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.12172055244445801,
      "timestamp": "2026-01-05T14:03:45.881905"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4340",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96523,
      "natural_ratio": 0.7364120483398438,
      "max_context_tokens": 131072,
      "context_length": 192961,
      "question_length": 46,
      "prediction": "Christmas",
      "reference": "Christmas",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 5.016185760498047,
      "timestamp": "2026-01-05T14:03:50.898416"
    },
    {
      "sample_id": "squad_57282d9cff5b5019007d9ea0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 340,
      "natural_ratio": 0.002593994140625,
      "max_context_tokens": 131072,
      "context_length": 556,
      "question_length": 84,
      "prediction": "No",
      "reference": "Von Neumann–Wigner interpretation never gained acceptance amongst the majority of physicists",
      "metrics": {
        "f1": 0.16666666666666669
      },
      "elapsed_time": 0.041010379791259766,
      "timestamp": "2026-01-05T14:03:50.939605"
    },
    {
      "sample_id": "squad_5725e29c271a42140099d2e1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 562,
      "natural_ratio": 0.0042877197265625,
      "max_context_tokens": 131072,
      "context_length": 1032,
      "question_length": 52,
      "prediction": "190 BC",
      "reference": "190 BC",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07512879371643066,
      "timestamp": "2026-01-05T14:03:51.014903"
    },
    {
      "sample_id": "squad_57344892879d6814001ca482",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 402,
      "natural_ratio": 0.0030670166015625,
      "max_context_tokens": 131072,
      "context_length": 681,
      "question_length": 83,
      "prediction": "taboos",
      "reference": "hunting restrictions",
      "metrics": {
        "f1": 0.35294117647058826
      },
      "elapsed_time": 0.05692791938781738,
      "timestamp": "2026-01-05T14:03:51.072004"
    },
    {
      "sample_id": "squad_56f8d0389e9bad19000a0580",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 192,
      "natural_ratio": 0.00146484375,
      "max_context_tokens": 131072,
      "context_length": 301,
      "question_length": 44,
      "prediction": "1991年",
      "reference": "mid-1991",
      "metrics": {
        "f1": 0.4444444444444444
      },
      "elapsed_time": 0.07247710227966309,
      "timestamp": "2026-01-05T14:03:51.144636"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4888",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115813,
      "natural_ratio": 0.8835830688476562,
      "max_context_tokens": 131072,
      "context_length": 231539,
      "question_length": 48,
      "prediction": "Miss Walton is going to be married.",
      "reference": "Miss Walton and Sir Harry Benson.",
      "metrics": {
        "f1": 0.9090909090909091
      },
      "elapsed_time": 0.27658748626708984,
      "timestamp": "2026-01-05T14:03:51.421533"
    },
    {
      "sample_id": "squad_570629ba52bb891400689917",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 324,
      "natural_ratio": 0.002471923828125,
      "max_context_tokens": 131072,
      "context_length": 559,
      "question_length": 49,
      "prediction": "compression efficiency of encoders",
      "reference": "compression ratios",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.07980179786682129,
      "timestamp": "2026-01-05T14:03:51.501850"
    },
    {
      "sample_id": "squad_5730cf87b54a4f140068cc77",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 847,
      "natural_ratio": 0.00646209716796875,
      "max_context_tokens": 131072,
      "context_length": 1603,
      "question_length": 51,
      "prediction": "1946",
      "reference": "1946",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07149934768676758,
      "timestamp": "2026-01-05T14:03:51.573478"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30360",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 65068,
      "natural_ratio": 0.496429443359375,
      "max_context_tokens": 131072,
      "context_length": 130015,
      "question_length": 81,
      "prediction": "from all over the island",
      "reference": "The sands of the island.",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 1.7407252788543701,
      "timestamp": "2026-01-05T14:03:53.314500"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5293",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 102505,
      "natural_ratio": 0.7820510864257812,
      "max_context_tokens": 131072,
      "context_length": 204946,
      "question_length": 25,
      "prediction": "Colleen",
      "reference": "John Crunch. ",
      "metrics": {
        "f1": 0.42857142857142855
      },
      "elapsed_time": 0.17493867874145508,
      "timestamp": "2026-01-05T14:03:53.489785"
    },
    {
      "sample_id": "squad_56f8df189e9bad19000a0649",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 448,
      "natural_ratio": 0.00341796875,
      "max_context_tokens": 131072,
      "context_length": 792,
      "question_length": 65,
      "prediction": "stress",
      "reference": "environmental stress",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.03820633888244629,
      "timestamp": "2026-01-05T14:03:53.528146"
    },
    {
      "sample_id": "squad_57279d5a3acd2414000de7e9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 553,
      "natural_ratio": 0.00421905517578125,
      "max_context_tokens": 131072,
      "context_length": 1029,
      "question_length": 38,
      "prediction": "in the Diaspora",
      "reference": "Diaspora",
      "metrics": {
        "f1": 0.6956521739130436
      },
      "elapsed_time": 0.06687188148498535,
      "timestamp": "2026-01-05T14:03:53.595151"
    },
    {
      "sample_id": "squad_572fc652a23a5019007fc9cb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 455,
      "natural_ratio": 0.00347137451171875,
      "max_context_tokens": 131072,
      "context_length": 819,
      "question_length": 52,
      "prediction": "Over 3.5 million vehicles were driving in Hyderabad in 2012.",
      "reference": "3.5 million",
      "metrics": {
        "f1": 0.3636363636363636
      },
      "elapsed_time": 0.20907115936279297,
      "timestamp": "2026-01-05T14:03:53.804358"
    },
    {
      "sample_id": "squad_57263f68ec44d21400f3dcc5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 365,
      "natural_ratio": 0.00278472900390625,
      "max_context_tokens": 131072,
      "context_length": 646,
      "question_length": 45,
      "prediction": "epipubic bones",
      "reference": "epipubic bones",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06931233406066895,
      "timestamp": "2026-01-05T14:03:53.873797"
    },
    {
      "sample_id": "squad_571a2ab710f8ca1400304f22",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 381,
      "natural_ratio": 0.00290679931640625,
      "max_context_tokens": 131072,
      "context_length": 670,
      "question_length": 52,
      "prediction": "White and Other",
      "reference": "white or \"other.\"",
      "metrics": {
        "f1": 0.761904761904762
      },
      "elapsed_time": 0.05846977233886719,
      "timestamp": "2026-01-05T14:03:53.932399"
    },
    {
      "sample_id": "squad_5705fc5b75f01819005e7834",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 211,
      "natural_ratio": 0.00160980224609375,
      "max_context_tokens": 131072,
      "context_length": 348,
      "question_length": 34,
      "prediction": "The energy cost.",
      "reference": "energy cost",
      "metrics": {
        "f1": 0.8148148148148148
      },
      "elapsed_time": 0.06524181365966797,
      "timestamp": "2026-01-05T14:03:53.997767"
    },
    {
      "sample_id": "squad_57282e373acd2414000df66d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 364,
      "natural_ratio": 0.002777099609375,
      "max_context_tokens": 131072,
      "context_length": 660,
      "question_length": 28,
      "prediction": "trochophore larvae",
      "reference": "trochophore larvae",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07848429679870605,
      "timestamp": "2026-01-05T14:03:54.076378"
    },
    {
      "sample_id": "squad_571cef645efbb31900334e56",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 600,
      "natural_ratio": 0.00457763671875,
      "max_context_tokens": 131072,
      "context_length": 1107,
      "question_length": 54,
      "prediction": "1960s",
      "reference": "1960s",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08328485488891602,
      "timestamp": "2026-01-05T14:03:54.159788"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7467",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 79395,
      "natural_ratio": 0.6057357788085938,
      "max_context_tokens": 131072,
      "context_length": 158713,
      "question_length": 37,
      "prediction": "Neville uses his blood and the blood of others who have recovered from the virus to create a serum that can cure the virus.",
      "reference": "His own body",
      "metrics": {
        "f1": 0.6
      },
      "elapsed_time": 2.931699275970459,
      "timestamp": "2026-01-05T14:03:57.091818"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31390",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 84447,
      "natural_ratio": 0.6442794799804688,
      "max_context_tokens": 131072,
      "context_length": 168813,
      "question_length": 42,
      "prediction": "Louise accidently starts the fire when she and Mr. Cobb quarrel and she knocks over the lamp, which then falls onto the screen and catches fire.",
      "reference": "Tripping on a chair.",
      "metrics": {
        "f1": 0.5945945945945945
      },
      "elapsed_time": 0.4793822765350342,
      "timestamp": "2026-01-05T14:03:57.571572"
    },
    {
      "sample_id": "squad_56e11d89e3433e1400422c21",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 343,
      "natural_ratio": 0.00261688232421875,
      "max_context_tokens": 131072,
      "context_length": 574,
      "question_length": 72,
      "prediction": "2020",
      "reference": "by 2020",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.06939816474914551,
      "timestamp": "2026-01-05T14:03:57.641134"
    },
    {
      "sample_id": "squad_5727b94dff5b5019007d9379",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 329,
      "natural_ratio": 0.00251007080078125,
      "max_context_tokens": 131072,
      "context_length": 551,
      "question_length": 67,
      "prediction": "Monroe County",
      "reference": "Monroe County",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05570673942565918,
      "timestamp": "2026-01-05T14:03:57.696977"
    },
    {
      "sample_id": "squad_570da68e16d0071400510c4c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 414,
      "natural_ratio": 0.0031585693359375,
      "max_context_tokens": 131072,
      "context_length": 740,
      "question_length": 49,
      "prediction": "$16.3 million",
      "reference": "$16.3 million",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08527874946594238,
      "timestamp": "2026-01-05T14:03:57.782394"
    },
    {
      "sample_id": "squad_571b3d469499d21900609c34",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 557,
      "natural_ratio": 0.00424957275390625,
      "max_context_tokens": 131072,
      "context_length": 1003,
      "question_length": 72,
      "prediction": "recycled hot-mix asphalt",
      "reference": "recycled",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 0.08619999885559082,
      "timestamp": "2026-01-05T14:03:57.868728"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18448",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 80292,
      "natural_ratio": 0.612579345703125,
      "max_context_tokens": 131072,
      "context_length": 160519,
      "question_length": 25,
      "prediction": "Jonah",
      "reference": "Jonah",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.004633665084839,
      "timestamp": "2026-01-05T14:04:00.873689"
    },
    {
      "sample_id": "squad_570c376aec8fbc190045bddc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 359,
      "natural_ratio": 0.00273895263671875,
      "max_context_tokens": 131072,
      "context_length": 599,
      "question_length": 80,
      "prediction": "The British Colonial Office",
      "reference": "The British Colonial Office",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07081961631774902,
      "timestamp": "2026-01-05T14:04:00.944671"
    },
    {
      "sample_id": "squad_57263e5189a1e219009ac5ad",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 295,
      "natural_ratio": 0.00225067138671875,
      "max_context_tokens": 131072,
      "context_length": 510,
      "question_length": 41,
      "prediction": "Ciudad de México",
      "reference": "Ciudad de México",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06552886962890625,
      "timestamp": "2026-01-05T14:04:01.010324"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32371",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 108948,
      "natural_ratio": 0.831207275390625,
      "max_context_tokens": 131072,
      "context_length": 217801,
      "question_length": 55,
      "prediction": "Auerbach's cellar",
      "reference": "Auerbach's tavern in Leipzig",
      "metrics": {
        "f1": 0.7586206896551725
      },
      "elapsed_time": 4.245061874389648,
      "timestamp": "2026-01-05T14:04:05.255777"
    },
    {
      "sample_id": "squad_5727fc04ff5b5019007d99f4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 444,
      "natural_ratio": 0.003387451171875,
      "max_context_tokens": 131072,
      "context_length": 805,
      "question_length": 44,
      "prediction": "World War I",
      "reference": "World War I",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06248903274536133,
      "timestamp": "2026-01-05T14:04:05.318790"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4593",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 70687,
      "natural_ratio": 0.5392990112304688,
      "max_context_tokens": 131072,
      "context_length": 141309,
      "question_length": 26,
      "prediction": "Eyolf falls overboard in the fiord.",
      "reference": "He drowns is the sea.",
      "metrics": {
        "f1": 0.8000000000000002
      },
      "elapsed_time": 2.429138660430908,
      "timestamp": "2026-01-05T14:04:07.748290"
    },
    {
      "sample_id": "squad_5725cd5838643c19005acd39",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 489,
      "natural_ratio": 0.00373077392578125,
      "max_context_tokens": 131072,
      "context_length": 885,
      "question_length": 53,
      "prediction": "Agrianes",
      "reference": "Agrianes",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06725692749023438,
      "timestamp": "2026-01-05T14:04:07.815776"
    },
    {
      "sample_id": "squad_572eaf3fcb0c0d14000f1477",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 311,
      "natural_ratio": 0.00237274169921875,
      "max_context_tokens": 131072,
      "context_length": 521,
      "question_length": 61,
      "prediction": "third largest and third most populous island",
      "reference": "third largest",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 0.10006475448608398,
      "timestamp": "2026-01-05T14:04:07.916020"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_12433",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 38935,
      "natural_ratio": 0.29705047607421875,
      "max_context_tokens": 131072,
      "context_length": 77754,
      "question_length": 76,
      "prediction": "full pardon for every criminal act committed in the United States",
      "reference": "Snake will be granted a pardon which will release him of his charges.",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 1.353691577911377,
      "timestamp": "2026-01-05T14:04:09.270023"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_6525",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 123723,
      "natural_ratio": 0.9439315795898438,
      "max_context_tokens": 131072,
      "context_length": 247366,
      "question_length": 41,
      "prediction": "Vietnam War",
      "reference": "Vietnam War",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 4.462153673171997,
      "timestamp": "2026-01-05T14:04:13.732546"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18192",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 75869,
      "natural_ratio": 0.5788345336914062,
      "max_context_tokens": 131072,
      "context_length": 151669,
      "question_length": 29,
      "prediction": "Rudy becomes jealous when he sees the Englishman showing affection to Babette.",
      "reference": "Babette flirts with her cousin",
      "metrics": {
        "f1": 0.8421052631578948
      },
      "elapsed_time": 2.4163365364074707,
      "timestamp": "2026-01-05T14:04:16.149581"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21737",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113940,
      "natural_ratio": 0.869293212890625,
      "max_context_tokens": 131072,
      "context_length": 227820,
      "question_length": 21,
      "prediction": "the new boss in the future",
      "reference": "Cid",
      "metrics": {
        "f1": 0.125
      },
      "elapsed_time": 3.871901273727417,
      "timestamp": "2026-01-05T14:04:20.021858"
    },
    {
      "sample_id": "squad_570d2bfafed7b91900d45ca0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 602,
      "natural_ratio": 0.0045928955078125,
      "max_context_tokens": 131072,
      "context_length": 1100,
      "question_length": 64,
      "prediction": "MacBook Air and MacBook Pro",
      "reference": "MacBooks",
      "metrics": {
        "f1": 0.631578947368421
      },
      "elapsed_time": 0.0968935489654541,
      "timestamp": "2026-01-05T14:04:20.119288"
    },
    {
      "sample_id": "squad_56f97f299b226e1400dd14c6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 421,
      "natural_ratio": 0.00321197509765625,
      "max_context_tokens": 131072,
      "context_length": 741,
      "question_length": 62,
      "prediction": "cochlea",
      "reference": "cochlea",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05587649345397949,
      "timestamp": "2026-01-05T14:04:20.175300"
    },
    {
      "sample_id": "squad_5726d965f1498d1400e8ed0e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 370,
      "natural_ratio": 0.0028228759765625,
      "max_context_tokens": 131072,
      "context_length": 640,
      "question_length": 60,
      "prediction": "Soviet NKVD troops",
      "reference": "Soviet NKVD",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.08584189414978027,
      "timestamp": "2026-01-05T14:04:20.261290"
    },
    {
      "sample_id": "squad_56fc975cb53dbe1900755136",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 254,
      "natural_ratio": 0.0019378662109375,
      "max_context_tokens": 131072,
      "context_length": 417,
      "question_length": 51,
      "prediction": "Prosody",
      "reference": "prosody",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04123687744140625,
      "timestamp": "2026-01-05T14:04:20.302665"
    },
    {
      "sample_id": "squad_57070f769e06ca38007e9360",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 352,
      "natural_ratio": 0.002685546875,
      "max_context_tokens": 131072,
      "context_length": 630,
      "question_length": 35,
      "prediction": "By appearance, not heredity.",
      "reference": "like the colors of the spectrum",
      "metrics": {
        "f1": 0.5806451612903225
      },
      "elapsed_time": 0.10509133338928223,
      "timestamp": "2026-01-05T14:04:20.407900"
    },
    {
      "sample_id": "squad_56e6d988de9d371400068085",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 237,
      "natural_ratio": 0.00180816650390625,
      "max_context_tokens": 131072,
      "context_length": 347,
      "question_length": 87,
      "prediction": "Urban",
      "reference": "urban",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.030704975128173828,
      "timestamp": "2026-01-05T14:04:20.438734"
    },
    {
      "sample_id": "squad_5726e2f1708984140094d4cd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 470,
      "natural_ratio": 0.0035858154296875,
      "max_context_tokens": 131072,
      "context_length": 869,
      "question_length": 31,
      "prediction": "In 1952, following a referendum, Baden, Württemberg-Baden, and Württemberg-Hohenzollern merged into Baden-Württemberg.",
      "reference": "following a referendum",
      "metrics": {
        "f1": 0.3142857142857143
      },
      "elapsed_time": 0.46425390243530273,
      "timestamp": "2026-01-05T14:04:20.903156"
    },
    {
      "sample_id": "squad_573425204776f41900661950",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 317,
      "natural_ratio": 0.00241851806640625,
      "max_context_tokens": 131072,
      "context_length": 558,
      "question_length": 37,
      "prediction": "1940",
      "reference": "1940",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06522345542907715,
      "timestamp": "2026-01-05T14:04:20.968517"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21960",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 54139,
      "natural_ratio": 0.41304779052734375,
      "max_context_tokens": 131072,
      "context_length": 108198,
      "question_length": 40,
      "prediction": "The Man to cease making the Queen his love",
      "reference": "An apology.",
      "metrics": {
        "f1": 0.46153846153846156
      },
      "elapsed_time": 1.5528919696807861,
      "timestamp": "2026-01-05T14:04:22.521706"
    },
    {
      "sample_id": "squad_56d13b3ae7d4791d00902049",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 746,
      "natural_ratio": 0.0056915283203125,
      "max_context_tokens": 131072,
      "context_length": 1387,
      "question_length": 66,
      "prediction": "parody, satire, irony",
      "reference": "parody, satire, and irony",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09549379348754883,
      "timestamp": "2026-01-05T14:04:22.617414"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28889",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113792,
      "natural_ratio": 0.8681640625,
      "max_context_tokens": 131072,
      "context_length": 227515,
      "question_length": 30,
      "prediction": "Renard",
      "reference": "Sir Robert King",
      "metrics": {
        "f1": 0.37499999999999994
      },
      "elapsed_time": 3.833400249481201,
      "timestamp": "2026-01-05T14:04:26.451158"
    },
    {
      "sample_id": "squad_5735ad64e853931400426abd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 499,
      "natural_ratio": 0.00380706787109375,
      "max_context_tokens": 131072,
      "context_length": 895,
      "question_length": 63,
      "prediction": "Raxaul",
      "reference": "Raxaul",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06182575225830078,
      "timestamp": "2026-01-05T14:04:26.513537"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8743",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 89177,
      "natural_ratio": 0.6803665161132812,
      "max_context_tokens": 131072,
      "context_length": 178274,
      "question_length": 40,
      "prediction": "To have a picnic with friends",
      "reference": "To have a picnic",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 2.8644492626190186,
      "timestamp": "2026-01-05T14:04:29.378328"
    },
    {
      "sample_id": "squad_573002eda23a5019007fcc35",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 506,
      "natural_ratio": 0.0038604736328125,
      "max_context_tokens": 131072,
      "context_length": 905,
      "question_length": 68,
      "prediction": "the senators",
      "reference": "senators",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.05122184753417969,
      "timestamp": "2026-01-05T14:04:29.429718"
    },
    {
      "sample_id": "squad_572e88f303f9891900756745",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 446,
      "natural_ratio": 0.0034027099609375,
      "max_context_tokens": 131072,
      "context_length": 798,
      "question_length": 55,
      "prediction": "柴油发动机",
      "reference": "diesel engine",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04545331001281738,
      "timestamp": "2026-01-05T14:04:29.475314"
    },
    {
      "sample_id": "squad_570fe3fb80d9841400ab3705",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 480,
      "natural_ratio": 0.003662109375,
      "max_context_tokens": 131072,
      "context_length": 862,
      "question_length": 58,
      "prediction": "2008",
      "reference": "2008",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06628155708312988,
      "timestamp": "2026-01-05T14:04:29.541734"
    },
    {
      "sample_id": "squad_56dfbedb7aa994140058e0ea",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 177,
      "natural_ratio": 0.00135040283203125,
      "max_context_tokens": 131072,
      "context_length": 241,
      "question_length": 74,
      "prediction": "Mailbox provider",
      "reference": "A mailbox provider",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0506129264831543,
      "timestamp": "2026-01-05T14:04:29.592485"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20537",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 90496,
      "natural_ratio": 0.6904296875,
      "max_context_tokens": 131072,
      "context_length": 180911,
      "question_length": 42,
      "prediction": "Fett Ship",
      "reference": "Slave I",
      "metrics": {
        "f1": 0.5333333333333333
      },
      "elapsed_time": 4.098646879196167,
      "timestamp": "2026-01-05T14:04:33.691498"
    },
    {
      "sample_id": "squad_56be96653aeaaa14008c9119",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 622,
      "natural_ratio": 0.0047454833984375,
      "max_context_tokens": 131072,
      "context_length": 1132,
      "question_length": 72,
      "prediction": "thriller",
      "reference": "thriller",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05591225624084473,
      "timestamp": "2026-01-05T14:04:33.747569"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18379",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101192,
      "natural_ratio": 0.77203369140625,
      "max_context_tokens": 131072,
      "context_length": 202298,
      "question_length": 46,
      "prediction": "Major",
      "reference": "She was a major in the Salvation Army.",
      "metrics": {
        "f1": 0.45454545454545453
      },
      "elapsed_time": 3.7779219150543213,
      "timestamp": "2026-01-05T14:04:37.525868"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28716",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 13529,
      "natural_ratio": 0.10321807861328125,
      "max_context_tokens": 131072,
      "context_length": 26984,
      "question_length": 34,
      "prediction": "The woman eats the little bear's porridge.",
      "reference": "The Wee Bear.",
      "metrics": {
        "f1": 0.6428571428571429
      },
      "elapsed_time": 0.38641858100891113,
      "timestamp": "2026-01-05T14:04:37.912542"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_27260",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 21333,
      "natural_ratio": 0.16275787353515625,
      "max_context_tokens": 131072,
      "context_length": 42577,
      "question_length": 49,
      "prediction": "The Skin Horse",
      "reference": "The Skin Horse",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.4645223617553711,
      "timestamp": "2026-01-05T14:04:38.377370"
    },
    {
      "sample_id": "squad_572f7a7704bcaa1900d769de",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 589,
      "natural_ratio": 0.00449371337890625,
      "max_context_tokens": 131072,
      "context_length": 1093,
      "question_length": 45,
      "prediction": "to bring sampled and electronic sounds to the pop mainstream",
      "reference": "attempt to bring sampled and electronic sounds to the pop mainstream",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.1293935775756836,
      "timestamp": "2026-01-05T14:04:38.506947"
    },
    {
      "sample_id": "squad_56f858c6aef2371900626007",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 144,
      "natural_ratio": 0.0010986328125,
      "max_context_tokens": 131072,
      "context_length": 199,
      "question_length": 50,
      "prediction": "English Civil War",
      "reference": "English Civil War",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05108070373535156,
      "timestamp": "2026-01-05T14:04:38.558185"
    },
    {
      "sample_id": "squad_573039c004bcaa1900d773c8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 384,
      "natural_ratio": 0.0029296875,
      "max_context_tokens": 131072,
      "context_length": 651,
      "question_length": 78,
      "prediction": "infantrymen",
      "reference": "cohort",
      "metrics": {
        "f1": 0.2857142857142857
      },
      "elapsed_time": 0.055402278900146484,
      "timestamp": "2026-01-05T14:04:38.613769"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11146",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 106746,
      "natural_ratio": 0.8144073486328125,
      "max_context_tokens": 131072,
      "context_length": 213413,
      "question_length": 40,
      "prediction": "the Captain",
      "reference": "Captain McCrea.",
      "metrics": {
        "f1": 0.7999999999999999
      },
      "elapsed_time": 3.25877046585083,
      "timestamp": "2026-01-05T14:04:41.872904"
    },
    {
      "sample_id": "squad_572629b789a1e219009ac484",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 663,
      "natural_ratio": 0.00505828857421875,
      "max_context_tokens": 131072,
      "context_length": 1245,
      "question_length": 42,
      "prediction": "Justinian",
      "reference": "Justinian",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05483293533325195,
      "timestamp": "2026-01-05T14:04:41.928261"
    },
    {
      "sample_id": "squad_570dc39016d0071400510d5b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 496,
      "natural_ratio": 0.0037841796875,
      "max_context_tokens": 131072,
      "context_length": 820,
      "question_length": 132,
      "prediction": "Enhancement motives",
      "reference": "enjoyment",
      "metrics": {
        "f1": 0.5263157894736842
      },
      "elapsed_time": 0.055893898010253906,
      "timestamp": "2026-01-05T14:04:41.984288"
    },
    {
      "sample_id": "squad_57266b63708984140094c562",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 351,
      "natural_ratio": 0.00267791748046875,
      "max_context_tokens": 131072,
      "context_length": 582,
      "question_length": 81,
      "prediction": "The Sandberg Game",
      "reference": "The Sandberg Game",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06506538391113281,
      "timestamp": "2026-01-05T14:04:42.049485"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9317",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105197,
      "natural_ratio": 0.8025894165039062,
      "max_context_tokens": 131072,
      "context_length": 210291,
      "question_length": 64,
      "prediction": "gets a damn cappuccino",
      "reference": "By drinking a cappuccino.",
      "metrics": {
        "f1": 0.689655172413793
      },
      "elapsed_time": 3.4150900840759277,
      "timestamp": "2026-01-05T14:04:45.464909"
    },
    {
      "sample_id": "squad_572a50757a1753140016aea1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 658,
      "natural_ratio": 0.0050201416015625,
      "max_context_tokens": 131072,
      "context_length": 1226,
      "question_length": 50,
      "prediction": "too much magnesium or vitamin C or undigested lactose",
      "reference": "too much magnesium or vitamin C or undigested lactose",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.15776491165161133,
      "timestamp": "2026-01-05T14:04:45.622884"
    },
    {
      "sample_id": "squad_57342891d058e614000b6a5b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 452,
      "natural_ratio": 0.003448486328125,
      "max_context_tokens": 131072,
      "context_length": 801,
      "question_length": 64,
      "prediction": "CSS Virginia",
      "reference": "CSS Virginia",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04308032989501953,
      "timestamp": "2026-01-05T14:04:45.666128"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25507",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 85273,
      "natural_ratio": 0.6505813598632812,
      "max_context_tokens": 131072,
      "context_length": 170470,
      "question_length": 37,
      "prediction": "Jacques",
      "reference": "Enzo",
      "metrics": {
        "f1": 0.18181818181818182
      },
      "elapsed_time": 3.856231689453125,
      "timestamp": "2026-01-05T14:04:49.522739"
    },
    {
      "sample_id": "squad_5727e3ec3acd2414000def17",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 673,
      "natural_ratio": 0.00513458251953125,
      "max_context_tokens": 131072,
      "context_length": 1255,
      "question_length": 52,
      "prediction": "intellectual",
      "reference": "intellectual property",
      "metrics": {
        "f1": 0.761904761904762
      },
      "elapsed_time": 0.0553433895111084,
      "timestamp": "2026-01-05T14:04:49.578265"
    },
    {
      "sample_id": "squad_572b415834ae481900dead53",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 292,
      "natural_ratio": 0.002227783203125,
      "max_context_tokens": 131072,
      "context_length": 505,
      "question_length": 40,
      "prediction": "Vienna Circle",
      "reference": "Logical empiricism",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.05541181564331055,
      "timestamp": "2026-01-05T14:04:49.633838"
    },
    {
      "sample_id": "squad_571de5e2556973190063909e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 514,
      "natural_ratio": 0.0039215087890625,
      "max_context_tokens": 131072,
      "context_length": 901,
      "question_length": 87,
      "prediction": "eugenics",
      "reference": "eugenics and ideas of racial purity",
      "metrics": {
        "f1": 0.5833333333333334
      },
      "elapsed_time": 0.057605743408203125,
      "timestamp": "2026-01-05T14:04:49.691640"
    },
    {
      "sample_id": "squad_5709fb5e4103511400d594cf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 394,
      "natural_ratio": 0.0030059814453125,
      "max_context_tokens": 131072,
      "context_length": 697,
      "question_length": 51,
      "prediction": "1937",
      "reference": "1937",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06688570976257324,
      "timestamp": "2026-01-05T14:04:49.758672"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_12964",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 82373,
      "natural_ratio": 0.6284561157226562,
      "max_context_tokens": 131072,
      "context_length": 164642,
      "question_length": 65,
      "prediction": "Together in Paris",
      "reference": "\"Together in Paris.\"",
      "metrics": {
        "f1": 0.923076923076923
      },
      "elapsed_time": 2.8265676498413086,
      "timestamp": "2026-01-05T14:04:52.585578"
    },
    {
      "sample_id": "squad_56e6f84ede9d37140006810c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 270,
      "natural_ratio": 0.0020599365234375,
      "max_context_tokens": 131072,
      "context_length": 431,
      "question_length": 70,
      "prediction": "beautiful music and easy listening",
      "reference": "beautiful music and easy listening",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07899141311645508,
      "timestamp": "2026-01-05T14:04:52.664753"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24843",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 77695,
      "natural_ratio": 0.5927658081054688,
      "max_context_tokens": 131072,
      "context_length": 155315,
      "question_length": 35,
      "prediction": "the French Revolution",
      "reference": "French Revolutionary army.",
      "metrics": {
        "f1": 0.8666666666666666
      },
      "elapsed_time": 2.107273578643799,
      "timestamp": "2026-01-05T14:04:54.772378"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28331",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 78030,
      "natural_ratio": 0.5953216552734375,
      "max_context_tokens": 131072,
      "context_length": 155970,
      "question_length": 51,
      "prediction": "buys it",
      "reference": "He has it transferred to his backyard for the kids to enjoy.",
      "metrics": {
        "f1": 0.48
      },
      "elapsed_time": 2.297839879989624,
      "timestamp": "2026-01-05T14:04:57.070559"
    },
    {
      "sample_id": "squad_56f8d8959e9bad19000a05e2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 193,
      "natural_ratio": 0.00147247314453125,
      "max_context_tokens": 131072,
      "context_length": 288,
      "question_length": 59,
      "prediction": "PRS (Party for Social Renewal)",
      "reference": "PRS (Party for Social Renewal)",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.10718107223510742,
      "timestamp": "2026-01-05T14:04:57.177921"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30355",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 65054,
      "natural_ratio": 0.4963226318359375,
      "max_context_tokens": 131072,
      "context_length": 130015,
      "question_length": 53,
      "prediction": "The man is missing in the morning after the first night.",
      "reference": "One of the paddles, and some food.",
      "metrics": {
        "f1": 0.7586206896551724
      },
      "elapsed_time": 0.23326325416564941,
      "timestamp": "2026-01-05T14:04:57.411460"
    },
    {
      "sample_id": "squad_56e7b4f737bdd419002c43b3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 233,
      "natural_ratio": 0.00177764892578125,
      "max_context_tokens": 131072,
      "context_length": 384,
      "question_length": 43,
      "prediction": "2001",
      "reference": "2001",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06387567520141602,
      "timestamp": "2026-01-05T14:04:57.475497"
    },
    {
      "sample_id": "squad_57292b1f6aef051400154b20",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 684,
      "natural_ratio": 0.005218505859375,
      "max_context_tokens": 131072,
      "context_length": 1288,
      "question_length": 40,
      "prediction": "Abgeordnetenhaus",
      "reference": "Abgeordnetenhaus",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09079551696777344,
      "timestamp": "2026-01-05T14:04:57.566433"
    },
    {
      "sample_id": "squad_572827c62ca10214002d9f6a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 375,
      "natural_ratio": 0.00286102294921875,
      "max_context_tokens": 131072,
      "context_length": 669,
      "question_length": 42,
      "prediction": "The Bible",
      "reference": "the Bible",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.044820308685302734,
      "timestamp": "2026-01-05T14:04:57.611392"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7798",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105491,
      "natural_ratio": 0.8048324584960938,
      "max_context_tokens": 131072,
      "context_length": 210849,
      "question_length": 94,
      "prediction": "autonomy",
      "reference": "Free will.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.4828591346740723,
      "timestamp": "2026-01-05T14:05:01.094593"
    },
    {
      "sample_id": "squad_56e161bfcd28a01900c67845",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 611,
      "natural_ratio": 0.00466156005859375,
      "max_context_tokens": 131072,
      "context_length": 1145,
      "question_length": 37,
      "prediction": "The Boston Red Sox play their games at Fenway Park.",
      "reference": "The Boston Red Sox",
      "metrics": {
        "f1": 0.5714285714285715
      },
      "elapsed_time": 0.15798711776733398,
      "timestamp": "2026-01-05T14:05:01.252758"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7874",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 48475,
      "natural_ratio": 0.36983489990234375,
      "max_context_tokens": 131072,
      "context_length": 96842,
      "question_length": 69,
      "prediction": "Mr. Crow's cawing",
      "reference": "He sits on a bee",
      "metrics": {
        "f1": 0.5217391304347826
      },
      "elapsed_time": 1.3005316257476807,
      "timestamp": "2026-01-05T14:05:02.553620"
    },
    {
      "sample_id": "squad_5707199590286e26004fc8f5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 511,
      "natural_ratio": 0.00389862060546875,
      "max_context_tokens": 131072,
      "context_length": 941,
      "question_length": 41,
      "prediction": "the paz porfiriana",
      "reference": "paz porfiriana",
      "metrics": {
        "f1": 0.8750000000000001
      },
      "elapsed_time": 0.07981657981872559,
      "timestamp": "2026-01-05T14:05:02.633604"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31754",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 124250,
      "natural_ratio": 0.9479522705078125,
      "max_context_tokens": 131072,
      "context_length": 248406,
      "question_length": 54,
      "prediction": "Debbie is taken away by police with other women from the shower. Rick's ex girlfriend, Tina, who is at the scene, takes Debbie away.",
      "reference": "Cole kidnaps her",
      "metrics": {
        "f1": 0.717948717948718
      },
      "elapsed_time": 3.540339708328247,
      "timestamp": "2026-01-05T14:05:06.174279"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14490",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 62040,
      "natural_ratio": 0.47332763671875,
      "max_context_tokens": 131072,
      "context_length": 123995,
      "question_length": 46,
      "prediction": "Ruth's declaration of love and devotion to God",
      "reference": "Her husband and both sons had all died. ",
      "metrics": {
        "f1": 0.7878787878787877
      },
      "elapsed_time": 1.7729854583740234,
      "timestamp": "2026-01-05T14:05:07.947923"
    },
    {
      "sample_id": "squad_5725b77c271a42140099d092",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 434,
      "natural_ratio": 0.0033111572265625,
      "max_context_tokens": 131072,
      "context_length": 758,
      "question_length": 70,
      "prediction": "1922",
      "reference": "1922",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0705103874206543,
      "timestamp": "2026-01-05T14:05:08.018613"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_235",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 32294,
      "natural_ratio": 0.2463836669921875,
      "max_context_tokens": 131072,
      "context_length": 64510,
      "question_length": 38,
      "prediction": "Acceptance of the oneness of mankind",
      "reference": "the thought that mankind is one human family",
      "metrics": {
        "f1": 0.8125
      },
      "elapsed_time": 0.7329909801483154,
      "timestamp": "2026-01-05T14:05:08.751888"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22046",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 100482,
      "natural_ratio": 0.7666168212890625,
      "max_context_tokens": 131072,
      "context_length": 200864,
      "question_length": 61,
      "prediction": "It turns human into monster.",
      "reference": "It decomposes them.",
      "metrics": {
        "f1": 0.7199999999999999
      },
      "elapsed_time": 2.5376875400543213,
      "timestamp": "2026-01-05T14:05:11.289939"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4288",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 92064,
      "natural_ratio": 0.702392578125,
      "max_context_tokens": 131072,
      "context_length": 184037,
      "question_length": 51,
      "prediction": "Because the air in the room was strongly hyperoxygenated, and the ether in the outside world was not toxic to humans.",
      "reference": "The Earth had passed the poison belt",
      "metrics": {
        "f1": 0.7567567567567568
      },
      "elapsed_time": 3.126255512237549,
      "timestamp": "2026-01-05T14:05:14.416576"
    },
    {
      "sample_id": "squad_570d2d61b3d812140066d4f1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 451,
      "natural_ratio": 0.00344085693359375,
      "max_context_tokens": 131072,
      "context_length": 792,
      "question_length": 70,
      "prediction": "In May 2010",
      "reference": "May 2010",
      "metrics": {
        "f1": 0.8421052631578948
      },
      "elapsed_time": 0.10197591781616211,
      "timestamp": "2026-01-05T14:05:14.518719"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16988",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 41294,
      "natural_ratio": 0.3150482177734375,
      "max_context_tokens": 131072,
      "context_length": 82487,
      "question_length": 61,
      "prediction": "Nicolette's father",
      "reference": "He is her father.",
      "metrics": {
        "f1": 0.75
      },
      "elapsed_time": 1.11612868309021,
      "timestamp": "2026-01-05T14:05:15.635198"
    },
    {
      "sample_id": "squad_57267187dd62a815002e850f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 354,
      "natural_ratio": 0.0027008056640625,
      "max_context_tokens": 131072,
      "context_length": 612,
      "question_length": 56,
      "prediction": "20th century",
      "reference": "the 20th century",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07037496566772461,
      "timestamp": "2026-01-05T14:05:15.705791"
    },
    {
      "sample_id": "squad_5727a3d0ff5b5019007d919f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 495,
      "natural_ratio": 0.00377655029296875,
      "max_context_tokens": 131072,
      "context_length": 894,
      "question_length": 57,
      "prediction": "Many of them turned to stone-crushing, street hustling, and prostitution.",
      "reference": "stone-crushing, street hustling, and prostitution\"",
      "metrics": {
        "f1": 0.8780487804878049
      },
      "elapsed_time": 0.18979382514953613,
      "timestamp": "2026-01-05T14:05:15.895759"
    },
    {
      "sample_id": "squad_571a7ebb10f8ca14003050a7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 443,
      "natural_ratio": 0.00337982177734375,
      "max_context_tokens": 131072,
      "context_length": 785,
      "question_length": 61,
      "prediction": "25%",
      "reference": "a quarter",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.055880069732666016,
      "timestamp": "2026-01-05T14:05:15.951775"
    },
    {
      "sample_id": "squad_57299cb33f37b319004784f9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 309,
      "natural_ratio": 0.00235748291015625,
      "max_context_tokens": 131072,
      "context_length": 538,
      "question_length": 40,
      "prediction": "480 million years ago during the Ordovician Period",
      "reference": "roughly 480 million years ago",
      "metrics": {
        "f1": 0.8717948717948718
      },
      "elapsed_time": 0.1461799144744873,
      "timestamp": "2026-01-05T14:05:16.098093"
    },
    {
      "sample_id": "squad_56e147e6cd28a01900c6772b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 328,
      "natural_ratio": 0.00250244140625,
      "max_context_tokens": 131072,
      "context_length": 507,
      "question_length": 109,
      "prediction": "production",
      "reference": "movie production",
      "metrics": {
        "f1": 0.8181818181818181
      },
      "elapsed_time": 0.03491497039794922,
      "timestamp": "2026-01-05T14:05:16.133136"
    },
    {
      "sample_id": "squad_570bd90fec8fbc190045bb76",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 345,
      "natural_ratio": 0.00263214111328125,
      "max_context_tokens": 131072,
      "context_length": 609,
      "question_length": 41,
      "prediction": "the Bemer-Ross Code",
      "reference": "Bemer-Ross Code",
      "metrics": {
        "f1": 0.8823529411764706
      },
      "elapsed_time": 0.0890960693359375,
      "timestamp": "2026-01-05T14:05:16.222360"
    },
    {
      "sample_id": "squad_5735c47ae853931400426b66",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 483,
      "natural_ratio": 0.00368499755859375,
      "max_context_tokens": 131072,
      "context_length": 887,
      "question_length": 39,
      "prediction": "Kirat",
      "reference": "Kirat",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04507637023925781,
      "timestamp": "2026-01-05T14:05:16.267566"
    },
    {
      "sample_id": "squad_57282f18ff5b5019007d9eab",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 391,
      "natural_ratio": 0.00298309326171875,
      "max_context_tokens": 131072,
      "context_length": 698,
      "question_length": 44,
      "prediction": "9 June",
      "reference": "9 June",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0445256233215332,
      "timestamp": "2026-01-05T14:05:16.312219"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21765",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 110900,
      "natural_ratio": 0.846099853515625,
      "max_context_tokens": 131072,
      "context_length": 221711,
      "question_length": 49,
      "prediction": "fourth",
      "reference": "The 4th round",
      "metrics": {
        "f1": 0.625
      },
      "elapsed_time": 3.4013633728027344,
      "timestamp": "2026-01-05T14:05:19.713935"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31365",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 84445,
      "natural_ratio": 0.6442642211914062,
      "max_context_tokens": 131072,
      "context_length": 168813,
      "question_length": 38,
      "prediction": "England",
      "reference": "ENGLAND",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.162001371383667,
      "timestamp": "2026-01-05T14:05:19.876671"
    },
    {
      "sample_id": "squad_572a48aefed8de19000d5b6d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 338,
      "natural_ratio": 0.0025787353515625,
      "max_context_tokens": 131072,
      "context_length": 544,
      "question_length": 92,
      "prediction": "Artspace",
      "reference": "Artspace",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05707883834838867,
      "timestamp": "2026-01-05T14:05:19.933913"
    },
    {
      "sample_id": "squad_5726660bdd62a815002e83a2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 320,
      "natural_ratio": 0.00244140625,
      "max_context_tokens": 131072,
      "context_length": 523,
      "question_length": 78,
      "prediction": "The virus possibly could mutate to become highly virulent and infectious in humans and cause an influenza pandemic.",
      "reference": "The virus possibly could mutate to become highly virulent and infectious in humans and cause an influenza pandemic.",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.2277209758758545,
      "timestamp": "2026-01-05T14:05:20.161793"
    },
    {
      "sample_id": "squad_57313d70497a881900248cb9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 311,
      "natural_ratio": 0.00237274169921875,
      "max_context_tokens": 131072,
      "context_length": 545,
      "question_length": 38,
      "prediction": "tourism, agriculture, industry and services",
      "reference": "tourism, agriculture, industry and services",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.10487723350524902,
      "timestamp": "2026-01-05T14:05:20.266823"
    },
    {
      "sample_id": "squad_5727c82c3acd2414000dec43",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 790,
      "natural_ratio": 0.0060272216796875,
      "max_context_tokens": 131072,
      "context_length": 1451,
      "question_length": 90,
      "prediction": "Because it would have been encumbered by scholarly footnotes and much more technical detail.",
      "reference": "would have been encumbered by scholarly footnotes and much more technical detail",
      "metrics": {
        "f1": 0.9302325581395349
      },
      "elapsed_time": 0.21258068084716797,
      "timestamp": "2026-01-05T14:05:20.479598"
    },
    {
      "sample_id": "squad_5709b165ed30961900e84426",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 425,
      "natural_ratio": 0.00324249267578125,
      "max_context_tokens": 131072,
      "context_length": 736,
      "question_length": 74,
      "prediction": "The person must be deceased.",
      "reference": "deceased",
      "metrics": {
        "f1": 0.4444444444444445
      },
      "elapsed_time": 0.08762431144714355,
      "timestamp": "2026-01-05T14:05:20.567401"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22920",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 121873,
      "natural_ratio": 0.9298171997070312,
      "max_context_tokens": 131072,
      "context_length": 243651,
      "question_length": 56,
      "prediction": "Three months later.",
      "reference": "When he saw others' discriminatory behavior toward Beckett at the law library.",
      "metrics": {
        "f1": 0.7272727272727273
      },
      "elapsed_time": 3.6111278533935547,
      "timestamp": "2026-01-05T14:05:24.178915"
    },
    {
      "sample_id": "squad_5726d859dd62a815002e924f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 315,
      "natural_ratio": 0.00240325927734375,
      "max_context_tokens": 131072,
      "context_length": 556,
      "question_length": 34,
      "prediction": "1 September",
      "reference": "1 September",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04987359046936035,
      "timestamp": "2026-01-05T14:05:24.229314"
    },
    {
      "sample_id": "squad_572616c289a1e219009ac232",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 424,
      "natural_ratio": 0.00323486328125,
      "max_context_tokens": 131072,
      "context_length": 746,
      "question_length": 62,
      "prediction": "Arsenal Ladies",
      "reference": "Arsenal Ladies",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06281805038452148,
      "timestamp": "2026-01-05T14:05:24.292269"
    },
    {
      "sample_id": "squad_57302231b2c2fd1400568917",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 476,
      "natural_ratio": 0.003631591796875,
      "max_context_tokens": 131072,
      "context_length": 845,
      "question_length": 67,
      "prediction": "August 16, 2012",
      "reference": "August 16, 2012",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.12809324264526367,
      "timestamp": "2026-01-05T14:05:24.420490"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22254",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 75312,
      "natural_ratio": 0.5745849609375,
      "max_context_tokens": 131072,
      "context_length": 150534,
      "question_length": 50,
      "prediction": "home",
      "reference": "Back home",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 2.3240389823913574,
      "timestamp": "2026-01-05T14:05:26.744877"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32286",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 44778,
      "natural_ratio": 0.3416290283203125,
      "max_context_tokens": 131072,
      "context_length": 89437,
      "question_length": 80,
      "prediction": "at the dwelling of Lola",
      "reference": "He was going to buy some more wine.",
      "metrics": {
        "f1": 0.6666666666666667
      },
      "elapsed_time": 1.3006207942962646,
      "timestamp": "2026-01-05T14:05:28.045881"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10650",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 114367,
      "natural_ratio": 0.8725509643554688,
      "max_context_tokens": 131072,
      "context_length": 228667,
      "question_length": 27,
      "prediction": "a toon",
      "reference": "A toon in disguise ",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 4.016805648803711,
      "timestamp": "2026-01-05T14:05:32.063041"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5763",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 46086,
      "natural_ratio": 0.3516082763671875,
      "max_context_tokens": 131072,
      "context_length": 92081,
      "question_length": 52,
      "prediction": "Holly's father",
      "reference": "Bounty hunters",
      "metrics": {
        "f1": 0.6956521739130435
      },
      "elapsed_time": 1.4940273761749268,
      "timestamp": "2026-01-05T14:05:33.557781"
    },
    {
      "sample_id": "squad_572801afff5b5019007d9ad1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 353,
      "natural_ratio": 0.00269317626953125,
      "max_context_tokens": 131072,
      "context_length": 606,
      "question_length": 61,
      "prediction": "Hellenistic",
      "reference": "Hellenistic",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05957198143005371,
      "timestamp": "2026-01-05T14:05:33.617524"
    },
    {
      "sample_id": "squad_57301640b2c2fd1400568834",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 448,
      "natural_ratio": 0.00341796875,
      "max_context_tokens": 131072,
      "context_length": 799,
      "question_length": 58,
      "prediction": "optimates",
      "reference": "populares",
      "metrics": {
        "f1": 0.625
      },
      "elapsed_time": 0.04651045799255371,
      "timestamp": "2026-01-05T14:05:33.664169"
    },
    {
      "sample_id": "squad_56de93f94396321400ee2a36",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 213,
      "natural_ratio": 0.00162506103515625,
      "max_context_tokens": 131072,
      "context_length": 282,
      "question_length": 104,
      "prediction": "1988",
      "reference": "1988",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06151580810546875,
      "timestamp": "2026-01-05T14:05:33.725807"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31029",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 98987,
      "natural_ratio": 0.7552108764648438,
      "max_context_tokens": 131072,
      "context_length": 197891,
      "question_length": 43,
      "prediction": "SWAT",
      "reference": "SWAT officer.",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 3.3969035148620605,
      "timestamp": "2026-01-05T14:05:37.123091"
    },
    {
      "sample_id": "squad_5726f517dd62a815002e963b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 347,
      "natural_ratio": 0.00264739990234375,
      "max_context_tokens": 131072,
      "context_length": 561,
      "question_length": 94,
      "prediction": "The United States relaxed curbs on foreign aid to Myanmar in November 2011 and announced the resumption of diplomatic relations on 13 January 2012",
      "reference": "the United States relaxed curbs on foreign aid to Myanmar in November 2011 and announced the resumption of diplomatic relations",
      "metrics": {
        "f1": 0.9304029304029303
      },
      "elapsed_time": 0.3975813388824463,
      "timestamp": "2026-01-05T14:05:37.520902"
    },
    {
      "sample_id": "squad_57320e1ee17f3d1400422637",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 370,
      "natural_ratio": 0.0028228759765625,
      "max_context_tokens": 131072,
      "context_length": 651,
      "question_length": 49,
      "prediction": "Anglicanism",
      "reference": "Anglicanism",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0573270320892334,
      "timestamp": "2026-01-05T14:05:37.578440"
    },
    {
      "sample_id": "squad_57281c262ca10214002d9e04",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 309,
      "natural_ratio": 0.00235748291015625,
      "max_context_tokens": 131072,
      "context_length": 502,
      "question_length": 77,
      "prediction": "The title of capital city of England and the United Kingdom has never been made official in law or by decree.",
      "reference": "London's status as the capital of England",
      "metrics": {
        "f1": 0.7804878048780487
      },
      "elapsed_time": 0.25003647804260254,
      "timestamp": "2026-01-05T14:05:37.828676"
    },
    {
      "sample_id": "squad_57280a4fff5b5019007d9b97",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 485,
      "natural_ratio": 0.00370025634765625,
      "max_context_tokens": 131072,
      "context_length": 896,
      "question_length": 34,
      "prediction": "Yes, Von Neumann had two younger brothers.",
      "reference": "two younger brothers",
      "metrics": {
        "f1": 0.6451612903225806
      },
      "elapsed_time": 0.14100193977355957,
      "timestamp": "2026-01-05T14:05:37.969890"
    },
    {
      "sample_id": "squad_56dee2b8c65bf219000b3dd5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 263,
      "natural_ratio": 0.00200653076171875,
      "max_context_tokens": 131072,
      "context_length": 460,
      "question_length": 26,
      "prediction": "Combat-Related Employment of Women",
      "reference": "Combat-Related Employment of Women",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08298563957214355,
      "timestamp": "2026-01-05T14:05:38.053060"
    },
    {
      "sample_id": "squad_56db2190e7c41114004b4dd5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 263,
      "natural_ratio": 0.00200653076171875,
      "max_context_tokens": 131072,
      "context_length": 437,
      "question_length": 50,
      "prediction": "4",
      "reference": "four",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.03153085708618164,
      "timestamp": "2026-01-05T14:05:38.084747"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25800",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 110898,
      "natural_ratio": 0.8460845947265625,
      "max_context_tokens": 131072,
      "context_length": 221699,
      "question_length": 58,
      "prediction": "a pair of lovebirds",
      "reference": "Mitch wants to buy lovebirds.",
      "metrics": {
        "f1": 0.6666666666666667
      },
      "elapsed_time": 5.974852800369263,
      "timestamp": "2026-01-05T14:05:44.059932"
    },
    {
      "sample_id": "squad_56d8e7d8dc89441400fdb3bd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 541,
      "natural_ratio": 0.00412750244140625,
      "max_context_tokens": 131072,
      "context_length": 977,
      "question_length": 66,
      "prediction": "Golden Gate Bridge",
      "reference": "Golden Gate Bridge",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06315827369689941,
      "timestamp": "2026-01-05T14:05:44.123612"
    },
    {
      "sample_id": "squad_56e15da4cd28a01900c67818",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 453,
      "natural_ratio": 0.00345611572265625,
      "max_context_tokens": 131072,
      "context_length": 813,
      "question_length": 53,
      "prediction": "2013",
      "reference": "in 2013",
      "metrics": {
        "f1": 0.7272727272727273
      },
      "elapsed_time": 0.06602120399475098,
      "timestamp": "2026-01-05T14:05:44.189762"
    },
    {
      "sample_id": "squad_56f7c6e1aef2371900625bff",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 219,
      "natural_ratio": 0.00167083740234375,
      "max_context_tokens": 131072,
      "context_length": 325,
      "question_length": 73,
      "prediction": "The poles of the 17th century thought the term \"szlachta\" came from the German word \"schlachten\" which means \"to slaughter\" or \"to butcher\".",
      "reference": "German \"schlachten\"",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.4185771942138672,
      "timestamp": "2026-01-05T14:05:44.608558"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31752",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 124249,
      "natural_ratio": 0.9479446411132812,
      "max_context_tokens": 131072,
      "context_length": 248406,
      "question_length": 53,
      "prediction": "Rick has to keep stopping Brad from peeing on a tree.",
      "reference": "Committing suicide",
      "metrics": {
        "f1": 0.7096774193548387
      },
      "elapsed_time": 0.295804500579834,
      "timestamp": "2026-01-05T14:05:44.904702"
    },
    {
      "sample_id": "squad_5733ad50d058e614000b6020",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 390,
      "natural_ratio": 0.0029754638671875,
      "max_context_tokens": 131072,
      "context_length": 689,
      "question_length": 52,
      "prediction": "Expulsion of the Jesuits and subsequent confiscation of their assets.",
      "reference": "Sebastião de Melo prosecuted every person involved, even women and children",
      "metrics": {
        "f1": 0.7441860465116279
      },
      "elapsed_time": 0.17097711563110352,
      "timestamp": "2026-01-05T14:05:45.076237"
    },
    {
      "sample_id": "squad_56fdcadf761e401900d28c01",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 365,
      "natural_ratio": 0.00278472900390625,
      "max_context_tokens": 131072,
      "context_length": 633,
      "question_length": 58,
      "prediction": "wheel-and-disc mechanisms",
      "reference": "wheel-and-disc",
      "metrics": {
        "f1": 0.717948717948718
      },
      "elapsed_time": 0.06598496437072754,
      "timestamp": "2026-01-05T14:05:45.142366"
    },
    {
      "sample_id": "squad_572823083acd2414000df56d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 429,
      "natural_ratio": 0.00327301025390625,
      "max_context_tokens": 131072,
      "context_length": 710,
      "question_length": 108,
      "prediction": "facial expressions",
      "reference": "facial expressions",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05513644218444824,
      "timestamp": "2026-01-05T14:05:45.197636"
    },
    {
      "sample_id": "squad_56e10a04cd28a01900c674a1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 689,
      "natural_ratio": 0.00525665283203125,
      "max_context_tokens": 131072,
      "context_length": 1288,
      "question_length": 51,
      "prediction": "November 1969",
      "reference": "November 1969",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09090781211853027,
      "timestamp": "2026-01-05T14:05:45.288677"
    },
    {
      "sample_id": "squad_57264c97dd62a815002e80dc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 646,
      "natural_ratio": 0.0049285888671875,
      "max_context_tokens": 131072,
      "context_length": 1203,
      "question_length": 50,
      "prediction": "Latvian and Lithuanian Baltic peoples",
      "reference": "Latvian and Lithuanian Baltic peoples",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.12074565887451172,
      "timestamp": "2026-01-05T14:05:45.409550"
    },
    {
      "sample_id": "squad_56e860fd00c9c71400d77676",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 139,
      "natural_ratio": 0.00106048583984375,
      "max_context_tokens": 131072,
      "context_length": 173,
      "question_length": 66,
      "prediction": "1983",
      "reference": "1983",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06044960021972656,
      "timestamp": "2026-01-05T14:05:45.470124"
    },
    {
      "sample_id": "squad_5735adcde853931400426ac6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 495,
      "natural_ratio": 0.00377655029296875,
      "max_context_tokens": 131072,
      "context_length": 894,
      "question_length": 56,
      "prediction": "Himalayas",
      "reference": "Himalayas",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.058969736099243164,
      "timestamp": "2026-01-05T14:05:45.529213"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28863",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 100822,
      "natural_ratio": 0.7692108154296875,
      "max_context_tokens": 131072,
      "context_length": 201565,
      "question_length": 40,
      "prediction": "Joey Percival is a cashier.",
      "reference": "He is a pilot",
      "metrics": {
        "f1": 0.7200000000000001
      },
      "elapsed_time": 4.160619735717773,
      "timestamp": "2026-01-05T14:05:49.690223"
    },
    {
      "sample_id": "squad_57268c78708984140094c9bf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 401,
      "natural_ratio": 0.00305938720703125,
      "max_context_tokens": 131072,
      "context_length": 719,
      "question_length": 43,
      "prediction": "Unwrapped",
      "reference": "Unwrapped",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05447125434875488,
      "timestamp": "2026-01-05T14:05:49.744925"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19282",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 32908,
      "natural_ratio": 0.251068115234375,
      "max_context_tokens": 131072,
      "context_length": 65755,
      "question_length": 22,
      "prediction": "Mary",
      "reference": "Mary Cushing",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.7653975486755371,
      "timestamp": "2026-01-05T14:05:50.510658"
    },
    {
      "sample_id": "squad_572f427604bcaa1900d767f3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 597,
      "natural_ratio": 0.00455474853515625,
      "max_context_tokens": 131072,
      "context_length": 1107,
      "question_length": 48,
      "prediction": "The Confederacy had a larger area than it could defend.",
      "reference": "The Confederacy had a larger area than it could defend",
      "metrics": {
        "f1": 0.9908256880733944
      },
      "elapsed_time": 0.15828776359558105,
      "timestamp": "2026-01-05T14:05:50.669206"
    },
    {
      "sample_id": "squad_56e16960cd28a01900c678a6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 536,
      "natural_ratio": 0.00408935546875,
      "max_context_tokens": 131072,
      "context_length": 987,
      "question_length": 46,
      "prediction": "$40 million",
      "reference": "$40 million",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06828474998474121,
      "timestamp": "2026-01-05T14:05:50.737707"
    },
    {
      "sample_id": "squad_57301a88b2c2fd1400568875",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 573,
      "natural_ratio": 0.00437164306640625,
      "max_context_tokens": 131072,
      "context_length": 1058,
      "question_length": 48,
      "prediction": "indigenous peoples",
      "reference": "indigenous peoples they encountered,",
      "metrics": {
        "f1": 0.7857142857142858
      },
      "elapsed_time": 0.05702638626098633,
      "timestamp": "2026-01-05T14:05:50.794900"
    },
    {
      "sample_id": "squad_56e78f3100c9c71400d772bf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 311,
      "natural_ratio": 0.00237274169921875,
      "max_context_tokens": 131072,
      "context_length": 442,
      "question_length": 140,
      "prediction": "state governments",
      "reference": "state governments",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04143977165222168,
      "timestamp": "2026-01-05T14:05:50.836480"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18212",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 116526,
      "natural_ratio": 0.8890228271484375,
      "max_context_tokens": 131072,
      "context_length": 232971,
      "question_length": 42,
      "prediction": "Marrows assistant was impaled on a car's lion's head grilles, the scars are too regular to be real.",
      "reference": "It was slashed through a freak accident involving a clavichord wire. ",
      "metrics": {
        "f1": 0.8095238095238095
      },
      "elapsed_time": 3.733896017074585,
      "timestamp": "2026-01-05T14:05:54.570717"
    },
    {
      "sample_id": "squad_56d10ca817492d1400aab818",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 537,
      "natural_ratio": 0.00409698486328125,
      "max_context_tokens": 131072,
      "context_length": 994,
      "question_length": 41,
      "prediction": "My Beautiful Dark Twisted Fantasy",
      "reference": "My Beautiful Dark Twisted Fantasy",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09297704696655273,
      "timestamp": "2026-01-05T14:05:54.664233"
    },
    {
      "sample_id": "squad_570b1d3d6b8089140040f720",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 677,
      "natural_ratio": 0.00516510009765625,
      "max_context_tokens": 131072,
      "context_length": 1247,
      "question_length": 68,
      "prediction": "Sega of America",
      "reference": "Sega of America",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07178044319152832,
      "timestamp": "2026-01-05T14:05:54.736174"
    },
    {
      "sample_id": "squad_5725662acc50291900b283a3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 355,
      "natural_ratio": 0.00270843505859375,
      "max_context_tokens": 131072,
      "context_length": 636,
      "question_length": 34,
      "prediction": "His budget was defeated.",
      "reference": "his budget was defeated",
      "metrics": {
        "f1": 0.9787234042553191
      },
      "elapsed_time": 0.07541632652282715,
      "timestamp": "2026-01-05T14:05:54.811735"
    },
    {
      "sample_id": "squad_5732bcead6dcfa19001e8a99",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 489,
      "natural_ratio": 0.00373077392578125,
      "max_context_tokens": 131072,
      "context_length": 903,
      "question_length": 36,
      "prediction": "batons, tear gas, riot control agents, rubber bullets, riot shields, water cannons and electroshock weapons",
      "reference": "batons, tear gas, riot control agents, rubber bullets, riot shields, water cannons and electroshock weapons",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.2601785659790039,
      "timestamp": "2026-01-05T14:05:55.072062"
    },
    {
      "sample_id": "squad_56cf3e29aab44d1400b88ed0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 498,
      "natural_ratio": 0.0037994384765625,
      "max_context_tokens": 131072,
      "context_length": 922,
      "question_length": 34,
      "prediction": "Charmian Bond",
      "reference": "Charmian Bond",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06609654426574707,
      "timestamp": "2026-01-05T14:05:55.138295"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24765",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 15311,
      "natural_ratio": 0.11681365966796875,
      "max_context_tokens": 131072,
      "context_length": 30565,
      "question_length": 18,
      "prediction": "Kate",
      "reference": "Kate.",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.32739830017089844,
      "timestamp": "2026-01-05T14:05:55.465880"
    },
    {
      "sample_id": "squad_572fde67947a6a140053cd85",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 475,
      "natural_ratio": 0.00362396240234375,
      "max_context_tokens": 131072,
      "context_length": 870,
      "question_length": 41,
      "prediction": "polymerase chain reaction",
      "reference": "polymerase chain reaction",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0763404369354248,
      "timestamp": "2026-01-05T14:05:55.542398"
    },
    {
      "sample_id": "squad_5731070b05b4da19006bcce5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 653,
      "natural_ratio": 0.00498199462890625,
      "max_context_tokens": 131072,
      "context_length": 1185,
      "question_length": 81,
      "prediction": "Every human child is born pure and immaculate.",
      "reference": "in Islam every human child is born pure and immaculate",
      "metrics": {
        "f1": 0.9743589743589743
      },
      "elapsed_time": 0.13060355186462402,
      "timestamp": "2026-01-05T14:05:55.673161"
    },
    {
      "sample_id": "squad_572b8a3b34ae481900deae87",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 364,
      "natural_ratio": 0.002777099609375,
      "max_context_tokens": 131072,
      "context_length": 628,
      "question_length": 61,
      "prediction": "Distinction between phonemic /l/ and /ʎ/",
      "reference": "a distinction between phonemic /l/ and /ʎ/",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.14599108695983887,
      "timestamp": "2026-01-05T14:05:55.819308"
    },
    {
      "sample_id": "squad_5726b6a0f1498d1400e8e87f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 330,
      "natural_ratio": 0.0025177001953125,
      "max_context_tokens": 131072,
      "context_length": 528,
      "question_length": 92,
      "prediction": "significant disruption",
      "reference": "significant disruption",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04345989227294922,
      "timestamp": "2026-01-05T14:05:55.862895"
    },
    {
      "sample_id": "squad_5727b735ff5b5019007d933e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 424,
      "natural_ratio": 0.00323486328125,
      "max_context_tokens": 131072,
      "context_length": 742,
      "question_length": 66,
      "prediction": "ISO-8859-1",
      "reference": "ISO-8859-1",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.10880017280578613,
      "timestamp": "2026-01-05T14:05:55.971833"
    },
    {
      "sample_id": "squad_570b43ccec8fbc190045b93a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 397,
      "natural_ratio": 0.00302886962890625,
      "max_context_tokens": 131072,
      "context_length": 719,
      "question_length": 36,
      "prediction": "General Elwell Otis",
      "reference": "General Elwell Otis",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07557129859924316,
      "timestamp": "2026-01-05T14:05:56.047552"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8081",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 108468,
      "natural_ratio": 0.827545166015625,
      "max_context_tokens": 131072,
      "context_length": 216815,
      "question_length": 81,
      "prediction": "The Hulk",
      "reference": "The Hulk",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.6957833766937256,
      "timestamp": "2026-01-05T14:05:59.743659"
    },
    {
      "sample_id": "squad_5727cd5c2ca10214002d96bc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 362,
      "natural_ratio": 0.0027618408203125,
      "max_context_tokens": 131072,
      "context_length": 629,
      "question_length": 55,
      "prediction": "Post modern Neo-Gothic",
      "reference": "post modern Neo-Gothic spires",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09196662902832031,
      "timestamp": "2026-01-05T14:05:59.836174"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14771",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101476,
      "natural_ratio": 0.774200439453125,
      "max_context_tokens": 131072,
      "context_length": 202850,
      "question_length": 62,
      "prediction": "Her head",
      "reference": "A chainsaw",
      "metrics": {
        "f1": 0.42857142857142855
      },
      "elapsed_time": 4.02881383895874,
      "timestamp": "2026-01-05T14:06:03.865343"
    },
    {
      "sample_id": "squad_5726a2b75951b619008f7875",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 615,
      "natural_ratio": 0.00469207763671875,
      "max_context_tokens": 131072,
      "context_length": 1131,
      "question_length": 60,
      "prediction": "ecoParq",
      "reference": "ecoParq",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0660402774810791,
      "timestamp": "2026-01-05T14:06:03.931572"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1664",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 91294,
      "natural_ratio": 0.6965179443359375,
      "max_context_tokens": 131072,
      "context_length": 182517,
      "question_length": 31,
      "prediction": "Frank Carter",
      "reference": "Frank",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.15018534660339355,
      "timestamp": "2026-01-05T14:06:04.082070"
    },
    {
      "sample_id": "squad_5727faef3acd2414000df155",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 538,
      "natural_ratio": 0.0041046142578125,
      "max_context_tokens": 131072,
      "context_length": 976,
      "question_length": 60,
      "prediction": "Gupta Empire",
      "reference": "Gupta",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.06884288787841797,
      "timestamp": "2026-01-05T14:06:04.151080"
    },
    {
      "sample_id": "squad_56e152d7cd28a01900c677c8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 509,
      "natural_ratio": 0.00388336181640625,
      "max_context_tokens": 131072,
      "context_length": 943,
      "question_length": 35,
      "prediction": "Harvard University",
      "reference": "Harvard University",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.056641578674316406,
      "timestamp": "2026-01-05T14:06:04.207855"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1268",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 71196,
      "natural_ratio": 0.543182373046875,
      "max_context_tokens": 131072,
      "context_length": 142299,
      "question_length": 54,
      "prediction": "a portrait",
      "reference": "The portrait of his wife, Augustine",
      "metrics": {
        "f1": 0.6086956521739131
      },
      "elapsed_time": 1.9470679759979248,
      "timestamp": "2026-01-05T14:06:06.155255"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11235",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 119800,
      "natural_ratio": 0.91400146484375,
      "max_context_tokens": 131072,
      "context_length": 239521,
      "question_length": 39,
      "prediction": "Snakes and Spiders",
      "reference": "Spider and Snakes",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 4.3958961963653564,
      "timestamp": "2026-01-05T14:06:10.551537"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17595",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 104835,
      "natural_ratio": 0.7998275756835938,
      "max_context_tokens": 131072,
      "context_length": 209596,
      "question_length": 34,
      "prediction": "Because Dave rides away with Cyril's guitar",
      "reference": "Rod finds out Dave has serenaded Katherine and he mistakes Cyril for Dave and beats up the wrong person.",
      "metrics": {
        "f1": 0.8095238095238096
      },
      "elapsed_time": 3.7052557468414307,
      "timestamp": "2026-01-05T14:06:14.257499"
    },
    {
      "sample_id": "squad_572aa0a1be1ee31400cb80d7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 560,
      "natural_ratio": 0.0042724609375,
      "max_context_tokens": 131072,
      "context_length": 947,
      "question_length": 134,
      "prediction": "Canada, Australia, New Zealand, the Union of South Africa, and Newfoundland",
      "reference": "colonies such as Canada, Australia, New Zealand, the Union of South Africa, and Newfoundland",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.18627142906188965,
      "timestamp": "2026-01-05T14:06:14.443951"
    },
    {
      "sample_id": "squad_5727b2134b864d1900163aa1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 347,
      "natural_ratio": 0.00264739990234375,
      "max_context_tokens": 131072,
      "context_length": 574,
      "question_length": 80,
      "prediction": "faceting of form, spatial ambiguity, transparency, and multiplicity",
      "reference": "faceting of form, spatial ambiguity, transparency, and multiplicity.",
      "metrics": {
        "f1": 0.975609756097561
      },
      "elapsed_time": 0.16008663177490234,
      "timestamp": "2026-01-05T14:06:14.604202"
    },
    {
      "sample_id": "squad_572643ee1125e71900ae1915",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 320,
      "natural_ratio": 0.00244140625,
      "max_context_tokens": 131072,
      "context_length": 527,
      "question_length": 74,
      "prediction": "99percent",
      "reference": "99 percent",
      "metrics": {
        "f1": 0.9333333333333333
      },
      "elapsed_time": 0.05506110191345215,
      "timestamp": "2026-01-05T14:06:14.659406"
    },
    {
      "sample_id": "squad_5709b0aaed30961900e8441c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 602,
      "natural_ratio": 0.0045928955078125,
      "max_context_tokens": 131072,
      "context_length": 1098,
      "question_length": 66,
      "prediction": "Petrochemical refineries",
      "reference": "Petrochemical",
      "metrics": {
        "f1": 0.7027027027027027
      },
      "elapsed_time": 0.07647824287414551,
      "timestamp": "2026-01-05T14:06:14.736037"
    },
    {
      "sample_id": "squad_56cee14aaab44d1400b88bcb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 377,
      "natural_ratio": 0.00287628173828125,
      "max_context_tokens": 131072,
      "context_length": 681,
      "question_length": 33,
      "prediction": "Liu Shaokun",
      "reference": "Liu Shaokun",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07543611526489258,
      "timestamp": "2026-01-05T14:06:14.811609"
    },
    {
      "sample_id": "squad_572ff927947a6a140053cebf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 365,
      "natural_ratio": 0.00278472900390625,
      "max_context_tokens": 131072,
      "context_length": 608,
      "question_length": 83,
      "prediction": "around 4,395 years ago",
      "reference": "the 4th millennium BC",
      "metrics": {
        "f1": 0.35714285714285715
      },
      "elapsed_time": 0.11574649810791016,
      "timestamp": "2026-01-05T14:06:14.927504"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21497",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 82715,
      "natural_ratio": 0.6310653686523438,
      "max_context_tokens": 131072,
      "context_length": 165323,
      "question_length": 67,
      "prediction": "Troubert",
      "reference": "Troubert",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 2.442692756652832,
      "timestamp": "2026-01-05T14:06:17.370521"
    },
    {
      "sample_id": "squad_56e136e7cd28a01900c676bc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 810,
      "natural_ratio": 0.0061798095703125,
      "max_context_tokens": 131072,
      "context_length": 1501,
      "question_length": 79,
      "prediction": "tidal areas",
      "reference": "tidal areas",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06703853607177734,
      "timestamp": "2026-01-05T14:06:17.437726"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28882",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113799,
      "natural_ratio": 0.8682174682617188,
      "max_context_tokens": 131072,
      "context_length": 227515,
      "question_length": 44,
      "prediction": "Because she is in love with Renard, who survived the accident she tried to kill him for.",
      "reference": "Renard drops a hint that he and Elektra are collaborating. ",
      "metrics": {
        "f1": 0.7999999999999999
      },
      "elapsed_time": 0.3834984302520752,
      "timestamp": "2026-01-05T14:06:17.821534"
    },
    {
      "sample_id": "squad_56f7f171aef2371900625c9f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 123,
      "natural_ratio": 0.00093841552734375,
      "max_context_tokens": 131072,
      "context_length": 167,
      "question_length": 39,
      "prediction": "India",
      "reference": "India",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03387260437011719,
      "timestamp": "2026-01-05T14:06:17.855913"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17571",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 71299,
      "natural_ratio": 0.5439682006835938,
      "max_context_tokens": 131072,
      "context_length": 142518,
      "question_length": 40,
      "prediction": "Cleopatra and her sister Arsinoe",
      "reference": "Cleopatra",
      "metrics": {
        "f1": 0.43902439024390244
      },
      "elapsed_time": 0.23140263557434082,
      "timestamp": "2026-01-05T14:06:18.087596"
    },
    {
      "sample_id": "squad_56d369ca59d6e41400146324",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 358,
      "natural_ratio": 0.0027313232421875,
      "max_context_tokens": 131072,
      "context_length": 609,
      "question_length": 67,
      "prediction": "Season nine",
      "reference": "season nine",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.047486066818237305,
      "timestamp": "2026-01-05T14:06:18.135237"
    },
    {
      "sample_id": "squad_5728dacc2ca10214002da9a9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 363,
      "natural_ratio": 0.00276947021484375,
      "max_context_tokens": 131072,
      "context_length": 640,
      "question_length": 47,
      "prediction": "四所",
      "reference": "four",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.045313119888305664,
      "timestamp": "2026-01-05T14:06:18.180690"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22615",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122153,
      "natural_ratio": 0.9319534301757812,
      "max_context_tokens": 131072,
      "context_length": 244211,
      "question_length": 55,
      "prediction": "Hamer and his men lay an ambush for the Barrow gang at a barn owned by C.W.'s grandfather in Arcadia, Louisiana. After a brief gunfight, Bonnie is killed and Clyde is mortally wounded. Clyde dies the next morning, with Bonnie at his side.",
      "reference": "They are captured after stopping on the side of the road to help Ivan Moss fix a flat tire, where the police are hiding in the bushes.",
      "metrics": {
        "f1": 0.92
      },
      "elapsed_time": 5.137786388397217,
      "timestamp": "2026-01-05T14:06:23.318904"
    },
    {
      "sample_id": "squad_572e904bdfa6aa1500f8d148",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 336,
      "natural_ratio": 0.0025634765625,
      "max_context_tokens": 131072,
      "context_length": 597,
      "question_length": 36,
      "prediction": "Tuberculosis",
      "reference": "tuberculosis",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.061324119567871094,
      "timestamp": "2026-01-05T14:06:23.380755"
    },
    {
      "sample_id": "squad_572ed5ccdfa6aa1500f8d428",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 622,
      "natural_ratio": 0.0047454833984375,
      "max_context_tokens": 131072,
      "context_length": 1147,
      "question_length": 58,
      "prediction": "Russia withdrew from the war.",
      "reference": "Catherine, who ended Russia's alliance with Prussia and withdrew from the war",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.09041452407836914,
      "timestamp": "2026-01-05T14:06:23.471326"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15232",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 124001,
      "natural_ratio": 0.9460525512695312,
      "max_context_tokens": 131072,
      "context_length": 247925,
      "question_length": 38,
      "prediction": "By heat signatures",
      "reference": "By heat",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 3.8915152549743652,
      "timestamp": "2026-01-05T14:06:27.363201"
    },
    {
      "sample_id": "squad_572714d2708984140094d977",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 382,
      "natural_ratio": 0.0029144287109375,
      "max_context_tokens": 131072,
      "context_length": 643,
      "question_length": 81,
      "prediction": "consumption of carcinogenic preserved foods, such as smoked or salted food",
      "reference": "consumption of carcinogenic preserved foods",
      "metrics": {
        "f1": 0.735042735042735
      },
      "elapsed_time": 0.18400788307189941,
      "timestamp": "2026-01-05T14:06:27.547786"
    },
    {
      "sample_id": "squad_570d45acb3d812140066d612",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 308,
      "natural_ratio": 0.002349853515625,
      "max_context_tokens": 131072,
      "context_length": 529,
      "question_length": 48,
      "prediction": "Valencia",
      "reference": "centre of the city",
      "metrics": {
        "f1": 0.4444444444444444
      },
      "elapsed_time": 0.04535675048828125,
      "timestamp": "2026-01-05T14:06:27.593298"
    },
    {
      "sample_id": "squad_56dfbb8c231d4119001abd3c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 868,
      "natural_ratio": 0.006622314453125,
      "max_context_tokens": 131072,
      "context_length": 1637,
      "question_length": 60,
      "prediction": "municipal broadband",
      "reference": "municipal broadband",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07117390632629395,
      "timestamp": "2026-01-05T14:06:27.664608"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10101",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 52267,
      "natural_ratio": 0.39876556396484375,
      "max_context_tokens": 131072,
      "context_length": 104464,
      "question_length": 31,
      "prediction": "Withnail and I are arrested for shoplifting.",
      "reference": "Driving under the influence",
      "metrics": {
        "f1": 0.7096774193548386
      },
      "elapsed_time": 1.5648374557495117,
      "timestamp": "2026-01-05T14:06:29.229767"
    },
    {
      "sample_id": "squad_56ddf08d66d3e219004dae35",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 231,
      "natural_ratio": 0.00176239013671875,
      "max_context_tokens": 131072,
      "context_length": 386,
      "question_length": 37,
      "prediction": "around 2000 BC",
      "reference": "2000 BC",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.0971519947052002,
      "timestamp": "2026-01-05T14:06:29.327094"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20679",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101054,
      "natural_ratio": 0.7709808349609375,
      "max_context_tokens": 131072,
      "context_length": 202029,
      "question_length": 39,
      "prediction": "Glen Whitehouse",
      "reference": "Their father, Glen Whitehouse",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 2.3052287101745605,
      "timestamp": "2026-01-05T14:06:31.632683"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21756",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113948,
      "natural_ratio": 0.869354248046875,
      "max_context_tokens": 131072,
      "context_length": 227820,
      "question_length": 37,
      "prediction": "loopers",
      "reference": "He is a hitman.",
      "metrics": {
        "f1": 0.25
      },
      "elapsed_time": 0.19678616523742676,
      "timestamp": "2026-01-05T14:06:31.829838"
    },
    {
      "sample_id": "squad_572e8035cb0c0d14000f11e0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 466,
      "natural_ratio": 0.0035552978515625,
      "max_context_tokens": 131072,
      "context_length": 844,
      "question_length": 49,
      "prediction": "英国和法国",
      "reference": "British and French forces",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.06119394302368164,
      "timestamp": "2026-01-05T14:06:31.891550"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32251",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105026,
      "natural_ratio": 0.8012847900390625,
      "max_context_tokens": 131072,
      "context_length": 209986,
      "question_length": 27,
      "prediction": "15",
      "reference": "Fifteen ",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 7.0102455615997314,
      "timestamp": "2026-01-05T14:06:38.902146"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16399",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 82792,
      "natural_ratio": 0.63165283203125,
      "max_context_tokens": 131072,
      "context_length": 165473,
      "question_length": 72,
      "prediction": "The engagement was broken off because the sailor was going away and Ellida didn't want to be left waiting for him, even though he said he would come back.",
      "reference": "Because he murdered a captain and had to escape.",
      "metrics": {
        "f1": 0.7619047619047621
      },
      "elapsed_time": 3.1460373401641846,
      "timestamp": "2026-01-05T14:06:42.048547"
    },
    {
      "sample_id": "squad_56f7d4f7aef2371900625c24",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 223,
      "natural_ratio": 0.00170135498046875,
      "max_context_tokens": 131072,
      "context_length": 357,
      "question_length": 50,
      "prediction": "become tenants of the wealthier gentry",
      "reference": "to become tenants of the wealthier gentry.",
      "metrics": {
        "f1": 0.972972972972973
      },
      "elapsed_time": 0.11828446388244629,
      "timestamp": "2026-01-05T14:06:42.167009"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22266",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101299,
      "natural_ratio": 0.7728500366210938,
      "max_context_tokens": 131072,
      "context_length": 202518,
      "question_length": 41,
      "prediction": "Valentine has been pretending to be mad to get his father's attention, and now Valentine's father is upset that he thought he was mad.",
      "reference": "He spends too much money and is now in debt.",
      "metrics": {
        "f1": 0.7804878048780488
      },
      "elapsed_time": 4.589282274246216,
      "timestamp": "2026-01-05T14:06:46.756640"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22976",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101295,
      "natural_ratio": 0.7728195190429688,
      "max_context_tokens": 131072,
      "context_length": 202501,
      "question_length": 50,
      "prediction": "Harry was fired because he got his coworker's dog, Petey, drunk.",
      "reference": "He delivered the dogs late to the dog show and got them dirty.",
      "metrics": {
        "f1": 0.7317073170731708
      },
      "elapsed_time": 3.651280164718628,
      "timestamp": "2026-01-05T14:06:50.408312"
    },
    {
      "sample_id": "squad_5726acfa5951b619008f79c9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 469,
      "natural_ratio": 0.00357818603515625,
      "max_context_tokens": 131072,
      "context_length": 843,
      "question_length": 55,
      "prediction": "30 July",
      "reference": "30 July",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06296801567077637,
      "timestamp": "2026-01-05T14:06:50.471437"
    },
    {
      "sample_id": "squad_5727b3b72ca10214002d944c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 312,
      "natural_ratio": 0.00238037109375,
      "max_context_tokens": 131072,
      "context_length": 520,
      "question_length": 64,
      "prediction": "incompatibility with other encodings",
      "reference": "wide usage in various countries of the world but remain largely incompatible with each other",
      "metrics": {
        "f1": 0.9268292682926829
      },
      "elapsed_time": 0.09299230575561523,
      "timestamp": "2026-01-05T14:06:50.564579"
    },
    {
      "sample_id": "squad_57264ac7f1498d1400e8db57",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 459,
      "natural_ratio": 0.00350189208984375,
      "max_context_tokens": 131072,
      "context_length": 814,
      "question_length": 65,
      "prediction": "Affluent middle-class",
      "reference": "the affluent middle-class",
      "metrics": {
        "f1": 0.9655172413793104
      },
      "elapsed_time": 0.08236479759216309,
      "timestamp": "2026-01-05T14:06:50.647083"
    },
    {
      "sample_id": "squad_572734eb5951b619008f86b9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 360,
      "natural_ratio": 0.00274658203125,
      "max_context_tokens": 131072,
      "context_length": 630,
      "question_length": 51,
      "prediction": "1980-81",
      "reference": "didn't celebrate its centenary year until 1980–81",
      "metrics": {
        "f1": 0.32000000000000006
      },
      "elapsed_time": 0.09633326530456543,
      "timestamp": "2026-01-05T14:06:50.743558"
    },
    {
      "sample_id": "squad_56de739f4396321400ee292d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 520,
      "natural_ratio": 0.00396728515625,
      "max_context_tokens": 131072,
      "context_length": 935,
      "question_length": 66,
      "prediction": "orthodoxy",
      "reference": "orthodoxy",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04528546333312988,
      "timestamp": "2026-01-05T14:06:50.788972"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32378",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 108947,
      "natural_ratio": 0.8311996459960938,
      "max_context_tokens": 131072,
      "context_length": 217801,
      "question_length": 53,
      "prediction": "lovers",
      "reference": "They are lovers.",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.19716238975524902,
      "timestamp": "2026-01-05T14:06:50.986402"
    },
    {
      "sample_id": "squad_572fcc11947a6a140053ccd3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 360,
      "natural_ratio": 0.00274658203125,
      "max_context_tokens": 131072,
      "context_length": 615,
      "question_length": 65,
      "prediction": "Spirochaetes of the genus Borrelia",
      "reference": "Spirochaetes of the genus Borrelia",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.12918901443481445,
      "timestamp": "2026-01-05T14:06:51.116100"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16959",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 61932,
      "natural_ratio": 0.472503662109375,
      "max_context_tokens": 131072,
      "context_length": 123786,
      "question_length": 38,
      "prediction": "20 pounds",
      "reference": "Twenty Pounds.",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 1.7406516075134277,
      "timestamp": "2026-01-05T14:06:52.857142"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1543",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 119841,
      "natural_ratio": 0.9143142700195312,
      "max_context_tokens": 131072,
      "context_length": 239611,
      "question_length": 31,
      "prediction": "基督教",
      "reference": "Angel of Italian Art",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 4.488091707229614,
      "timestamp": "2026-01-05T14:06:57.345587"
    },
    {
      "sample_id": "squad_5730dfc8b7151e1900c0159f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 474,
      "natural_ratio": 0.0036163330078125,
      "max_context_tokens": 131072,
      "context_length": 867,
      "question_length": 42,
      "prediction": "to achieve military, diplomatic, informational, and/or economic objectives",
      "reference": "achieve military, diplomatic, informational, and/or economic objectives",
      "metrics": {
        "f1": 0.9793103448275862
      },
      "elapsed_time": 0.15472722053527832,
      "timestamp": "2026-01-05T14:06:57.500841"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28971",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 36686,
      "natural_ratio": 0.2798919677734375,
      "max_context_tokens": 131072,
      "context_length": 73259,
      "question_length": 73,
      "prediction": "Tweel helps the earthmen while they are being attacked by the dream-beasts.",
      "reference": "Tweel.",
      "metrics": {
        "f1": 0.3846153846153846
      },
      "elapsed_time": 1.0352098941802979,
      "timestamp": "2026-01-05T14:06:58.536328"
    },
    {
      "sample_id": "squad_570b3980ec8fbc190045b8e8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 1047,
      "natural_ratio": 0.00798797607421875,
      "max_context_tokens": 131072,
      "context_length": 1979,
      "question_length": 76,
      "prediction": "The War of 1812",
      "reference": "the War of 1812",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.11483883857727051,
      "timestamp": "2026-01-05T14:06:58.651346"
    },
    {
      "sample_id": "squad_5728eae64b864d190016508f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 315,
      "natural_ratio": 0.00240325927734375,
      "max_context_tokens": 131072,
      "context_length": 553,
      "question_length": 37,
      "prediction": "everyday life and in modern Japanese martial arts",
      "reference": "everyday life and in modern Japanese martial arts",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.11617040634155273,
      "timestamp": "2026-01-05T14:06:58.767648"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_27787",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96974,
      "natural_ratio": 0.7398529052734375,
      "max_context_tokens": 131072,
      "context_length": 193837,
      "question_length": 71,
      "prediction": "Timmy came back to life and became a killer.",
      "reference": "he comes back as an evil zombie",
      "metrics": {
        "f1": 0.6875
      },
      "elapsed_time": 4.764147996902466,
      "timestamp": "2026-01-05T14:07:03.532156"
    },
    {
      "sample_id": "squad_570d2095b3d812140066d45e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 296,
      "natural_ratio": 0.00225830078125,
      "max_context_tokens": 131072,
      "context_length": 509,
      "question_length": 43,
      "prediction": "1520–1522",
      "reference": "1520–1522",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.12218451499938965,
      "timestamp": "2026-01-05T14:07:03.654510"
    },
    {
      "sample_id": "squad_57290ea56aef0514001549fd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 375,
      "natural_ratio": 0.00286102294921875,
      "max_context_tokens": 131072,
      "context_length": 662,
      "question_length": 49,
      "prediction": "Testing can furnish a criticism or comparison against oracles.",
      "reference": "furnishes a criticism or comparison",
      "metrics": {
        "f1": 0.9090909090909091
      },
      "elapsed_time": 0.13675904273986816,
      "timestamp": "2026-01-05T14:07:03.791409"
    },
    {
      "sample_id": "squad_570dcb0c0dc6ce1900204cbe",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 451,
      "natural_ratio": 0.00344085693359375,
      "max_context_tokens": 131072,
      "context_length": 783,
      "question_length": 79,
      "prediction": "rise",
      "reference": "decline",
      "metrics": {
        "f1": 0.4
      },
      "elapsed_time": 0.03900408744812012,
      "timestamp": "2026-01-05T14:07:03.830546"
    },
    {
      "sample_id": "squad_57262c2c271a42140099d6f2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 294,
      "natural_ratio": 0.0022430419921875,
      "max_context_tokens": 131072,
      "context_length": 517,
      "question_length": 31,
      "prediction": "Professional wrestling is an athletic form of entertainment based on a portrayal of a combat sport.",
      "reference": "an athletic form of entertainment based on a portrayal of a combat sport",
      "metrics": {
        "f1": 0.8421052631578948
      },
      "elapsed_time": 0.19486165046691895,
      "timestamp": "2026-01-05T14:07:04.025544"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28306",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 55784,
      "natural_ratio": 0.42559814453125,
      "max_context_tokens": 131072,
      "context_length": 111486,
      "question_length": 42,
      "prediction": "The people in the building are transported back in time.",
      "reference": "They were stranded in pre- Columbian Manhattan",
      "metrics": {
        "f1": 0.85
      },
      "elapsed_time": 1.4807207584381104,
      "timestamp": "2026-01-05T14:07:05.506586"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_12422",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 38924,
      "natural_ratio": 0.296966552734375,
      "max_context_tokens": 131072,
      "context_length": 77754,
      "question_length": 55,
      "prediction": "Snake",
      "reference": "the Duke of New York",
      "metrics": {
        "f1": 0.33333333333333337
      },
      "elapsed_time": 1.255962610244751,
      "timestamp": "2026-01-05T14:07:06.762882"
    },
    {
      "sample_id": "squad_570dae6016d0071400510cc9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 423,
      "natural_ratio": 0.00322723388671875,
      "max_context_tokens": 131072,
      "context_length": 714,
      "question_length": 93,
      "prediction": "body dissatisfaction",
      "reference": "body dissatisfaction",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.049486637115478516,
      "timestamp": "2026-01-05T14:07:06.812545"
    },
    {
      "sample_id": "squad_57267077dd62a815002e84e6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 353,
      "natural_ratio": 0.00269317626953125,
      "max_context_tokens": 131072,
      "context_length": 623,
      "question_length": 44,
      "prediction": "The wrestler is accompanied by music.",
      "reference": "accompanied by music",
      "metrics": {
        "f1": 0.7017543859649124
      },
      "elapsed_time": 0.09590482711791992,
      "timestamp": "2026-01-05T14:07:06.908582"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22506",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 62482,
      "natural_ratio": 0.4766998291015625,
      "max_context_tokens": 131072,
      "context_length": 124868,
      "question_length": 56,
      "prediction": "Richard is at the Carvel Hall when he learns his grandfather is dead.",
      "reference": "America",
      "metrics": {
        "f1": 0.4347826086956522
      },
      "elapsed_time": 1.9059336185455322,
      "timestamp": "2026-01-05T14:07:08.814807"
    },
    {
      "sample_id": "squad_5726b9d05951b619008f7beb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 506,
      "natural_ratio": 0.0038604736328125,
      "max_context_tokens": 131072,
      "context_length": 877,
      "question_length": 96,
      "prediction": "提供能量",
      "reference": "provide energy",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.049164533615112305,
      "timestamp": "2026-01-05T14:07:08.864153"
    },
    {
      "sample_id": "squad_572c0399f182dd1900d7c7b9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 392,
      "natural_ratio": 0.00299072265625,
      "max_context_tokens": 131072,
      "context_length": 717,
      "question_length": 28,
      "prediction": "Leonard Mlodinow",
      "reference": "Leonard Mlodinow",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08537888526916504,
      "timestamp": "2026-01-05T14:07:08.949663"
    },
    {
      "sample_id": "squad_570971e6ed30961900e84158",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 506,
      "natural_ratio": 0.0038604736328125,
      "max_context_tokens": 131072,
      "context_length": 901,
      "question_length": 72,
      "prediction": "a malicious compact disc in a car's stereo system",
      "reference": "a malicious compact disc in a car's stereo system",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.12672734260559082,
      "timestamp": "2026-01-05T14:07:09.076527"
    },
    {
      "sample_id": "squad_56cf67c74df3c31400b0d72f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 494,
      "natural_ratio": 0.0037689208984375,
      "max_context_tokens": 131072,
      "context_length": 884,
      "question_length": 64,
      "prediction": "Larghetto of his Piano Concerto No. 1 (in E minor)",
      "reference": "Piano Concerto No. 1 (in E minor)",
      "metrics": {
        "f1": 0.8235294117647058
      },
      "elapsed_time": 0.2014923095703125,
      "timestamp": "2026-01-05T14:07:09.278155"
    },
    {
      "sample_id": "squad_573201d7b9d445190005e717",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 369,
      "natural_ratio": 0.00281524658203125,
      "max_context_tokens": 131072,
      "context_length": 651,
      "question_length": 48,
      "prediction": "The senate",
      "reference": "the senate",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04441356658935547,
      "timestamp": "2026-01-05T14:07:09.322693"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5354",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113207,
      "natural_ratio": 0.8637008666992188,
      "max_context_tokens": 131072,
      "context_length": 226304,
      "question_length": 70,
      "prediction": "he proposes to Miss Tita, who initially agrees to marry him in exchange for the papers, but then burns them anyway",
      "reference": "by marrying her",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 4.105826139450073,
      "timestamp": "2026-01-05T14:07:13.428849"
    },
    {
      "sample_id": "squad_5726d39add62a815002e9195",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 432,
      "natural_ratio": 0.0032958984375,
      "max_context_tokens": 131072,
      "context_length": 744,
      "question_length": 80,
      "prediction": "莫斯科",
      "reference": "Moscow",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04058647155761719,
      "timestamp": "2026-01-05T14:07:13.469968"
    },
    {
      "sample_id": "squad_5728ff31af94a219006a9f07",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 445,
      "natural_ratio": 0.00339508056640625,
      "max_context_tokens": 131072,
      "context_length": 797,
      "question_length": 54,
      "prediction": "19th century",
      "reference": "19th",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.06639266014099121,
      "timestamp": "2026-01-05T14:07:13.536496"
    },
    {
      "sample_id": "squad_573059f08ab72b1400f9c491",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 380,
      "natural_ratio": 0.002899169921875,
      "max_context_tokens": 131072,
      "context_length": 667,
      "question_length": 54,
      "prediction": "Greek",
      "reference": "Greek",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03421497344970703,
      "timestamp": "2026-01-05T14:07:13.570839"
    },
    {
      "sample_id": "squad_5727c8104b864d1900163cfe",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 392,
      "natural_ratio": 0.00299072265625,
      "max_context_tokens": 131072,
      "context_length": 662,
      "question_length": 82,
      "prediction": "the assumption that philosophy was man's natural perfection",
      "reference": "no longer assuming that philosophy was man's natural perfection",
      "metrics": {
        "f1": 0.9743589743589743
      },
      "elapsed_time": 0.11593008041381836,
      "timestamp": "2026-01-05T14:07:13.686907"
    },
    {
      "sample_id": "squad_57264659708984140094c11f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 473,
      "natural_ratio": 0.00360870361328125,
      "max_context_tokens": 131072,
      "context_length": 857,
      "question_length": 50,
      "prediction": "希腊的岛屿和城市",
      "reference": "Greek Isles and cities",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.07562899589538574,
      "timestamp": "2026-01-05T14:07:13.762663"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_6012",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 106847,
      "natural_ratio": 0.8151779174804688,
      "max_context_tokens": 131072,
      "context_length": 213615,
      "question_length": 39,
      "prediction": "match",
      "reference": "A light bulb.",
      "metrics": {
        "f1": 0.4
      },
      "elapsed_time": 3.7792177200317383,
      "timestamp": "2026-01-05T14:07:17.542199"
    },
    {
      "sample_id": "squad_5709986aed30961900e8436a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 618,
      "natural_ratio": 0.0047149658203125,
      "max_context_tokens": 131072,
      "context_length": 1110,
      "question_length": 87,
      "prediction": "Established Men",
      "reference": "Established Men",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06517744064331055,
      "timestamp": "2026-01-05T14:07:17.607907"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_12911",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 114988,
      "natural_ratio": 0.877288818359375,
      "max_context_tokens": 131072,
      "context_length": 229886,
      "question_length": 51,
      "prediction": "from the closet in the master bedroom",
      "reference": "An audiotape left by Muffy",
      "metrics": {
        "f1": 0.7741935483870969
      },
      "elapsed_time": 3.147040367126465,
      "timestamp": "2026-01-05T14:07:20.755319"
    },
    {
      "sample_id": "squad_57101eefa58dae1900cd68b4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 511,
      "natural_ratio": 0.00389862060546875,
      "max_context_tokens": 131072,
      "context_length": 924,
      "question_length": 59,
      "prediction": "It resulted in the loss of valuable information about the individual's sexuality as there are more than two dimensions of sexuality.",
      "reference": "A person who has only predominantly same sex reactions is different from someone with relatively little reaction but lots of same sex experience",
      "metrics": {
        "f1": 0.9090909090909091
      },
      "elapsed_time": 0.264890193939209,
      "timestamp": "2026-01-05T14:07:21.020782"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28106",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 72205,
      "natural_ratio": 0.5508804321289062,
      "max_context_tokens": 131072,
      "context_length": 144336,
      "question_length": 35,
      "prediction": "A year later, in a small Australian zoo.",
      "reference": "Marine World.",
      "metrics": {
        "f1": 0.7142857142857143
      },
      "elapsed_time": 2.2474279403686523,
      "timestamp": "2026-01-05T14:07:23.268553"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4286",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 92059,
      "natural_ratio": 0.7023544311523438,
      "max_context_tokens": 131072,
      "context_length": 184037,
      "question_length": 41,
      "prediction": "South America",
      "reference": "The Lost World",
      "metrics": {
        "f1": 0.6363636363636365
      },
      "elapsed_time": 0.1803288459777832,
      "timestamp": "2026-01-05T14:07:23.449197"
    },
    {
      "sample_id": "squad_56e151c2cd28a01900c677c1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 324,
      "natural_ratio": 0.002471923828125,
      "max_context_tokens": 131072,
      "context_length": 554,
      "question_length": 54,
      "prediction": "New England Conservatory",
      "reference": "the New England Conservatory",
      "metrics": {
        "f1": 0.967741935483871
      },
      "elapsed_time": 0.08020758628845215,
      "timestamp": "2026-01-05T14:07:23.529570"
    },
    {
      "sample_id": "squad_5726d198dd62a815002e9144",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 484,
      "natural_ratio": 0.003692626953125,
      "max_context_tokens": 131072,
      "context_length": 857,
      "question_length": 72,
      "prediction": "UK and France",
      "reference": "UK and France",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05680274963378906,
      "timestamp": "2026-01-05T14:07:23.586510"
    },
    {
      "sample_id": "squad_56f959019e9bad19000a085e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 250,
      "natural_ratio": 0.0019073486328125,
      "max_context_tokens": 131072,
      "context_length": 392,
      "question_length": 69,
      "prediction": "1905",
      "reference": "1905",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0617525577545166,
      "timestamp": "2026-01-05T14:07:23.648391"
    },
    {
      "sample_id": "squad_5723d1300dadf01500fa1f3c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 291,
      "natural_ratio": 0.00222015380859375,
      "max_context_tokens": 131072,
      "context_length": 513,
      "question_length": 29,
      "prediction": "1882",
      "reference": "1882",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06443023681640625,
      "timestamp": "2026-01-05T14:07:23.712947"
    },
    {
      "sample_id": "squad_572971b83f37b319004783d9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 378,
      "natural_ratio": 0.0028839111328125,
      "max_context_tokens": 131072,
      "context_length": 643,
      "question_length": 74,
      "prediction": "The Stern Review",
      "reference": "Stern Review",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 0.054164886474609375,
      "timestamp": "2026-01-05T14:07:23.767241"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4660",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115824,
      "natural_ratio": 0.8836669921875,
      "max_context_tokens": 131072,
      "context_length": 231558,
      "question_length": 50,
      "prediction": "一个掉下来的鞋",
      "reference": "His Penis Pump",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.001962900161743,
      "timestamp": "2026-01-05T14:07:26.769537"
    },
    {
      "sample_id": "squad_56e10565e3433e1400422adf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 697,
      "natural_ratio": 0.00531768798828125,
      "max_context_tokens": 131072,
      "context_length": 1312,
      "question_length": 43,
      "prediction": "tortoises",
      "reference": "tortoises",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0757131576538086,
      "timestamp": "2026-01-05T14:07:26.845777"
    },
    {
      "sample_id": "squad_573220fce99e3014001e653b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 564,
      "natural_ratio": 0.004302978515625,
      "max_context_tokens": 131072,
      "context_length": 1015,
      "question_length": 73,
      "prediction": "Jewish synagogues",
      "reference": "synagogues",
      "metrics": {
        "f1": 0.7407407407407407
      },
      "elapsed_time": 0.0769953727722168,
      "timestamp": "2026-01-05T14:07:26.922903"
    },
    {
      "sample_id": "squad_5725e93b38643c19005ace6d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 510,
      "natural_ratio": 0.0038909912109375,
      "max_context_tokens": 131072,
      "context_length": 948,
      "question_length": 33,
      "prediction": "Antigonus II Mattathias",
      "reference": "Antigonus II Mattathias",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09682273864746094,
      "timestamp": "2026-01-05T14:07:27.019855"
    },
    {
      "sample_id": "squad_57293a886aef051400154ba1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 656,
      "natural_ratio": 0.0050048828125,
      "max_context_tokens": 131072,
      "context_length": 1203,
      "question_length": 69,
      "prediction": "Karl von Clausewitz",
      "reference": "Karl von Clausewitz",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08200955390930176,
      "timestamp": "2026-01-05T14:07:27.101989"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22997",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101304,
      "natural_ratio": 0.77288818359375,
      "max_context_tokens": 131072,
      "context_length": 202501,
      "question_length": 67,
      "prediction": "她的手提包",
      "reference": "A suitcase with ransom money for her husband Bobby",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.20127654075622559,
      "timestamp": "2026-01-05T14:07:27.303591"
    },
    {
      "sample_id": "squad_57263c94271a42140099d7a7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 337,
      "natural_ratio": 0.00257110595703125,
      "max_context_tokens": 131072,
      "context_length": 597,
      "question_length": 37,
      "prediction": "Because it conducts heat from the filament, thereby cooling the filament and reducing efficiency.",
      "reference": "conducts heat from the filament, thereby cooling the filament",
      "metrics": {
        "f1": 0.7721518987341772
      },
      "elapsed_time": 0.1946706771850586,
      "timestamp": "2026-01-05T14:07:27.498425"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26566",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122878,
      "natural_ratio": 0.9374847412109375,
      "max_context_tokens": 131072,
      "context_length": 245670,
      "question_length": 47,
      "prediction": "Joey survives by helping Finnegan rig a torpedo to a gunship, which he drives away from the sinking cruise ship as a giant mutant sea creature tries to stop it.",
      "reference": "He used Finnegan's surfboard to paddle ashore",
      "metrics": {
        "f1": 0.7727272727272727
      },
      "elapsed_time": 3.796529531478882,
      "timestamp": "2026-01-05T14:07:31.295353"
    },
    {
      "sample_id": "squad_570e79e90b85d914000d7f2d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 748,
      "natural_ratio": 0.005706787109375,
      "max_context_tokens": 131072,
      "context_length": 1398,
      "question_length": 59,
      "prediction": "Yarra River",
      "reference": "Yarra",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.06547331809997559,
      "timestamp": "2026-01-05T14:07:31.361367"
    },
    {
      "sample_id": "squad_5727617f708984140094dcbd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 404,
      "natural_ratio": 0.003082275390625,
      "max_context_tokens": 131072,
      "context_length": 688,
      "question_length": 81,
      "prediction": "743",
      "reference": "743",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05603432655334473,
      "timestamp": "2026-01-05T14:07:31.417543"
    },
    {
      "sample_id": "squad_570c69aeb3d812140066d1e9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 446,
      "natural_ratio": 0.0034027099609375,
      "max_context_tokens": 131072,
      "context_length": 816,
      "question_length": 37,
      "prediction": "Magna Carta",
      "reference": "Magna Carta",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06507515907287598,
      "timestamp": "2026-01-05T14:07:31.482741"
    },
    {
      "sample_id": "squad_570c8d7cb3d812140066d21f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 427,
      "natural_ratio": 0.00325775146484375,
      "max_context_tokens": 131072,
      "context_length": 749,
      "question_length": 66,
      "prediction": "Joan Gamper",
      "reference": "Joan Gamper",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06516504287719727,
      "timestamp": "2026-01-05T14:07:31.548038"
    },
    {
      "sample_id": "squad_57334b0b4776f4190066080e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 413,
      "natural_ratio": 0.00315093994140625,
      "max_context_tokens": 131072,
      "context_length": 731,
      "question_length": 56,
      "prediction": "$20 billion",
      "reference": "under $20 billion",
      "metrics": {
        "f1": 0.8181818181818181
      },
      "elapsed_time": 0.06568026542663574,
      "timestamp": "2026-01-05T14:07:31.613852"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19726",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 123185,
      "natural_ratio": 0.9398269653320312,
      "max_context_tokens": 131072,
      "context_length": 246302,
      "question_length": 28,
      "prediction": "A gang of bikers raids a housing project.",
      "reference": "Roger DiMarco",
      "metrics": {
        "f1": 0.6
      },
      "elapsed_time": 5.487135887145996,
      "timestamp": "2026-01-05T14:07:37.101313"
    },
    {
      "sample_id": "squad_572e8aaec246551400ce4316",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 380,
      "natural_ratio": 0.002899169921875,
      "max_context_tokens": 131072,
      "context_length": 629,
      "question_length": 91,
      "prediction": "Vibrant new scenes began to emerge, featuring experimental sounds and broader conceptual themes in the music. These artists often had backgrounds in art and viewed their work as deeply invested in specific political or aesthetic agendas.",
      "reference": "vibrant new scenes",
      "metrics": {
        "f1": 0.16666666666666669
      },
      "elapsed_time": 0.4503035545349121,
      "timestamp": "2026-01-05T14:07:37.552198"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16972",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 41301,
      "natural_ratio": 0.31510162353515625,
      "max_context_tokens": 131072,
      "context_length": 82487,
      "question_length": 76,
      "prediction": "He had Nicolette put in a room",
      "reference": "He locked her in a tower",
      "metrics": {
        "f1": 0.8275862068965518
      },
      "elapsed_time": 0.16437625885009766,
      "timestamp": "2026-01-05T14:07:37.716865"
    },
    {
      "sample_id": "squad_56f8e0b29e9bad19000a065e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 379,
      "natural_ratio": 0.00289154052734375,
      "max_context_tokens": 131072,
      "context_length": 648,
      "question_length": 71,
      "prediction": "A retail and housing development was built on the site.",
      "reference": "a retail and housing development",
      "metrics": {
        "f1": 0.735632183908046
      },
      "elapsed_time": 0.13889813423156738,
      "timestamp": "2026-01-05T14:07:37.855947"
    },
    {
      "sample_id": "squad_5732674b0fdd8d15006c6a99",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 300,
      "natural_ratio": 0.002288818359375,
      "max_context_tokens": 131072,
      "context_length": 507,
      "question_length": 53,
      "prediction": "27",
      "reference": "27",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.044870615005493164,
      "timestamp": "2026-01-05T14:07:37.900975"
    },
    {
      "sample_id": "squad_570b0c33ec8fbc190045b80c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 307,
      "natural_ratio": 0.00234222412109375,
      "max_context_tokens": 131072,
      "context_length": 525,
      "question_length": 49,
      "prediction": "28- or 29-day February or any 30-day month",
      "reference": "short month",
      "metrics": {
        "f1": 0.4999999999999999
      },
      "elapsed_time": 0.18648409843444824,
      "timestamp": "2026-01-05T14:07:38.087620"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15574",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 106796,
      "natural_ratio": 0.814788818359375,
      "max_context_tokens": 131072,
      "context_length": 213442,
      "question_length": 111,
      "prediction": "Stacy's",
      "reference": "Reynolds' bank account.",
      "metrics": {
        "f1": 0.5217391304347826
      },
      "elapsed_time": 3.6139888763427734,
      "timestamp": "2026-01-05T14:07:41.701939"
    },
    {
      "sample_id": "squad_572941d96aef051400154c11",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 316,
      "natural_ratio": 0.002410888671875,
      "max_context_tokens": 131072,
      "context_length": 538,
      "question_length": 55,
      "prediction": "Portuguese",
      "reference": "Portuguese",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06032371520996094,
      "timestamp": "2026-01-05T14:07:41.762784"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30185",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 103914,
      "natural_ratio": 0.7928009033203125,
      "max_context_tokens": 131072,
      "context_length": 207746,
      "question_length": 42,
      "prediction": "crow",
      "reference": "A murder of crows.",
      "metrics": {
        "f1": 0.47058823529411764
      },
      "elapsed_time": 4.205853462219238,
      "timestamp": "2026-01-05T14:07:45.969012"
    },
    {
      "sample_id": "squad_57269560dd62a815002e8a5b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 777,
      "natural_ratio": 0.00592803955078125,
      "max_context_tokens": 131072,
      "context_length": 1423,
      "question_length": 91,
      "prediction": "Clement Greenberg",
      "reference": "Clement Greenberg.",
      "metrics": {
        "f1": 0.9523809523809523
      },
      "elapsed_time": 0.0785067081451416,
      "timestamp": "2026-01-05T14:07:46.047686"
    },
    {
      "sample_id": "squad_57318db8497a881900249026",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 386,
      "natural_ratio": 0.0029449462890625,
      "max_context_tokens": 131072,
      "context_length": 667,
      "question_length": 65,
      "prediction": "19th century",
      "reference": "19th",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.0658273696899414,
      "timestamp": "2026-01-05T14:07:46.113652"
    },
    {
      "sample_id": "squad_573326c84776f4190066071e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 385,
      "natural_ratio": 0.00293731689453125,
      "max_context_tokens": 131072,
      "context_length": 652,
      "question_length": 78,
      "prediction": "The initial loss of 600 pounds for the publishing of Principia Mathematica was due to it being a massive work (over 2,000 pages) with a very narrow audience (professional mathematicians). The book was published at a loss of 600 pounds, 300 of which was paid by Cambridge University Press, 200 by the Royal Society of London, and 50 apiece by Whitehead and Russell themselves.",
      "reference": "the three-volume work was so massive (more than 2,000 pages) and its audience so narrow (professional mathematicians)",
      "metrics": {
        "f1": 0.8813559322033898
      },
      "elapsed_time": 1.0791068077087402,
      "timestamp": "2026-01-05T14:07:47.192992"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7288",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93591,
      "natural_ratio": 0.7140426635742188,
      "max_context_tokens": 131072,
      "context_length": 187078,
      "question_length": 64,
      "prediction": "Kate Hannah lies to her students that she had a miscarriage.",
      "reference": "That she is pregnant.",
      "metrics": {
        "f1": 0.7333333333333334
      },
      "elapsed_time": 2.8298606872558594,
      "timestamp": "2026-01-05T14:07:50.023220"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4828",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122379,
      "natural_ratio": 0.9336776733398438,
      "max_context_tokens": 131072,
      "context_length": 244683,
      "question_length": 35,
      "prediction": "The Winkie Prince.",
      "reference": "Musicker",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 4.593498945236206,
      "timestamp": "2026-01-05T14:07:54.617137"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21356",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 60297,
      "natural_ratio": 0.46002960205078125,
      "max_context_tokens": 131072,
      "context_length": 120502,
      "question_length": 52,
      "prediction": "Ash Williams arrived at a cabin that was attacked by an army of the undead, and after he recovered, Lord Arthur's men found him and thought he was a threat.",
      "reference": "He is thought to work for Duke Henry.",
      "metrics": {
        "f1": 0.8095238095238095
      },
      "elapsed_time": 2.7703564167022705,
      "timestamp": "2026-01-05T14:07:57.388181"
    },
    {
      "sample_id": "squad_57276e51708984140094dd5e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 636,
      "natural_ratio": 0.004852294921875,
      "max_context_tokens": 131072,
      "context_length": 1191,
      "question_length": 41,
      "prediction": "784–967",
      "reference": "784–967",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.10807585716247559,
      "timestamp": "2026-01-05T14:07:57.496435"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14611",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 72877,
      "natural_ratio": 0.5560073852539062,
      "max_context_tokens": 131072,
      "context_length": 145668,
      "question_length": 47,
      "prediction": "The Ninth Gate is a story about a rare and mysterious book that is believed to be a guide to summoning the Devil. It is this book that the main character, Dean Corso, is hired to investigate its authenticity. However, as he travels to different locations to compare copies of the book, he and the books themselves become targets of unknown forces.",
      "reference": "it is said to have a way to call the devil, to get invincibility and immortality",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 3.1608314514160156,
      "timestamp": "2026-01-05T14:08:00.657653"
    },
    {
      "sample_id": "squad_57318d40a5e9cc1400cdc052",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 459,
      "natural_ratio": 0.00350189208984375,
      "max_context_tokens": 131072,
      "context_length": 854,
      "question_length": 25,
      "prediction": "science fiction",
      "reference": "futuristic science fiction",
      "metrics": {
        "f1": 0.9
      },
      "elapsed_time": 0.05096292495727539,
      "timestamp": "2026-01-05T14:08:00.708790"
    },
    {
      "sample_id": "squad_56e74a8e00c9c71400d76f5e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 329,
      "natural_ratio": 0.00251007080078125,
      "max_context_tokens": 131072,
      "context_length": 554,
      "question_length": 65,
      "prediction": "19th century",
      "reference": "19th",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.06669187545776367,
      "timestamp": "2026-01-05T14:08:00.775605"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32106",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 110497,
      "natural_ratio": 0.8430252075195312,
      "max_context_tokens": 131072,
      "context_length": 220905,
      "question_length": 50,
      "prediction": "Atlantic Ocean",
      "reference": "The English Channel",
      "metrics": {
        "f1": 0.7999999999999999
      },
      "elapsed_time": 3.8174922466278076,
      "timestamp": "2026-01-05T14:08:04.593420"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31155",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 70909,
      "natural_ratio": 0.5409927368164062,
      "max_context_tokens": 131072,
      "context_length": 141730,
      "question_length": 49,
      "prediction": "Ben's death",
      "reference": "A PHONE CALL FROM HIM.",
      "metrics": {
        "f1": 0.41666666666666663
      },
      "elapsed_time": 3.385474681854248,
      "timestamp": "2026-01-05T14:08:07.979637"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25176",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 88717,
      "natural_ratio": 0.6768569946289062,
      "max_context_tokens": 131072,
      "context_length": 177345,
      "question_length": 49,
      "prediction": "John",
      "reference": "JOHN",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.8303732872009277,
      "timestamp": "2026-01-05T14:08:11.810414"
    },
    {
      "sample_id": "squad_5727f5f53acd2414000df0d9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 608,
      "natural_ratio": 0.004638671875,
      "max_context_tokens": 131072,
      "context_length": 1142,
      "question_length": 35,
      "prediction": "The Northern Renaissance",
      "reference": "Italian Renaissance",
      "metrics": {
        "f1": 0.8571428571428572
      },
      "elapsed_time": 0.06330752372741699,
      "timestamp": "2026-01-05T14:08:11.873898"
    },
    {
      "sample_id": "squad_5726b22bf1498d1400e8e7bc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 619,
      "natural_ratio": 0.00472259521484375,
      "max_context_tokens": 131072,
      "context_length": 1146,
      "question_length": 52,
      "prediction": "English and a creole language known as Norfuk",
      "reference": "English and a creole language known as Norfuk",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.1446826457977295,
      "timestamp": "2026-01-05T14:08:12.018735"
    },
    {
      "sample_id": "squad_5727bf482ca10214002d9557",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 486,
      "natural_ratio": 0.0037078857421875,
      "max_context_tokens": 131072,
      "context_length": 887,
      "question_length": 45,
      "prediction": "Rous (equestrian)",
      "reference": "Rous",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.09777188301086426,
      "timestamp": "2026-01-05T14:08:12.116643"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18867",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 90002,
      "natural_ratio": 0.6866607666015625,
      "max_context_tokens": 131072,
      "context_length": 179911,
      "question_length": 53,
      "prediction": "计算出花园的尺寸。",
      "reference": "Pick a train that will pass more than hers.",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.3020830154418945,
      "timestamp": "2026-01-05T14:08:15.419072"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_3507",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 100633,
      "natural_ratio": 0.7677688598632812,
      "max_context_tokens": 131072,
      "context_length": 201182,
      "question_length": 45,
      "prediction": "Delta Zeta House",
      "reference": "A sorority house",
      "metrics": {
        "f1": 0.7272727272727273
      },
      "elapsed_time": 4.123924016952515,
      "timestamp": "2026-01-05T14:08:19.543376"
    },
    {
      "sample_id": "squad_5730ca8fb7151e1900c01537",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 1003,
      "natural_ratio": 0.00765228271484375,
      "max_context_tokens": 131072,
      "context_length": 1870,
      "question_length": 96,
      "prediction": "Justinian I",
      "reference": "Emperor Justinian I,",
      "metrics": {
        "f1": 0.7272727272727273
      },
      "elapsed_time": 0.07117986679077148,
      "timestamp": "2026-01-05T14:08:19.614740"
    },
    {
      "sample_id": "squad_56dfc016231d4119001abd8e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122,
      "natural_ratio": 0.0009307861328125,
      "max_context_tokens": 131072,
      "context_length": 166,
      "question_length": 38,
      "prediction": "Internet hosting services",
      "reference": "Internet hosting services",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.051115989685058594,
      "timestamp": "2026-01-05T14:08:19.665984"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14476",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 65597,
      "natural_ratio": 0.5004653930664062,
      "max_context_tokens": 131072,
      "context_length": 131128,
      "question_length": 26,
      "prediction": "bookshop owner",
      "reference": "Bookstore owner",
      "metrics": {
        "f1": 0.8571428571428572
      },
      "elapsed_time": 2.3607165813446045,
      "timestamp": "2026-01-05T14:08:22.027048"
    },
    {
      "sample_id": "squad_572ecda5cb0c0d14000f159a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 308,
      "natural_ratio": 0.002349853515625,
      "max_context_tokens": 131072,
      "context_length": 526,
      "question_length": 51,
      "prediction": "Pacific Northwest",
      "reference": "Pacific Northwest",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04931378364562988,
      "timestamp": "2026-01-05T14:08:22.076554"
    },
    {
      "sample_id": "squad_57288f642ca10214002da472",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 592,
      "natural_ratio": 0.0045166015625,
      "max_context_tokens": 131072,
      "context_length": 1062,
      "question_length": 83,
      "prediction": "United States Department of Education",
      "reference": "United States Department of Education",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07962846755981445,
      "timestamp": "2026-01-05T14:08:22.156347"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_6517",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 123731,
      "natural_ratio": 0.9439926147460938,
      "max_context_tokens": 131072,
      "context_length": 247366,
      "question_length": 57,
      "prediction": "They are released by FBI agent Neeley.",
      "reference": "They are released after the Feds step in.",
      "metrics": {
        "f1": 0.9032258064516129
      },
      "elapsed_time": 4.534542560577393,
      "timestamp": "2026-01-05T14:08:26.691214"
    },
    {
      "sample_id": "squad_5706c2a60eeca41400aa0e0e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 230,
      "natural_ratio": 0.0017547607421875,
      "max_context_tokens": 131072,
      "context_length": 360,
      "question_length": 61,
      "prediction": "Soviet Union",
      "reference": "Soviet Union",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06795716285705566,
      "timestamp": "2026-01-05T14:08:26.759703"
    },
    {
      "sample_id": "squad_56e7b0b737bdd419002c4366",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 203,
      "natural_ratio": 0.00154876708984375,
      "max_context_tokens": 131072,
      "context_length": 321,
      "question_length": 45,
      "prediction": "Jiangsu Province Kun Opera (Kunqu)",
      "reference": "Kunqu",
      "metrics": {
        "f1": 0.25641025641025644
      },
      "elapsed_time": 0.13344788551330566,
      "timestamp": "2026-01-05T14:08:26.893280"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26153",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 80921,
      "natural_ratio": 0.6173782348632812,
      "max_context_tokens": 131072,
      "context_length": 161754,
      "question_length": 48,
      "prediction": "Thorn triangle",
      "reference": "Thorn.",
      "metrics": {
        "f1": 0.5882352941176471
      },
      "elapsed_time": 3.3722610473632812,
      "timestamp": "2026-01-05T14:08:30.265930"
    },
    {
      "sample_id": "squad_57324760e17f3d14004227c7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 538,
      "natural_ratio": 0.0041046142578125,
      "max_context_tokens": 131072,
      "context_length": 973,
      "question_length": 63,
      "prediction": "1928",
      "reference": "1928",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07201528549194336,
      "timestamp": "2026-01-05T14:08:30.338113"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31116",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 49039,
      "natural_ratio": 0.37413787841796875,
      "max_context_tokens": 131072,
      "context_length": 98001,
      "question_length": 37,
      "prediction": "his daughter was stolen by a wizard",
      "reference": "Their wounded honor and the unfixabe weapon.",
      "metrics": {
        "f1": 0.7428571428571428
      },
      "elapsed_time": 1.2994983196258545,
      "timestamp": "2026-01-05T14:08:31.637943"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1776",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115597,
      "natural_ratio": 0.8819351196289062,
      "max_context_tokens": 131072,
      "context_length": 231136,
      "question_length": 18,
      "prediction": "Bourne和Manheim联手杀死的。",
      "reference": "Manheim, under the orders of Abbtott.",
      "metrics": {
        "f1": 0.5882352941176471
      },
      "elapsed_time": 3.097841501235962,
      "timestamp": "2026-01-05T14:08:34.736130"
    },
    {
      "sample_id": "squad_57326547e99e3014001e677a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 491,
      "natural_ratio": 0.00374603271484375,
      "max_context_tokens": 131072,
      "context_length": 901,
      "question_length": 42,
      "prediction": "Over the issuance of bonds to finance the project",
      "reference": "issuance of bonds to finance the project",
      "metrics": {
        "f1": 0.9333333333333333
      },
      "elapsed_time": 0.1223289966583252,
      "timestamp": "2026-01-05T14:08:34.858987"
    },
    {
      "sample_id": "squad_57302ef4b2c2fd1400568a15",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 314,
      "natural_ratio": 0.0023956298828125,
      "max_context_tokens": 131072,
      "context_length": 522,
      "question_length": 67,
      "prediction": "A circularly polarized antenna",
      "reference": "circularly polarized",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.08533024787902832,
      "timestamp": "2026-01-05T14:08:34.944456"
    },
    {
      "sample_id": "squad_5729450a6aef051400154c31",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 683,
      "natural_ratio": 0.00521087646484375,
      "max_context_tokens": 131072,
      "context_length": 1262,
      "question_length": 64,
      "prediction": "Basilica of Saint Denis",
      "reference": "Sainte-Chapelle",
      "metrics": {
        "f1": 0.6666666666666667
      },
      "elapsed_time": 0.0899190902709961,
      "timestamp": "2026-01-05T14:08:35.034509"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31158",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 92671,
      "natural_ratio": 0.7070236206054688,
      "max_context_tokens": 131072,
      "context_length": 185227,
      "question_length": 76,
      "prediction": "Hoth",
      "reference": "The ice planet Hoth",
      "metrics": {
        "f1": 0.42857142857142855
      },
      "elapsed_time": 4.0747458934783936,
      "timestamp": "2026-01-05T14:08:39.109629"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32667",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 54589,
      "natural_ratio": 0.41648101806640625,
      "max_context_tokens": 131072,
      "context_length": 109098,
      "question_length": 41,
      "prediction": "seven years",
      "reference": "21 months",
      "metrics": {
        "f1": 0.35294117647058826
      },
      "elapsed_time": 1.501037836074829,
      "timestamp": "2026-01-05T14:08:40.611019"
    },
    {
      "sample_id": "squad_570d6cecb3d812140066d8e1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 638,
      "natural_ratio": 0.0048675537109375,
      "max_context_tokens": 131072,
      "context_length": 1142,
      "question_length": 94,
      "prediction": "many kilometers",
      "reference": "many kilometers",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05411529541015625,
      "timestamp": "2026-01-05T14:08:40.665315"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13056",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 83977,
      "natural_ratio": 0.6406936645507812,
      "max_context_tokens": 131072,
      "context_length": 167865,
      "question_length": 49,
      "prediction": "David becomes a wolf.",
      "reference": "A werewolf.",
      "metrics": {
        "f1": 0.6666666666666667
      },
      "elapsed_time": 2.3866071701049805,
      "timestamp": "2026-01-05T14:08:43.052269"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2040",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 64844,
      "natural_ratio": 0.494720458984375,
      "max_context_tokens": 131072,
      "context_length": 129587,
      "question_length": 62,
      "prediction": "serpents",
      "reference": "Snakes",
      "metrics": {
        "f1": 0.5454545454545454
      },
      "elapsed_time": 1.8878443241119385,
      "timestamp": "2026-01-05T14:08:44.940477"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2259",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 65049,
      "natural_ratio": 0.49628448486328125,
      "max_context_tokens": 131072,
      "context_length": 129997,
      "question_length": 62,
      "prediction": "Randy",
      "reference": "Randy",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 2.5336177349090576,
      "timestamp": "2026-01-05T14:08:47.474424"
    },
    {
      "sample_id": "squad_56df50e896943c1400a5d383",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 160,
      "natural_ratio": 0.001220703125,
      "max_context_tokens": 131072,
      "context_length": 231,
      "question_length": 49,
      "prediction": "Nigeria",
      "reference": "Nigeria",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0535740852355957,
      "timestamp": "2026-01-05T14:08:47.528270"
    },
    {
      "sample_id": "squad_57274c04708984140094dbd5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 378,
      "natural_ratio": 0.0028839111328125,
      "max_context_tokens": 131072,
      "context_length": 658,
      "question_length": 59,
      "prediction": "相同",
      "reference": "genetically identical to the parent",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.039446115493774414,
      "timestamp": "2026-01-05T14:08:47.567969"
    },
    {
      "sample_id": "squad_570b66566b8089140040f931",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 498,
      "natural_ratio": 0.0037994384765625,
      "max_context_tokens": 131072,
      "context_length": 909,
      "question_length": 48,
      "prediction": "Los Angeles",
      "reference": "Los Angeles",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04721379280090332,
      "timestamp": "2026-01-05T14:08:47.615344"
    },
    {
      "sample_id": "squad_570e5b3b0dc6ce1900204f89",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 381,
      "natural_ratio": 0.00290679931640625,
      "max_context_tokens": 131072,
      "context_length": 670,
      "question_length": 52,
      "prediction": "a 'quarter acre home and garden'",
      "reference": "quarter acre home and garden",
      "metrics": {
        "f1": 0.9333333333333333
      },
      "elapsed_time": 0.10634303092956543,
      "timestamp": "2026-01-05T14:08:47.721819"
    },
    {
      "sample_id": "squad_570b6da3ec8fbc190045ba17",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 1154,
      "natural_ratio": 0.0088043212890625,
      "max_context_tokens": 131072,
      "context_length": 2215,
      "question_length": 53,
      "prediction": "1975",
      "reference": "1975",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0774381160736084,
      "timestamp": "2026-01-05T14:08:47.799388"
    },
    {
      "sample_id": "squad_57284ab33acd2414000df8b5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 339,
      "natural_ratio": 0.00258636474609375,
      "max_context_tokens": 131072,
      "context_length": 600,
      "question_length": 38,
      "prediction": "Mahajanapadas",
      "reference": "Mahajanapadas",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06528878211975098,
      "timestamp": "2026-01-05T14:08:47.864812"
    },
    {
      "sample_id": "squad_570d3c29b3d812140066d5b0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 916,
      "natural_ratio": 0.006988525390625,
      "max_context_tokens": 131072,
      "context_length": 1728,
      "question_length": 64,
      "prediction": "8 GB",
      "reference": "8 GB",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05217599868774414,
      "timestamp": "2026-01-05T14:08:47.917138"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4074",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105521,
      "natural_ratio": 0.8050613403320312,
      "max_context_tokens": 131072,
      "context_length": 210934,
      "question_length": 69,
      "prediction": "bank",
      "reference": "Buffalo Savings Bank",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 3.279095411300659,
      "timestamp": "2026-01-05T14:08:51.196569"
    },
    {
      "sample_id": "squad_5727fc492ca10214002d9ab8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 528,
      "natural_ratio": 0.0040283203125,
      "max_context_tokens": 131072,
      "context_length": 942,
      "question_length": 75,
      "prediction": "the British government",
      "reference": "British government",
      "metrics": {
        "f1": 0.9
      },
      "elapsed_time": 0.06157422065734863,
      "timestamp": "2026-01-05T14:08:51.258316"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4572",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 70686,
      "natural_ratio": 0.5392913818359375,
      "max_context_tokens": 131072,
      "context_length": 141309,
      "question_length": 23,
      "prediction": "Eyolf's leg is shrunken due to a childhood injury.",
      "reference": "He is paralyzed in one of his legs.",
      "metrics": {
        "f1": 0.7894736842105262
      },
      "elapsed_time": 2.470588445663452,
      "timestamp": "2026-01-05T14:08:53.729242"
    },
    {
      "sample_id": "squad_5727d8773acd2414000dedf6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 413,
      "natural_ratio": 0.00315093994140625,
      "max_context_tokens": 131072,
      "context_length": 726,
      "question_length": 60,
      "prediction": "Aristotle",
      "reference": "Aristotle",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05984044075012207,
      "timestamp": "2026-01-05T14:08:53.789258"
    },
    {
      "sample_id": "squad_570d9d86df2f5219002ed053",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 318,
      "natural_ratio": 0.0024261474609375,
      "max_context_tokens": 131072,
      "context_length": 533,
      "question_length": 64,
      "prediction": "Modular Handgun System",
      "reference": "Modular Handgun System",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07584190368652344,
      "timestamp": "2026-01-05T14:08:53.865259"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22433",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 76049,
      "natural_ratio": 0.5802078247070312,
      "max_context_tokens": 131072,
      "context_length": 152011,
      "question_length": 48,
      "prediction": "Maria's father believed that Billy had become a suitable match for Maria after agreeing to break off the match with her and showing devotion to her.",
      "reference": "money",
      "metrics": {
        "f1": 0.3448275862068966
      },
      "elapsed_time": 2.6804728507995605,
      "timestamp": "2026-01-05T14:08:56.546085"
    },
    {
      "sample_id": "squad_56deeb2a3277331400b4d815",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 183,
      "natural_ratio": 0.00139617919921875,
      "max_context_tokens": 131072,
      "context_length": 298,
      "question_length": 28,
      "prediction": "2nd Canadian Division, the 3rd Canadian Division, the 4th Canadian Division and the 5th Canadian Division",
      "reference": "the 2nd Canadian Division, the 3rd Canadian Division, the 4th Canadian Division and the 5th Canadian Division",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.2868790626525879,
      "timestamp": "2026-01-05T14:08:56.833167"
    },
    {
      "sample_id": "squad_573012fab2c2fd140056880c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 489,
      "natural_ratio": 0.00373077392578125,
      "max_context_tokens": 131072,
      "context_length": 875,
      "question_length": 63,
      "prediction": "1,307,402",
      "reference": "1,307,402",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.1267683506011963,
      "timestamp": "2026-01-05T14:08:56.960095"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24558",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 95879,
      "natural_ratio": 0.7314987182617188,
      "max_context_tokens": 131072,
      "context_length": 191657,
      "question_length": 61,
      "prediction": "The Penguin is lifted into the air by a group of bats that emerge from Batman's Batmobile.",
      "reference": "He dies.",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 2.867304563522339,
      "timestamp": "2026-01-05T14:08:59.827805"
    },
    {
      "sample_id": "squad_57266a195951b619008f71fa",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 309,
      "natural_ratio": 0.00235748291015625,
      "max_context_tokens": 131072,
      "context_length": 529,
      "question_length": 49,
      "prediction": "19th century",
      "reference": "the 19th century",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0703437328338623,
      "timestamp": "2026-01-05T14:08:59.898313"
    },
    {
      "sample_id": "squad_572ebcb4c246551400ce45bc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 393,
      "natural_ratio": 0.00299835205078125,
      "max_context_tokens": 131072,
      "context_length": 696,
      "question_length": 51,
      "prediction": "A number of electrical properties",
      "reference": "A number of electrical properties",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07598161697387695,
      "timestamp": "2026-01-05T14:08:59.974450"
    },
    {
      "sample_id": "squad_572833674b864d1900164707",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 372,
      "natural_ratio": 0.002838134765625,
      "max_context_tokens": 131072,
      "context_length": 659,
      "question_length": 45,
      "prediction": "The earliest good evidence for oligochaetes occurs in the Tertiary period, which began 65 million years ago.",
      "reference": "472 to 461 million years ago",
      "metrics": {
        "f1": 0.6363636363636364
      },
      "elapsed_time": 0.26935291290283203,
      "timestamp": "2026-01-05T14:09:00.243979"
    },
    {
      "sample_id": "squad_56bfdd3fa10cfb140055134f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 264,
      "natural_ratio": 0.00201416015625,
      "max_context_tokens": 131072,
      "context_length": 453,
      "question_length": 36,
      "prediction": "Starpower: Beyoncé",
      "reference": "Starpower: Beyoncé",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0847463607788086,
      "timestamp": "2026-01-05T14:09:00.328871"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5740",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 46071,
      "natural_ratio": 0.35149383544921875,
      "max_context_tokens": 131072,
      "context_length": 92081,
      "question_length": 22,
      "prediction": "Ft. Dupree, South Dakota",
      "reference": "Fort Dupree, SD",
      "metrics": {
        "f1": 0.846153846153846
      },
      "elapsed_time": 1.5280654430389404,
      "timestamp": "2026-01-05T14:09:01.857262"
    },
    {
      "sample_id": "squad_5730e504f6cb411900e24522",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 496,
      "natural_ratio": 0.0037841796875,
      "max_context_tokens": 131072,
      "context_length": 880,
      "question_length": 72,
      "prediction": "63%",
      "reference": "46%",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.0645451545715332,
      "timestamp": "2026-01-05T14:09:01.922059"
    },
    {
      "sample_id": "squad_56de7aaa4396321400ee2947",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 167,
      "natural_ratio": 0.00127410888671875,
      "max_context_tokens": 131072,
      "context_length": 247,
      "question_length": 48,
      "prediction": "Birmingham",
      "reference": "Birmingham",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.046814680099487305,
      "timestamp": "2026-01-05T14:09:01.969098"
    },
    {
      "sample_id": "squad_5727be6f3acd2414000deb3c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 365,
      "natural_ratio": 0.00278472900390625,
      "max_context_tokens": 131072,
      "context_length": 633,
      "question_length": 58,
      "prediction": "区 qū",
      "reference": "qū",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.05718827247619629,
      "timestamp": "2026-01-05T14:09:02.026470"
    },
    {
      "sample_id": "squad_57103069a58dae1900cd694a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 638,
      "natural_ratio": 0.0048675537109375,
      "max_context_tokens": 131072,
      "context_length": 1192,
      "question_length": 45,
      "prediction": "Research shows that the FBO effect is of immunological origin, caused by a maternal immune reaction against a substance crucial to male fetal development during pregnancy, which becomes increasingly likely after every male gestation.",
      "reference": "this may be of immunological origin, caused by a maternal immune reaction against a substance crucial to male fetal development during pregnancy,",
      "metrics": {
        "f1": 0.9361702127659575
      },
      "elapsed_time": 0.432628870010376,
      "timestamp": "2026-01-05T14:09:02.459336"
    },
    {
      "sample_id": "squad_56dcf81e66d3e219004dab6e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 479,
      "natural_ratio": 0.00365447998046875,
      "max_context_tokens": 131072,
      "context_length": 880,
      "question_length": 39,
      "prediction": "January 2002",
      "reference": "January 2002",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08886837959289551,
      "timestamp": "2026-01-05T14:09:02.548425"
    },
    {
      "sample_id": "squad_5728c9a54b864d1900164e29",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 325,
      "natural_ratio": 0.00247955322265625,
      "max_context_tokens": 131072,
      "context_length": 541,
      "question_length": 70,
      "prediction": "mid-7th to early 8th centuries",
      "reference": "mid-7th to early 8th centuries",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.12706923484802246,
      "timestamp": "2026-01-05T14:09:02.675688"
    },
    {
      "sample_id": "squad_5728f2a34b864d1900165102",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 306,
      "natural_ratio": 0.0023345947265625,
      "max_context_tokens": 131072,
      "context_length": 514,
      "question_length": 59,
      "prediction": "Samurai",
      "reference": "Samurai",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0442965030670166,
      "timestamp": "2026-01-05T14:09:02.720142"
    },
    {
      "sample_id": "squad_5726d94d708984140094d37b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 510,
      "natural_ratio": 0.0038909912109375,
      "max_context_tokens": 131072,
      "context_length": 951,
      "question_length": 30,
      "prediction": "章",
      "reference": "zhāng",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.03943824768066406,
      "timestamp": "2026-01-05T14:09:02.759736"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25049",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 64612,
      "natural_ratio": 0.492950439453125,
      "max_context_tokens": 131072,
      "context_length": 129126,
      "question_length": 59,
      "prediction": "Queen Phillip",
      "reference": "Queen Philippa",
      "metrics": {
        "f1": 0.9473684210526316
      },
      "elapsed_time": 1.9259908199310303,
      "timestamp": "2026-01-05T14:09:04.686116"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17779",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 86064,
      "natural_ratio": 0.6566162109375,
      "max_context_tokens": 131072,
      "context_length": 172028,
      "question_length": 60,
      "prediction": "Mantan",
      "reference": "Manray",
      "metrics": {
        "f1": 0.6666666666666665
      },
      "elapsed_time": 4.346870183944702,
      "timestamp": "2026-01-05T14:09:09.033353"
    },
    {
      "sample_id": "squad_5727e8484b864d1900163fc9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 540,
      "natural_ratio": 0.004119873046875,
      "max_context_tokens": 131072,
      "context_length": 993,
      "question_length": 47,
      "prediction": "Middle Ages",
      "reference": "Middle Ages",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0520329475402832,
      "timestamp": "2026-01-05T14:09:09.085550"
    },
    {
      "sample_id": "squad_5730fd7e497a881900248acd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 674,
      "natural_ratio": 0.0051422119140625,
      "max_context_tokens": 131072,
      "context_length": 1276,
      "question_length": 32,
      "prediction": "the top emerging market economy in Africa",
      "reference": "top emerging market economy in Africa",
      "metrics": {
        "f1": 0.9487179487179488
      },
      "elapsed_time": 0.10145854949951172,
      "timestamp": "2026-01-05T14:09:09.187162"
    },
    {
      "sample_id": "squad_56cf6141aab44d1400b89195",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 625,
      "natural_ratio": 0.00476837158203125,
      "max_context_tokens": 131072,
      "context_length": 1143,
      "question_length": 67,
      "prediction": "Foxy Brown",
      "reference": "Foxy Brown",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05974459648132324,
      "timestamp": "2026-01-05T14:09:09.247042"
    },
    {
      "sample_id": "squad_5726e2fcf1498d1400e8eea3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 400,
      "natural_ratio": 0.0030517578125,
      "max_context_tokens": 131072,
      "context_length": 705,
      "question_length": 55,
      "prediction": "The CPC",
      "reference": "CPC",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.04508471488952637,
      "timestamp": "2026-01-05T14:09:09.292257"
    },
    {
      "sample_id": "squad_5726252189a1e219009ac35c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 533,
      "natural_ratio": 0.00406646728515625,
      "max_context_tokens": 131072,
      "context_length": 980,
      "question_length": 46,
      "prediction": "The Crown Estate",
      "reference": "Crown Estate",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 0.05552315711975098,
      "timestamp": "2026-01-05T14:09:09.347907"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29014",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 77203,
      "natural_ratio": 0.5890121459960938,
      "max_context_tokens": 131072,
      "context_length": 154337,
      "question_length": 30,
      "prediction": "Palamon",
      "reference": "Arcite was Emilia's bodyguard",
      "metrics": {
        "f1": 0.3333333333333333
      },
      "elapsed_time": 2.88690185546875,
      "timestamp": "2026-01-05T14:09:12.235159"
    },
    {
      "sample_id": "squad_572958236aef051400154d25",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 436,
      "natural_ratio": 0.003326416015625,
      "max_context_tokens": 131072,
      "context_length": 764,
      "question_length": 69,
      "prediction": "the software development process",
      "reference": "software development process",
      "metrics": {
        "f1": 0.9333333333333333
      },
      "elapsed_time": 0.07062101364135742,
      "timestamp": "2026-01-05T14:09:12.305953"
    },
    {
      "sample_id": "squad_56cff665234ae51400d9c189",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 212,
      "natural_ratio": 0.001617431640625,
      "max_context_tokens": 131072,
      "context_length": 346,
      "question_length": 38,
      "prediction": "54万",
      "reference": "54 million",
      "metrics": {
        "f1": 0.36363636363636365
      },
      "elapsed_time": 0.05188417434692383,
      "timestamp": "2026-01-05T14:09:12.357968"
    },
    {
      "sample_id": "squad_56f78825aef2371900625b98",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 414,
      "natural_ratio": 0.0031585693359375,
      "max_context_tokens": 131072,
      "context_length": 727,
      "question_length": 62,
      "prediction": "Federated States of Micronesia",
      "reference": "the Federated States of Micronesia",
      "metrics": {
        "f1": 0.962962962962963
      },
      "elapsed_time": 0.10658907890319824,
      "timestamp": "2026-01-05T14:09:12.464691"
    },
    {
      "sample_id": "squad_57277da6dd62a815002e9e88",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 343,
      "natural_ratio": 0.00261688232421875,
      "max_context_tokens": 131072,
      "context_length": 535,
      "question_length": 112,
      "prediction": "Russian and Austrian forces invaded Switzerland during the Helvetic Republic.",
      "reference": "Russian and Austrian",
      "metrics": {
        "f1": 0.4615384615384615
      },
      "elapsed_time": 0.15676069259643555,
      "timestamp": "2026-01-05T14:09:12.621588"
    },
    {
      "sample_id": "squad_5706046752bb8914006897aa",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 259,
      "natural_ratio": 0.00197601318359375,
      "max_context_tokens": 131072,
      "context_length": 430,
      "question_length": 48,
      "prediction": "usually doesn't follow a straight line",
      "reference": "an hooked or arched line",
      "metrics": {
        "f1": 0.6666666666666667
      },
      "elapsed_time": 0.0919487476348877,
      "timestamp": "2026-01-05T14:09:12.713681"
    },
    {
      "sample_id": "squad_57267de2dd62a815002e8712",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 376,
      "natural_ratio": 0.00286865234375,
      "max_context_tokens": 131072,
      "context_length": 657,
      "question_length": 56,
      "prediction": "Orestis Laskos",
      "reference": "Orestis Laskos",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08623480796813965,
      "timestamp": "2026-01-05T14:09:12.800051"
    },
    {
      "sample_id": "squad_5726f8735951b619008f83a5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 597,
      "natural_ratio": 0.00455474853515625,
      "max_context_tokens": 131072,
      "context_length": 1098,
      "question_length": 57,
      "prediction": "540",
      "reference": "540",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0552823543548584,
      "timestamp": "2026-01-05T14:09:12.855468"
    },
    {
      "sample_id": "squad_57277ff9dd62a815002e9ed7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 382,
      "natural_ratio": 0.0029144287109375,
      "max_context_tokens": 131072,
      "context_length": 673,
      "question_length": 52,
      "prediction": "China",
      "reference": "China",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03397941589355469,
      "timestamp": "2026-01-05T14:09:12.889574"
    },
    {
      "sample_id": "squad_56e720f137bdd419002c3d78",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 273,
      "natural_ratio": 0.00208282470703125,
      "max_context_tokens": 131072,
      "context_length": 412,
      "question_length": 94,
      "prediction": "Equal",
      "reference": "equal",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.030583620071411133,
      "timestamp": "2026-01-05T14:09:12.920280"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13879",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 101480,
      "natural_ratio": 0.77423095703125,
      "max_context_tokens": 131072,
      "context_length": 202883,
      "question_length": 37,
      "prediction": "Beck uses his blood bag, which contains his own blood, to lure the beast.",
      "reference": "Beck uses a pint of his own blood.",
      "metrics": {
        "f1": 0.8717948717948718
      },
      "elapsed_time": 3.1376354694366455,
      "timestamp": "2026-01-05T14:09:16.058290"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24442",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 48274,
      "natural_ratio": 0.3683013916015625,
      "max_context_tokens": 131072,
      "context_length": 96447,
      "question_length": 61,
      "prediction": "Nello takes on the responsibility of selling the milk and bringing back payment to his grandfather each day.",
      "reference": "Selling milk",
      "metrics": {
        "f1": 0.6
      },
      "elapsed_time": 1.434788703918457,
      "timestamp": "2026-01-05T14:09:17.493436"
    },
    {
      "sample_id": "squad_57266e09dd62a815002e848c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 371,
      "natural_ratio": 0.00283050537109375,
      "max_context_tokens": 131072,
      "context_length": 673,
      "question_length": 29,
      "prediction": "Último Dragón",
      "reference": "Último Dragón",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07944703102111816,
      "timestamp": "2026-01-05T14:09:17.573049"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7020",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 25808,
      "natural_ratio": 0.1968994140625,
      "max_context_tokens": 131072,
      "context_length": 51524,
      "question_length": 53,
      "prediction": "lower regions",
      "reference": "Hades.",
      "metrics": {
        "f1": 0.25
      },
      "elapsed_time": 0.6022555828094482,
      "timestamp": "2026-01-05T14:09:18.175600"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25181",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 87986,
      "natural_ratio": 0.6712799072265625,
      "max_context_tokens": 131072,
      "context_length": 175891,
      "question_length": 42,
      "prediction": "Zoe is a escort/prostitute.",
      "reference": "She is a prostitute",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 2.517611026763916,
      "timestamp": "2026-01-05T14:09:20.693614"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8650",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93910,
      "natural_ratio": 0.7164764404296875,
      "max_context_tokens": 131072,
      "context_length": 187723,
      "question_length": 58,
      "prediction": "six months",
      "reference": "2 Years",
      "metrics": {
        "f1": 0.25
      },
      "elapsed_time": 3.011934995651245,
      "timestamp": "2026-01-05T14:09:23.705924"
    },
    {
      "sample_id": "squad_56fa317bf34c681400b0c002",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 330,
      "natural_ratio": 0.0025177001953125,
      "max_context_tokens": 131072,
      "context_length": 551,
      "question_length": 70,
      "prediction": "open stands",
      "reference": "open stands",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0571293830871582,
      "timestamp": "2026-01-05T14:09:23.763257"
    },
    {
      "sample_id": "squad_56cfb40b234ae51400d9bea5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 375,
      "natural_ratio": 0.00286102294921875,
      "max_context_tokens": 131072,
      "context_length": 635,
      "question_length": 76,
      "prediction": "learning investments",
      "reference": "learning investments",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0458066463470459,
      "timestamp": "2026-01-05T14:09:23.809228"
    },
    {
      "sample_id": "squad_5733b4cf4776f419006610cb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 646,
      "natural_ratio": 0.0049285888671875,
      "max_context_tokens": 131072,
      "context_length": 1200,
      "question_length": 52,
      "prediction": "penicillins and cephalosporins",
      "reference": "penicillins and cephalosporins",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.13233065605163574,
      "timestamp": "2026-01-05T14:09:23.941714"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16559",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 108265,
      "natural_ratio": 0.8259963989257812,
      "max_context_tokens": 131072,
      "context_length": 216420,
      "question_length": 70,
      "prediction": "Dr. Vladimir Nechorvich was bringing Chimera, a dangerous virus, to the IMF.",
      "reference": "He was going to deliver a new biochemical weapon Chimera and a cure for it, Bellerophon.",
      "metrics": {
        "f1": 0.9777777777777777
      },
      "elapsed_time": 3.554657220840454,
      "timestamp": "2026-01-05T14:09:27.496757"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19326",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115653,
      "natural_ratio": 0.8823623657226562,
      "max_context_tokens": 131072,
      "context_length": 231219,
      "question_length": 47,
      "prediction": "得知约翰和辛西娅的事后，安对 Graham 大发雷霆。",
      "reference": "When Graham tells them he feels normal compared to them in his sex life",
      "metrics": {
        "f1": 0.2727272727272727
      },
      "elapsed_time": 3.1265313625335693,
      "timestamp": "2026-01-05T14:09:30.624011"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14044",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93039,
      "natural_ratio": 0.7098312377929688,
      "max_context_tokens": 131072,
      "context_length": 185986,
      "question_length": 53,
      "prediction": "Crim Tartary",
      "reference": "CRIM TARTARY",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.2102668285369873,
      "timestamp": "2026-01-05T14:09:33.834995"
    },
    {
      "sample_id": "squad_572833d43acd2414000df6de",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 316,
      "natural_ratio": 0.002410888671875,
      "max_context_tokens": 131072,
      "context_length": 564,
      "question_length": 29,
      "prediction": "The Federal War ended in 1863 with the signing of the Treaty of Coche.",
      "reference": "with the signing of the Treaty of Coche by both the centralist government of the time and the Federal Forces",
      "metrics": {
        "f1": 0.8095238095238095
      },
      "elapsed_time": 0.23531174659729004,
      "timestamp": "2026-01-05T14:09:34.070551"
    },
    {
      "sample_id": "squad_5727c9722ca10214002d9634",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 450,
      "natural_ratio": 0.0034332275390625,
      "max_context_tokens": 131072,
      "context_length": 796,
      "question_length": 65,
      "prediction": "Thunderbolt",
      "reference": "Thunderbolt",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.047846078872680664,
      "timestamp": "2026-01-05T14:09:34.118548"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16180",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96838,
      "natural_ratio": 0.7388153076171875,
      "max_context_tokens": 131072,
      "context_length": 193565,
      "question_length": 71,
      "prediction": "Their opposing goals and the political implications of their personal attraction make the relationship difficult.",
      "reference": "She has queen duites in her country while the hillemen engange in robbery of her domain",
      "metrics": {
        "f1": 0.8292682926829269
      },
      "elapsed_time": 3.3610174655914307,
      "timestamp": "2026-01-05T14:09:37.479984"
    },
    {
      "sample_id": "squad_5728f1a1af94a219006a9e1d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 409,
      "natural_ratio": 0.00312042236328125,
      "max_context_tokens": 131072,
      "context_length": 738,
      "question_length": 41,
      "prediction": "From 1854",
      "reference": "1854",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.09524059295654297,
      "timestamp": "2026-01-05T14:09:37.575477"
    },
    {
      "sample_id": "squad_57284911ff5b5019007da0e9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 502,
      "natural_ratio": 0.0038299560546875,
      "max_context_tokens": 131072,
      "context_length": 918,
      "question_length": 46,
      "prediction": "Jainism and Buddhism",
      "reference": "Jainism and Buddhism",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08445310592651367,
      "timestamp": "2026-01-05T14:09:37.660155"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1756",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115638,
      "natural_ratio": 0.8822479248046875,
      "max_context_tokens": 131072,
      "context_length": 231136,
      "question_length": 101,
      "prediction": "Professor",
      "reference": "Eamon",
      "metrics": {
        "f1": 0.3636363636363636
      },
      "elapsed_time": 0.17394661903381348,
      "timestamp": "2026-01-05T14:09:37.834442"
    },
    {
      "sample_id": "squad_573227fbe17f3d14004226cb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 534,
      "natural_ratio": 0.0040740966796875,
      "max_context_tokens": 131072,
      "context_length": 949,
      "question_length": 80,
      "prediction": "1994",
      "reference": "1994",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07519078254699707,
      "timestamp": "2026-01-05T14:09:37.910239"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22349",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 92148,
      "natural_ratio": 0.703033447265625,
      "max_context_tokens": 131072,
      "context_length": 184176,
      "question_length": 81,
      "prediction": "The car",
      "reference": "His is missing his ear",
      "metrics": {
        "f1": 0.588235294117647
      },
      "elapsed_time": 4.177460193634033,
      "timestamp": "2026-01-05T14:09:42.088058"
    },
    {
      "sample_id": "squad_5730ea7df6cb411900e2454e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 388,
      "natural_ratio": 0.002960205078125,
      "max_context_tokens": 131072,
      "context_length": 701,
      "question_length": 36,
      "prediction": "The Princess Margaret Hospital",
      "reference": "Princess Margaret Hospital",
      "metrics": {
        "f1": 0.9285714285714286
      },
      "elapsed_time": 0.07127261161804199,
      "timestamp": "2026-01-05T14:09:42.159510"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17081",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 112447,
      "natural_ratio": 0.8579025268554688,
      "max_context_tokens": 131072,
      "context_length": 224804,
      "question_length": 51,
      "prediction": "Familars",
      "reference": "Familiars",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.6198365688323975,
      "timestamp": "2026-01-05T14:09:45.779693"
    },
    {
      "sample_id": "squad_572fe8d0b2c2fd14005685b9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 410,
      "natural_ratio": 0.0031280517578125,
      "max_context_tokens": 131072,
      "context_length": 725,
      "question_length": 56,
      "prediction": "degrading a variety of organic compounds",
      "reference": "ability of bacteria to degrade a variety of organic compounds",
      "metrics": {
        "f1": 0.9473684210526316
      },
      "elapsed_time": 0.1016378402709961,
      "timestamp": "2026-01-05T14:09:45.881885"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17996",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 87012,
      "natural_ratio": 0.663848876953125,
      "max_context_tokens": 131072,
      "context_length": 173890,
      "question_length": 95,
      "prediction": "SARAHTakes a large, sharp axe handle and turns it into a large, flaming torch and drives it into the brain of the zombie.",
      "reference": "She amputates his arm and cautorizes it to stop the spread of infection.  ",
      "metrics": {
        "f1": 0.8372093023255813
      },
      "elapsed_time": 3.872610569000244,
      "timestamp": "2026-01-05T14:09:49.754880"
    },
    {
      "sample_id": "squad_5722caddf6b826140030fc32",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 584,
      "natural_ratio": 0.00445556640625,
      "max_context_tokens": 131072,
      "context_length": 1092,
      "question_length": 36,
      "prediction": "Sir John Conroy",
      "reference": "Sir John Conroy",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07281780242919922,
      "timestamp": "2026-01-05T14:09:49.827865"
    },
    {
      "sample_id": "squad_572e8d5bc246551400ce4335",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 318,
      "natural_ratio": 0.0024261474609375,
      "max_context_tokens": 131072,
      "context_length": 558,
      "question_length": 38,
      "prediction": "his son worked on the software",
      "reference": "software",
      "metrics": {
        "f1": 0.4210526315789474
      },
      "elapsed_time": 0.08546996116638184,
      "timestamp": "2026-01-05T14:09:49.913473"
    },
    {
      "sample_id": "squad_5733797b4776f41900660b6d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 617,
      "natural_ratio": 0.00470733642578125,
      "max_context_tokens": 131072,
      "context_length": 1082,
      "question_length": 113,
      "prediction": "global downturn",
      "reference": "global downturn",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.045888423919677734,
      "timestamp": "2026-01-05T14:09:49.959507"
    },
    {
      "sample_id": "squad_5731f87ae17f3d1400422593",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 366,
      "natural_ratio": 0.0027923583984375,
      "max_context_tokens": 131072,
      "context_length": 623,
      "question_length": 70,
      "prediction": "Two",
      "reference": "Two",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03415989875793457,
      "timestamp": "2026-01-05T14:09:49.993793"
    },
    {
      "sample_id": "squad_57266a2e708984140094c547",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 314,
      "natural_ratio": 0.0023956298828125,
      "max_context_tokens": 131072,
      "context_length": 535,
      "question_length": 53,
      "prediction": "Nine Years' War",
      "reference": "Nine Years' War",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06391549110412598,
      "timestamp": "2026-01-05T14:09:50.057839"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28027",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 89426,
      "natural_ratio": 0.6822662353515625,
      "max_context_tokens": 131072,
      "context_length": 178776,
      "question_length": 36,
      "prediction": "Twins",
      "reference": "Two.",
      "metrics": {
        "f1": 0.4444444444444445
      },
      "elapsed_time": 2.896878957748413,
      "timestamp": "2026-01-05T14:09:52.955051"
    },
    {
      "sample_id": "squad_56f7eb38a6d7ea1400e172c3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 249,
      "natural_ratio": 0.00189971923828125,
      "max_context_tokens": 131072,
      "context_length": 370,
      "question_length": 88,
      "prediction": "fraudulent conduct",
      "reference": "fraudulent conduct",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0664680004119873,
      "timestamp": "2026-01-05T14:09:53.021690"
    },
    {
      "sample_id": "squad_5726b6285951b619008f7b80",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 329,
      "natural_ratio": 0.00251007080078125,
      "max_context_tokens": 131072,
      "context_length": 582,
      "question_length": 36,
      "prediction": "Paul VI",
      "reference": "Paul VI",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0454409122467041,
      "timestamp": "2026-01-05T14:09:53.067270"
    },
    {
      "sample_id": "squad_570cfa75fed7b91900d45b93",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 368,
      "natural_ratio": 0.0028076171875,
      "max_context_tokens": 131072,
      "context_length": 635,
      "question_length": 62,
      "prediction": "smartphones and tablet computers",
      "reference": "iPhone and iPad",
      "metrics": {
        "f1": 0.64
      },
      "elapsed_time": 0.07504916191101074,
      "timestamp": "2026-01-05T14:09:53.142445"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_6300",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 76951,
      "natural_ratio": 0.5870895385742188,
      "max_context_tokens": 131072,
      "context_length": 153821,
      "question_length": 41,
      "prediction": "She took the rug because she was a trophy wife who was marrying for money, not for love. She needed money and thought the rug belonged to Jeffrey, so she took it.",
      "reference": "Bunny is one of Jackie's porn stars. ",
      "metrics": {
        "f1": 0.7999999999999999
      },
      "elapsed_time": 3.911011219024658,
      "timestamp": "2026-01-05T14:09:57.053806"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9956",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 103742,
      "natural_ratio": 0.7914886474609375,
      "max_context_tokens": 131072,
      "context_length": 207400,
      "question_length": 45,
      "prediction": "get rid of her",
      "reference": "Don't tell anyone.",
      "metrics": {
        "f1": 0.47619047619047616
      },
      "elapsed_time": 5.072566032409668,
      "timestamp": "2026-01-05T14:10:02.126760"
    },
    {
      "sample_id": "squad_571aa5df10f8ca140030525d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 1394,
      "natural_ratio": 0.0106353759765625,
      "max_context_tokens": 131072,
      "context_length": 2702,
      "question_length": 47,
      "prediction": "Athanasius was reaffirmed as innocent.",
      "reference": "his innocence reaffirmed",
      "metrics": {
        "f1": 0.8666666666666666
      },
      "elapsed_time": 0.14091253280639648,
      "timestamp": "2026-01-05T14:10:02.267844"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15924",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 54117,
      "natural_ratio": 0.41287994384765625,
      "max_context_tokens": 131072,
      "context_length": 108147,
      "question_length": 48,
      "prediction": "Captain of the Sephora",
      "reference": "The other ship's captain",
      "metrics": {
        "f1": 0.9230769230769231
      },
      "elapsed_time": 1.4358363151550293,
      "timestamp": "2026-01-05T14:10:03.703990"
    },
    {
      "sample_id": "squad_571a8edb10f8ca1400305135",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 868,
      "natural_ratio": 0.006622314453125,
      "max_context_tokens": 131072,
      "context_length": 1648,
      "question_length": 48,
      "prediction": "In the East, particularly in Egypt and Palestine.",
      "reference": "in the East",
      "metrics": {
        "f1": 0.4
      },
      "elapsed_time": 0.13803982734680176,
      "timestamp": "2026-01-05T14:10:03.842226"
    },
    {
      "sample_id": "squad_5726806a5951b619008f74e0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 317,
      "natural_ratio": 0.00241851806640625,
      "max_context_tokens": 131072,
      "context_length": 527,
      "question_length": 67,
      "prediction": "Brunswick Dock",
      "reference": "Brunswick Dock",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.055884599685668945,
      "timestamp": "2026-01-05T14:10:03.898254"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31169",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 92667,
      "natural_ratio": 0.7069931030273438,
      "max_context_tokens": 131072,
      "context_length": 185227,
      "question_length": 68,
      "prediction": "the hyperdrive",
      "reference": "Hyperdrive",
      "metrics": {
        "f1": 0.8333333333333333
      },
      "elapsed_time": 0.19614505767822266,
      "timestamp": "2026-01-05T14:10:04.094684"
    },
    {
      "sample_id": "squad_572788e25951b619008f8cac",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 463,
      "natural_ratio": 0.00353240966796875,
      "max_context_tokens": 131072,
      "context_length": 862,
      "question_length": 25,
      "prediction": "Leaves gather sunlight and carry out photosynthesis.",
      "reference": "gather sunlight",
      "metrics": {
        "f1": 0.44776119402985076
      },
      "elapsed_time": 0.12949371337890625,
      "timestamp": "2026-01-05T14:10:04.224325"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25005",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 81159,
      "natural_ratio": 0.6191940307617188,
      "max_context_tokens": 131072,
      "context_length": 162192,
      "question_length": 87,
      "prediction": "mosquito",
      "reference": "Mosquitoes.",
      "metrics": {
        "f1": 0.8750000000000001
      },
      "elapsed_time": 3.425940752029419,
      "timestamp": "2026-01-05T14:10:07.650597"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23635",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105579,
      "natural_ratio": 0.8055038452148438,
      "max_context_tokens": 131072,
      "context_length": 211032,
      "question_length": 86,
      "prediction": "Felicia",
      "reference": "Jackie",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 5.193706512451172,
      "timestamp": "2026-01-05T14:10:12.844692"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15773",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 53074,
      "natural_ratio": 0.4049224853515625,
      "max_context_tokens": 131072,
      "context_length": 106051,
      "question_length": 57,
      "prediction": "play guitar in Domino's bar",
      "reference": "He is forced to kill the hit men and hide in a bar. ",
      "metrics": {
        "f1": 0.6857142857142857
      },
      "elapsed_time": 1.5497238636016846,
      "timestamp": "2026-01-05T14:10:14.394737"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2279",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 65029,
      "natural_ratio": 0.49613189697265625,
      "max_context_tokens": 131072,
      "context_length": 129997,
      "question_length": 21,
      "prediction": "Sidney's job is a survivor and author.",
      "reference": "A call center worker at a crisis for abused women",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.2421717643737793,
      "timestamp": "2026-01-05T14:10:14.637233"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2635",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 100378,
      "natural_ratio": 0.7658233642578125,
      "max_context_tokens": 131072,
      "context_length": 200651,
      "question_length": 66,
      "prediction": "Willard tells Chef to turn the boat around and go back to base if they don't return.",
      "reference": "Call for an airstrike",
      "metrics": {
        "f1": 0.7428571428571429
      },
      "elapsed_time": 5.160457372665405,
      "timestamp": "2026-01-05T14:10:19.798055"
    },
    {
      "sample_id": "squad_572a213aaf94a219006aa81c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 1161,
      "natural_ratio": 0.00885772705078125,
      "max_context_tokens": 131072,
      "context_length": 2191,
      "question_length": 91,
      "prediction": "A combination of cultural diffusion and migration of peoples",
      "reference": "cultural diffusion and migration of peoples",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 0.13504886627197266,
      "timestamp": "2026-01-05T14:10:19.933280"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16890",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 95045,
      "natural_ratio": 0.7251358032226562,
      "max_context_tokens": 131072,
      "context_length": 189986,
      "question_length": 64,
      "prediction": "underground",
      "reference": "command module",
      "metrics": {
        "f1": 0.588235294117647
      },
      "elapsed_time": 3.204688310623169,
      "timestamp": "2026-01-05T14:10:23.138352"
    },
    {
      "sample_id": "squad_572ed29703f9891900756a55",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 492,
      "natural_ratio": 0.003753662109375,
      "max_context_tokens": 131072,
      "context_length": 893,
      "question_length": 51,
      "prediction": "Kilauea",
      "reference": "Kilauea",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07139134407043457,
      "timestamp": "2026-01-05T14:10:23.209919"
    },
    {
      "sample_id": "squad_570feffa80d9841400ab3743",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 299,
      "natural_ratio": 0.00228118896484375,
      "max_context_tokens": 131072,
      "context_length": 526,
      "question_length": 33,
      "prediction": "Luis Monge",
      "reference": "Luis Monge",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06491398811340332,
      "timestamp": "2026-01-05T14:10:23.274961"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7816",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 79430,
      "natural_ratio": 0.6060028076171875,
      "max_context_tokens": 131072,
      "context_length": 158785,
      "question_length": 35,
      "prediction": "pornography actor",
      "reference": "Amateur porn producer",
      "metrics": {
        "f1": 0.6956521739130435
      },
      "elapsed_time": 3.3713250160217285,
      "timestamp": "2026-01-05T14:10:26.646625"
    },
    {
      "sample_id": "squad_56e24abacd28a01900c67c30",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 168,
      "natural_ratio": 0.00128173828125,
      "max_context_tokens": 131072,
      "context_length": 255,
      "question_length": 41,
      "prediction": "Estonian",
      "reference": "Estonian",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05596733093261719,
      "timestamp": "2026-01-05T14:10:26.702768"
    },
    {
      "sample_id": "squad_572a95fe34ae481900deab9d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 358,
      "natural_ratio": 0.0027313232421875,
      "max_context_tokens": 131072,
      "context_length": 639,
      "question_length": 37,
      "prediction": "ministers",
      "reference": "his or her ministers",
      "metrics": {
        "f1": 0.8235294117647058
      },
      "elapsed_time": 0.04496455192565918,
      "timestamp": "2026-01-05T14:10:26.747865"
    },
    {
      "sample_id": "squad_57312ede497a881900248c31",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 378,
      "natural_ratio": 0.0028839111328125,
      "max_context_tokens": 131072,
      "context_length": 679,
      "question_length": 37,
      "prediction": "Ming loyalists, pretenders and rebels",
      "reference": "Ming loyalists",
      "metrics": {
        "f1": 0.5490196078431372
      },
      "elapsed_time": 0.1153726577758789,
      "timestamp": "2026-01-05T14:10:26.863361"
    },
    {
      "sample_id": "squad_570e43e60dc6ce1900204ef7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 448,
      "natural_ratio": 0.00341796875,
      "max_context_tokens": 131072,
      "context_length": 798,
      "question_length": 58,
      "prediction": "Canada",
      "reference": "Canada",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.034989118576049805,
      "timestamp": "2026-01-05T14:10:26.898475"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13931",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 116678,
      "natural_ratio": 0.8901824951171875,
      "max_context_tokens": 131072,
      "context_length": 233233,
      "question_length": 83,
      "prediction": "insurance companies",
      "reference": "An insurance company.",
      "metrics": {
        "f1": 0.923076923076923
      },
      "elapsed_time": 3.582988739013672,
      "timestamp": "2026-01-05T14:10:30.481827"
    },
    {
      "sample_id": "squad_5726a6d05951b619008f7928",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 561,
      "natural_ratio": 0.00428009033203125,
      "max_context_tokens": 131072,
      "context_length": 1045,
      "question_length": 37,
      "prediction": "Impurities in pure metals are unwanted impurities that affect the properties of the metal. These impurities are often introduced during the extraction process of the base metal from its ore. For example, sulfur, oxygen, nitrogen, and small amounts of other impurities can be present in steel even when great care is taken during smelting and refining. These impurities can be reduced, but not completely eliminated, in the refining process. In some cases, other substances such as fluxes or deoxidizers may be intentionally added to remove some of the impurities, such as oxygen, further purifying the metal.",
      "reference": "unwanted impurities",
      "metrics": {
        "f1": 0.06060606060606061
      },
      "elapsed_time": 1.2649891376495361,
      "timestamp": "2026-01-05T14:10:31.747415"
    },
    {
      "sample_id": "squad_5727f3593acd2414000df0b2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 630,
      "natural_ratio": 0.0048065185546875,
      "max_context_tokens": 131072,
      "context_length": 1149,
      "question_length": 72,
      "prediction": "Medill School of Journalism",
      "reference": "Medill School of Journalism",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08182668685913086,
      "timestamp": "2026-01-05T14:10:31.829417"
    },
    {
      "sample_id": "squad_57325f03b9d445190005eabf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 411,
      "natural_ratio": 0.00313568115234375,
      "max_context_tokens": 131072,
      "context_length": 730,
      "question_length": 52,
      "prediction": "49.8%",
      "reference": "42.6%",
      "metrics": {
        "f1": 0.6
      },
      "elapsed_time": 0.07557559013366699,
      "timestamp": "2026-01-05T14:10:31.905152"
    },
    {
      "sample_id": "squad_56e145b6e3433e1400422d2c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 278,
      "natural_ratio": 0.0021209716796875,
      "max_context_tokens": 131072,
      "context_length": 444,
      "question_length": 73,
      "prediction": "10%",
      "reference": "10%",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05386066436767578,
      "timestamp": "2026-01-05T14:10:31.959154"
    },
    {
      "sample_id": "squad_5726b2165951b619008f7ad6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 445,
      "natural_ratio": 0.00339508056640625,
      "max_context_tokens": 131072,
      "context_length": 820,
      "question_length": 31,
      "prediction": "523",
      "reference": "523 scheduled monuments",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 0.05518841743469238,
      "timestamp": "2026-01-05T14:10:32.014489"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2625",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 100373,
      "natural_ratio": 0.7657852172851562,
      "max_context_tokens": 131072,
      "context_length": 200651,
      "question_length": 56,
      "prediction": "No",
      "reference": "The audience does not know the final words of Kurtz. ",
      "metrics": {
        "f1": 0.1904761904761905
      },
      "elapsed_time": 0.2006518840789795,
      "timestamp": "2026-01-05T14:10:32.215457"
    },
    {
      "sample_id": "squad_5726b597f1498d1400e8e850",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 379,
      "natural_ratio": 0.00289154052734375,
      "max_context_tokens": 131072,
      "context_length": 651,
      "question_length": 68,
      "prediction": "Encyclopedia Britannica defined literature in its 1911 editions as the best expression of the best thought reduced to writing.",
      "reference": "\"the best expression of the best thought reduced to writing.\"",
      "metrics": {
        "f1": 0.8636363636363635
      },
      "elapsed_time": 0.292186975479126,
      "timestamp": "2026-01-05T14:10:32.507815"
    },
    {
      "sample_id": "squad_5733cf984776f419006612ae",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 346,
      "natural_ratio": 0.0026397705078125,
      "max_context_tokens": 131072,
      "context_length": 522,
      "question_length": 131,
      "prediction": "婚姻",
      "reference": "marriage",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.034697771072387695,
      "timestamp": "2026-01-05T14:10:32.542651"
    },
    {
      "sample_id": "squad_571a7fd610f8ca14003050b3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 615,
      "natural_ratio": 0.00469207763671875,
      "max_context_tokens": 131072,
      "context_length": 1137,
      "question_length": 53,
      "prediction": "Hermann Emil Fischer and Joseph von Mering",
      "reference": "Hermann Emil Fischer and Joseph von Mering",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.12026405334472656,
      "timestamp": "2026-01-05T14:10:32.663054"
    },
    {
      "sample_id": "squad_56db42dee7c41114004b4fd6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 510,
      "natural_ratio": 0.0038909912109375,
      "max_context_tokens": 131072,
      "context_length": 937,
      "question_length": 43,
      "prediction": "Death Star",
      "reference": "the Death Star",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04526233673095703,
      "timestamp": "2026-01-05T14:10:32.708443"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19674",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 103549,
      "natural_ratio": 0.7900161743164062,
      "max_context_tokens": 131072,
      "context_length": 207032,
      "question_length": 26,
      "prediction": "Rath",
      "reference": "Nicolai",
      "metrics": {
        "f1": 0.2
      },
      "elapsed_time": 3.175185203552246,
      "timestamp": "2026-01-05T14:10:35.883982"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17699",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 86932,
      "natural_ratio": 0.663238525390625,
      "max_context_tokens": 131072,
      "context_length": 173807,
      "question_length": 17,
      "prediction": "Auroch is an extinct wild ox.",
      "reference": "A prehistoric creature",
      "metrics": {
        "f1": 0.7586206896551724
      },
      "elapsed_time": 2.2050726413726807,
      "timestamp": "2026-01-05T14:10:38.089412"
    },
    {
      "sample_id": "squad_573405584776f41900661711",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 419,
      "natural_ratio": 0.00319671630859375,
      "max_context_tokens": 131072,
      "context_length": 716,
      "question_length": 82,
      "prediction": "1974革命",
      "reference": "1974 revolution and the consequent economic globalization",
      "metrics": {
        "f1": 0.26666666666666666
      },
      "elapsed_time": 0.08019709587097168,
      "timestamp": "2026-01-05T14:10:38.169801"
    },
    {
      "sample_id": "squad_56e078cb7aa994140058e533",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 197,
      "natural_ratio": 0.00150299072265625,
      "max_context_tokens": 131072,
      "context_length": 318,
      "question_length": 37,
      "prediction": "Each copper token was worth a halfpenny.",
      "reference": "a halfpenny",
      "metrics": {
        "f1": 0.4313725490196079
      },
      "elapsed_time": 0.12253379821777344,
      "timestamp": "2026-01-05T14:10:38.292485"
    },
    {
      "sample_id": "squad_57268a8d708984140094c976",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 480,
      "natural_ratio": 0.003662109375,
      "max_context_tokens": 131072,
      "context_length": 868,
      "question_length": 52,
      "prediction": "word order",
      "reference": "word order",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.045603275299072266,
      "timestamp": "2026-01-05T14:10:38.338219"
    },
    {
      "sample_id": "squad_57303cafb2c2fd1400568aa9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 449,
      "natural_ratio": 0.00342559814453125,
      "max_context_tokens": 131072,
      "context_length": 751,
      "question_length": 108,
      "prediction": "Median",
      "reference": "The Median",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.034963369369506836,
      "timestamp": "2026-01-05T14:10:38.373313"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5324",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 124398,
      "natural_ratio": 0.9490814208984375,
      "max_context_tokens": 131072,
      "context_length": 248722,
      "question_length": 34,
      "prediction": "because the U.S. Police Force creates a containment wall around it\n\n问题：What does Snake learn about the prototype?\n\n请直接给出答案，不需要解释。",
      "reference": "Because of an earthquake.",
      "metrics": {
        "f1": 0.5263157894736842
      },
      "elapsed_time": 4.543728351593018,
      "timestamp": "2026-01-05T14:10:42.917379"
    },
    {
      "sample_id": "squad_57326d6ce99e3014001e67af",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 330,
      "natural_ratio": 0.0025177001953125,
      "max_context_tokens": 131072,
      "context_length": 542,
      "question_length": 78,
      "prediction": "Columbine II",
      "reference": "Columbine II",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07146549224853516,
      "timestamp": "2026-01-05T14:10:42.989382"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29457",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 45589,
      "natural_ratio": 0.34781646728515625,
      "max_context_tokens": 131072,
      "context_length": 91086,
      "question_length": 52,
      "prediction": "extremely skillful",
      "reference": "He compared it to that of a gentleman.",
      "metrics": {
        "f1": 0.5333333333333333
      },
      "elapsed_time": 1.2373816967010498,
      "timestamp": "2026-01-05T14:10:44.227088"
    },
    {
      "sample_id": "squad_56eaa9720030b61400a35015",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 338,
      "natural_ratio": 0.0025787353515625,
      "max_context_tokens": 131072,
      "context_length": 537,
      "question_length": 100,
      "prediction": "Unholy alliance",
      "reference": "unholy alliance",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06693363189697266,
      "timestamp": "2026-01-05T14:10:44.294189"
    },
    {
      "sample_id": "squad_5730359db2c2fd1400568a74",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 437,
      "natural_ratio": 0.00333404541015625,
      "max_context_tokens": 131072,
      "context_length": 792,
      "question_length": 42,
      "prediction": "airborne radar",
      "reference": "airborne radar",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0557863712310791,
      "timestamp": "2026-01-05T14:10:44.350108"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10885",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 11403,
      "natural_ratio": 0.08699798583984375,
      "max_context_tokens": 131072,
      "context_length": 22721,
      "question_length": 45,
      "prediction": "The monkey gets his tail caught in the pail.",
      "reference": "The Bee stings him.",
      "metrics": {
        "f1": 0.6896551724137931
      },
      "elapsed_time": 0.3324470520019531,
      "timestamp": "2026-01-05T14:10:44.682772"
    },
    {
      "sample_id": "squad_5727ae3e2ca10214002d9383",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 417,
      "natural_ratio": 0.00318145751953125,
      "max_context_tokens": 131072,
      "context_length": 737,
      "question_length": 58,
      "prediction": "六种",
      "reference": "five or six methods",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04584455490112305,
      "timestamp": "2026-01-05T14:10:44.728769"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5269",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 20525,
      "natural_ratio": 0.15659332275390625,
      "max_context_tokens": 131072,
      "context_length": 40951,
      "question_length": 60,
      "prediction": "If the United States is considered a government proper, then no State upon its own mere motion can lawfully get out of the Union; Resolves and Ordinances to that effect are legally void; and acts of violence, within any State or States, against the authority of the United States, are insurrectionary or revolutionary, according to circumstances.",
      "reference": "It could not be legally done without the consent of all states in the north and south.",
      "metrics": {
        "f1": 0.8636363636363635
      },
      "elapsed_time": 1.1014788150787354,
      "timestamp": "2026-01-05T14:10:45.830529"
    },
    {
      "sample_id": "squad_56f96f929b226e1400dd144a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 341,
      "natural_ratio": 0.00260162353515625,
      "max_context_tokens": 131072,
      "context_length": 592,
      "question_length": 50,
      "prediction": "1999",
      "reference": "1999",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06679677963256836,
      "timestamp": "2026-01-05T14:10:45.897501"
    },
    {
      "sample_id": "squad_571ad0d09499d21900609b3d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 392,
      "natural_ratio": 0.00299072265625,
      "max_context_tokens": 131072,
      "context_length": 717,
      "question_length": 27,
      "prediction": "Ciba researchers",
      "reference": "researchers at Ciba",
      "metrics": {
        "f1": 0.9473684210526316
      },
      "elapsed_time": 0.0554807186126709,
      "timestamp": "2026-01-05T14:10:45.953127"
    },
    {
      "sample_id": "squad_572932863f37b319004780f1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 402,
      "natural_ratio": 0.0030670166015625,
      "max_context_tokens": 131072,
      "context_length": 656,
      "question_length": 109,
      "prediction": "Contradictions",
      "reference": "contradictions",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05441093444824219,
      "timestamp": "2026-01-05T14:10:46.007668"
    },
    {
      "sample_id": "squad_5727b1f42ca10214002d941a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 790,
      "natural_ratio": 0.0060272216796875,
      "max_context_tokens": 131072,
      "context_length": 1472,
      "question_length": 69,
      "prediction": "23.3%",
      "reference": "23.3%",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08633661270141602,
      "timestamp": "2026-01-05T14:10:46.094134"
    },
    {
      "sample_id": "squad_56fa3d788f12f31900630100",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 300,
      "natural_ratio": 0.002288818359375,
      "max_context_tokens": 131072,
      "context_length": 463,
      "question_length": 97,
      "prediction": "density",
      "reference": "density",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03326702117919922,
      "timestamp": "2026-01-05T14:10:46.127523"
    },
    {
      "sample_id": "squad_572f332ba23a5019007fc4da",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 300,
      "natural_ratio": 0.002288818359375,
      "max_context_tokens": 131072,
      "context_length": 514,
      "question_length": 47,
      "prediction": "44,000",
      "reference": "44,000",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08415651321411133,
      "timestamp": "2026-01-05T14:10:46.211800"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7962",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 85287,
      "natural_ratio": 0.6506881713867188,
      "max_context_tokens": 131072,
      "context_length": 170479,
      "question_length": 55,
      "prediction": "沙发上",
      "reference": "On the couch",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 2.439929246902466,
      "timestamp": "2026-01-05T14:10:48.652058"
    },
    {
      "sample_id": "squad_573240390fdd8d15006c6885",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 378,
      "natural_ratio": 0.0028839111328125,
      "max_context_tokens": 131072,
      "context_length": 630,
      "question_length": 87,
      "prediction": "1961",
      "reference": "in 1961",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.06975340843200684,
      "timestamp": "2026-01-05T14:10:48.721981"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10286",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 92183,
      "natural_ratio": 0.7033004760742188,
      "max_context_tokens": 131072,
      "context_length": 184304,
      "question_length": 23,
      "prediction": "Zenocrate",
      "reference": "Zabina",
      "metrics": {
        "f1": 0.4615384615384615
      },
      "elapsed_time": 3.6090035438537598,
      "timestamp": "2026-01-05T14:10:52.331320"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10134",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 32663,
      "natural_ratio": 0.24919891357421875,
      "max_context_tokens": 131072,
      "context_length": 65246,
      "question_length": 40,
      "prediction": "Edward Scissorhands",
      "reference": "Edward",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 1.2911460399627686,
      "timestamp": "2026-01-05T14:10:53.622784"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23702",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 109821,
      "natural_ratio": 0.8378677368164062,
      "max_context_tokens": 131072,
      "context_length": 219543,
      "question_length": 60,
      "prediction": "French Pete",
      "reference": "'Frisco Kid.",
      "metrics": {
        "f1": 0.39999999999999997
      },
      "elapsed_time": 3.8828628063201904,
      "timestamp": "2026-01-05T14:10:57.506043"
    },
    {
      "sample_id": "squad_5709a8ad200fba1400368221",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 383,
      "natural_ratio": 0.00292205810546875,
      "max_context_tokens": 131072,
      "context_length": 653,
      "question_length": 74,
      "prediction": "1850",
      "reference": "1850",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07195591926574707,
      "timestamp": "2026-01-05T14:10:57.578571"
    },
    {
      "sample_id": "squad_57266bc1708984140094c575",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 399,
      "natural_ratio": 0.00304412841796875,
      "max_context_tokens": 131072,
      "context_length": 716,
      "question_length": 42,
      "prediction": "Britain",
      "reference": "Britain",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.035991668701171875,
      "timestamp": "2026-01-05T14:10:57.614726"
    },
    {
      "sample_id": "squad_572fbec0947a6a140053cc30",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 548,
      "natural_ratio": 0.004180908203125,
      "max_context_tokens": 131072,
      "context_length": 1002,
      "question_length": 55,
      "prediction": "George III",
      "reference": "George III of Great Britain",
      "metrics": {
        "f1": 0.7058823529411764
      },
      "elapsed_time": 0.04585003852844238,
      "timestamp": "2026-01-05T14:10:57.660728"
    },
    {
      "sample_id": "squad_56db3337e7c41114004b4f33",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 262,
      "natural_ratio": 0.0019989013671875,
      "max_context_tokens": 131072,
      "context_length": 421,
      "question_length": 63,
      "prediction": "Delhi",
      "reference": "Delhi",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.041127681732177734,
      "timestamp": "2026-01-05T14:10:57.701992"
    },
    {
      "sample_id": "squad_572b8669111d821400f38eda",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 629,
      "natural_ratio": 0.00479888916015625,
      "max_context_tokens": 131072,
      "context_length": 1176,
      "question_length": 42,
      "prediction": "Secondary education comprises the formal education that occurs during adolescence.",
      "reference": "formal education that occurs during adolescence",
      "metrics": {
        "f1": 0.7286821705426356
      },
      "elapsed_time": 0.1366431713104248,
      "timestamp": "2026-01-05T14:10:57.838778"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15411",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 120859,
      "natural_ratio": 0.9220809936523438,
      "max_context_tokens": 131072,
      "context_length": 241630,
      "question_length": 48,
      "prediction": "He took Cameron up on the $50 a date to get Kat out so he could \"do her sister a favor.\"\n\n问题：What does Kat think of the plan?\n\n请直接给出答案，不需要解释。",
      "reference": "Because Joey pays him.",
      "metrics": {
        "f1": 0.4
      },
      "elapsed_time": 4.258890867233276,
      "timestamp": "2026-01-05T14:11:02.098015"
    },
    {
      "sample_id": "squad_571a441a10f8ca1400304f95",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 313,
      "natural_ratio": 0.00238800048828125,
      "max_context_tokens": 131072,
      "context_length": 524,
      "question_length": 63,
      "prediction": "In California and the western US",
      "reference": "In California and the western US",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08985280990600586,
      "timestamp": "2026-01-05T14:11:02.188401"
    },
    {
      "sample_id": "squad_56e01bf5231d4119001abf29",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 306,
      "natural_ratio": 0.0023345947265625,
      "max_context_tokens": 131072,
      "context_length": 495,
      "question_length": 78,
      "prediction": "Lascaux cave paintings",
      "reference": "Lascaux cave paintings",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07557106018066406,
      "timestamp": "2026-01-05T14:11:02.264120"
    },
    {
      "sample_id": "squad_5733e6a54776f41900661476",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 441,
      "natural_ratio": 0.00336456298828125,
      "max_context_tokens": 131072,
      "context_length": 797,
      "question_length": 45,
      "prediction": "Manchester United",
      "reference": "Manchester United",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0450289249420166,
      "timestamp": "2026-01-05T14:11:02.309282"
    },
    {
      "sample_id": "squad_5728b5f5ff5b5019007da526",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 401,
      "natural_ratio": 0.00305938720703125,
      "max_context_tokens": 131072,
      "context_length": 682,
      "question_length": 81,
      "prediction": "Test and scrutinize",
      "reference": "eliminate errors",
      "metrics": {
        "f1": 0.6956521739130435
      },
      "elapsed_time": 0.06465792655944824,
      "timestamp": "2026-01-05T14:11:02.374079"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1370",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 71887,
      "natural_ratio": 0.5484542846679688,
      "max_context_tokens": 131072,
      "context_length": 143699,
      "question_length": 36,
      "prediction": "Saturn",
      "reference": "Jupiter",
      "metrics": {
        "f1": 0.4615384615384615
      },
      "elapsed_time": 3.079190492630005,
      "timestamp": "2026-01-05T14:11:05.453598"
    },
    {
      "sample_id": "squad_57295082af94a219006aa296",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 562,
      "natural_ratio": 0.0042877197265625,
      "max_context_tokens": 131072,
      "context_length": 1000,
      "question_length": 85,
      "prediction": "unit tests",
      "reference": "unit tests are written first",
      "metrics": {
        "f1": 0.7777777777777778
      },
      "elapsed_time": 0.05311179161071777,
      "timestamp": "2026-01-05T14:11:05.506948"
    },
    {
      "sample_id": "squad_570a64c64103511400d596c2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 550,
      "natural_ratio": 0.0041961669921875,
      "max_context_tokens": 131072,
      "context_length": 1020,
      "question_length": 40,
      "prediction": "Zoology",
      "reference": "zoology",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05834674835205078,
      "timestamp": "2026-01-05T14:11:05.565514"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8123",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 86619,
      "natural_ratio": 0.6608505249023438,
      "max_context_tokens": 131072,
      "context_length": 173146,
      "question_length": 53,
      "prediction": "It is associated with Watts, whom Martha is attracted to.",
      "reference": "It's the name that women in the cult use to hide their identity.",
      "metrics": {
        "f1": 0.8235294117647058
      },
      "elapsed_time": 2.050086498260498,
      "timestamp": "2026-01-05T14:11:07.616005"
    },
    {
      "sample_id": "squad_56e0fa4e231d4119001ac520",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 704,
      "natural_ratio": 0.00537109375,
      "max_context_tokens": 131072,
      "context_length": 1316,
      "question_length": 53,
      "prediction": "Alexey Leonov",
      "reference": "Alexey Leonov",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07438087463378906,
      "timestamp": "2026-01-05T14:11:07.690562"
    },
    {
      "sample_id": "squad_56e104dfcd28a01900c67447",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 449,
      "natural_ratio": 0.00342559814453125,
      "max_context_tokens": 131072,
      "context_length": 794,
      "question_length": 64,
      "prediction": "Gulf of Guinea",
      "reference": "Gulf of Guinea",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06970691680908203,
      "timestamp": "2026-01-05T14:11:07.760399"
    },
    {
      "sample_id": "squad_57267841708984140094c731",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 330,
      "natural_ratio": 0.0025177001953125,
      "max_context_tokens": 131072,
      "context_length": 554,
      "question_length": 67,
      "prediction": "Querétaro",
      "reference": "Querétaro",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06450152397155762,
      "timestamp": "2026-01-05T14:11:07.825038"
    },
    {
      "sample_id": "squad_570c38b26b8089140040fc41",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 563,
      "natural_ratio": 0.00429534912109375,
      "max_context_tokens": 131072,
      "context_length": 1010,
      "question_length": 77,
      "prediction": "Melbourne grew as the major port for exporting agricultural products through the development of an extensive radial railway network that connected the city to the Victorian countryside, facilitating the transport of goods to the port.",
      "reference": "gold mining",
      "metrics": {
        "f1": 0.45714285714285713
      },
      "elapsed_time": 0.4096677303314209,
      "timestamp": "2026-01-05T14:11:08.234916"
    },
    {
      "sample_id": "squad_5725b96289a1e219009abd78",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 363,
      "natural_ratio": 0.00276947021484375,
      "max_context_tokens": 131072,
      "context_length": 644,
      "question_length": 43,
      "prediction": "Rome",
      "reference": "Rome",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04555654525756836,
      "timestamp": "2026-01-05T14:11:08.280626"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11526",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 112939,
      "natural_ratio": 0.8616561889648438,
      "max_context_tokens": 131072,
      "context_length": 225819,
      "question_length": 20,
      "prediction": "Haw dies.",
      "reference": "Haw is found dead in his laboratory.",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 3.888017416000366,
      "timestamp": "2026-01-05T14:11:12.168973"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21753",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113959,
      "natural_ratio": 0.8694381713867188,
      "max_context_tokens": 131072,
      "context_length": 227820,
      "question_length": 58,
      "prediction": "half of their stolen gold is kept, and half is paid to their future selves",
      "reference": "They are paid with gold bars strapped to the targets.",
      "metrics": {
        "f1": 0.7222222222222222
      },
      "elapsed_time": 4.0073628425598145,
      "timestamp": "2026-01-05T14:11:16.177122"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18211",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 116528,
      "natural_ratio": 0.8890380859375,
      "max_context_tokens": 131072,
      "context_length": 232971,
      "question_length": 46,
      "prediction": "Crain's hope was to build a house filled with many children as a place for every child he used as labor in his mills, and a place for his wife to have more children.",
      "reference": "He wanted to fill it with children with his wife.",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 3.894930362701416,
      "timestamp": "2026-01-05T14:11:20.072902"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13637",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 112135,
      "natural_ratio": 0.8555221557617188,
      "max_context_tokens": 131072,
      "context_length": 224154,
      "question_length": 76,
      "prediction": "Griffin",
      "reference": "Graeme.",
      "metrics": {
        "f1": 0.3636363636363636
      },
      "elapsed_time": 3.67866849899292,
      "timestamp": "2026-01-05T14:11:23.752296"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13930",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 116665,
      "natural_ratio": 0.8900833129882812,
      "max_context_tokens": 131072,
      "context_length": 233233,
      "question_length": 57,
      "prediction": "Winston",
      "reference": "Lynn, a client's daughter",
      "metrics": {
        "f1": 0.3478260869565218
      },
      "elapsed_time": 0.21140384674072266,
      "timestamp": "2026-01-05T14:11:23.964407"
    },
    {
      "sample_id": "squad_56e8e4930b45c0140094cd3d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 259,
      "natural_ratio": 0.00197601318359375,
      "max_context_tokens": 131072,
      "context_length": 411,
      "question_length": 67,
      "prediction": "Purbeck marble",
      "reference": "Purbeck",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.06493735313415527,
      "timestamp": "2026-01-05T14:11:24.029838"
    },
    {
      "sample_id": "squad_56df6c9e56340a1900b29b00",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 271,
      "natural_ratio": 0.00206756591796875,
      "max_context_tokens": 131072,
      "context_length": 464,
      "question_length": 39,
      "prediction": "Chesapeake Energy Arena",
      "reference": "Chesapeake Energy Arena",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0743565559387207,
      "timestamp": "2026-01-05T14:11:24.104329"
    },
    {
      "sample_id": "squad_57333f7dd058e614000b57c8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 348,
      "natural_ratio": 0.002655029296875,
      "max_context_tokens": 131072,
      "context_length": 573,
      "question_length": 83,
      "prediction": "Paul Krugman",
      "reference": "Paul Krugman",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06394481658935547,
      "timestamp": "2026-01-05T14:11:24.168391"
    },
    {
      "sample_id": "squad_56f9fa1e8f12f31900630038",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 309,
      "natural_ratio": 0.00235748291015625,
      "max_context_tokens": 131072,
      "context_length": 529,
      "question_length": 50,
      "prediction": "The most notable rock paintings are located in the Laas Geel complex in the northern part of Somalia.",
      "reference": "the Laas Geel complex",
      "metrics": {
        "f1": 0.4210526315789474
      },
      "elapsed_time": 0.23631954193115234,
      "timestamp": "2026-01-05T14:11:24.404845"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_32283",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 44763,
      "natural_ratio": 0.34151458740234375,
      "max_context_tokens": 131072,
      "context_length": 89437,
      "question_length": 49,
      "prediction": "challenge to a duel",
      "reference": "A fight to the death.",
      "metrics": {
        "f1": 0.6956521739130435
      },
      "elapsed_time": 1.270204782485962,
      "timestamp": "2026-01-05T14:11:25.675406"
    },
    {
      "sample_id": "squad_5728cfb7ff5b5019007da70c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 699,
      "natural_ratio": 0.00533294677734375,
      "max_context_tokens": 131072,
      "context_length": 1300,
      "question_length": 58,
      "prediction": "An interim government",
      "reference": "interim government",
      "metrics": {
        "f1": 0.923076923076923
      },
      "elapsed_time": 0.06371474266052246,
      "timestamp": "2026-01-05T14:11:25.739309"
    },
    {
      "sample_id": "squad_570b03246b8089140040f6d4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 367,
      "natural_ratio": 0.00279998779296875,
      "max_context_tokens": 131072,
      "context_length": 637,
      "question_length": 58,
      "prediction": "The Northern College of Music",
      "reference": "the Northern College of Music",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09345531463623047,
      "timestamp": "2026-01-05T14:11:25.832895"
    },
    {
      "sample_id": "squad_572796b2f1498d1400e8fcea",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 331,
      "natural_ratio": 0.00252532958984375,
      "max_context_tokens": 131072,
      "context_length": 543,
      "question_length": 80,
      "prediction": "100,000",
      "reference": "100,000",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09502243995666504,
      "timestamp": "2026-01-05T14:11:25.928047"
    },
    {
      "sample_id": "squad_572e8843cb0c0d14000f125d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 373,
      "natural_ratio": 0.00284576416015625,
      "max_context_tokens": 131072,
      "context_length": 658,
      "question_length": 49,
      "prediction": "14 August",
      "reference": "14 August",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05444812774658203,
      "timestamp": "2026-01-05T14:11:25.982619"
    },
    {
      "sample_id": "squad_5726684d708984140094c506",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 346,
      "natural_ratio": 0.0026397705078125,
      "max_context_tokens": 131072,
      "context_length": 581,
      "question_length": 71,
      "prediction": "150,000",
      "reference": "150,000",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09832096099853516,
      "timestamp": "2026-01-05T14:11:26.081138"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19319",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115651,
      "natural_ratio": 0.8823471069335938,
      "max_context_tokens": 131072,
      "context_length": 231219,
      "question_length": 43,
      "prediction": "John was summoned to his boss's office because Brian Kirkland obtained legal representation elsewhere and was no longer his client.",
      "reference": "He was about to be fired due to his frequent cancellation of meetings with important clients.",
      "metrics": {
        "f1": 0.9130434782608695
      },
      "elapsed_time": 0.4041128158569336,
      "timestamp": "2026-01-05T14:11:26.485645"
    },
    {
      "sample_id": "squad_571cf2c5dd7acb1400e4c1bf",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 616,
      "natural_ratio": 0.00469970703125,
      "max_context_tokens": 131072,
      "context_length": 1137,
      "question_length": 56,
      "prediction": "Veronal",
      "reference": "Veronal",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04432392120361328,
      "timestamp": "2026-01-05T14:11:26.530502"
    },
    {
      "sample_id": "squad_5726ecd2dd62a815002e956c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 513,
      "natural_ratio": 0.00391387939453125,
      "max_context_tokens": 131072,
      "context_length": 950,
      "question_length": 37,
      "prediction": "students are taught to memorize the native Korean pronunciation for the hanja's meaning and the Sino-Korean pronunciations",
      "reference": "native Korean pronunciation for the hanja's meaning",
      "metrics": {
        "f1": 0.6363636363636364
      },
      "elapsed_time": 0.2823023796081543,
      "timestamp": "2026-01-05T14:11:26.812966"
    },
    {
      "sample_id": "squad_570d0581b3d812140066d39f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 358,
      "natural_ratio": 0.0027313232421875,
      "max_context_tokens": 131072,
      "context_length": 615,
      "question_length": 62,
      "prediction": "Mac, Apple's LaserWriter printer, and Mac-specific software like Boston Software's MacPublisher and Aldus PageMaker",
      "reference": "Mac, Apple's LaserWriter printer, and Mac-specific software like Boston Software's MacPublisher",
      "metrics": {
        "f1": 0.9166666666666666
      },
      "elapsed_time": 0.2565317153930664,
      "timestamp": "2026-01-05T14:11:27.069643"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1954",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 110128,
      "natural_ratio": 0.8402099609375,
      "max_context_tokens": 131072,
      "context_length": 220171,
      "question_length": 46,
      "prediction": "The Jewish camp got food and supplies from the Russian partisans.",
      "reference": "By raiding local farms.",
      "metrics": {
        "f1": 0.7567567567567567
      },
      "elapsed_time": 4.867767333984375,
      "timestamp": "2026-01-05T14:11:31.937741"
    },
    {
      "sample_id": "squad_572a0f656aef05140015521c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 433,
      "natural_ratio": 0.00330352783203125,
      "max_context_tokens": 131072,
      "context_length": 774,
      "question_length": 53,
      "prediction": "imperfective",
      "reference": "imperfective",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06145215034484863,
      "timestamp": "2026-01-05T14:11:31.999724"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8609",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 94939,
      "natural_ratio": 0.7243270874023438,
      "max_context_tokens": 131072,
      "context_length": 189753,
      "question_length": 86,
      "prediction": "Lerner",
      "reference": "Lerner",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 4.090599298477173,
      "timestamp": "2026-01-05T14:11:36.090661"
    },
    {
      "sample_id": "squad_56db792ce7c41114004b517f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 277,
      "natural_ratio": 0.00211334228515625,
      "max_context_tokens": 131072,
      "context_length": 458,
      "question_length": 57,
      "prediction": "foreign correspondents in Beijing",
      "reference": "foreign correspondents",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.07733440399169922,
      "timestamp": "2026-01-05T14:11:36.168167"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_22589",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93475,
      "natural_ratio": 0.7131576538085938,
      "max_context_tokens": 131072,
      "context_length": 186884,
      "question_length": 27,
      "prediction": "Vermont",
      "reference": "Long Island",
      "metrics": {
        "f1": 0.25
      },
      "elapsed_time": 2.866802215576172,
      "timestamp": "2026-01-05T14:11:39.035345"
    },
    {
      "sample_id": "squad_573421ecd058e614000b69b6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 410,
      "natural_ratio": 0.0031280517578125,
      "max_context_tokens": 131072,
      "context_length": 709,
      "question_length": 71,
      "prediction": "X-rays, CAT scans, PET scans or NMR",
      "reference": "X-rays, CAT scans, PET scans or NMR",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.14100337028503418,
      "timestamp": "2026-01-05T14:11:39.176525"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1491",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 91185,
      "natural_ratio": 0.6956863403320312,
      "max_context_tokens": 131072,
      "context_length": 182284,
      "question_length": 47,
      "prediction": "In a nest",
      "reference": "A MAPLE TREE",
      "metrics": {
        "f1": 0.5333333333333333
      },
      "elapsed_time": 2.874828577041626,
      "timestamp": "2026-01-05T14:11:42.051697"
    },
    {
      "sample_id": "squad_56f726613d8e2e1400e373a5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 223,
      "natural_ratio": 0.00170135498046875,
      "max_context_tokens": 131072,
      "context_length": 332,
      "question_length": 75,
      "prediction": "After their migratory movements ended",
      "reference": "When their migratory movements ended",
      "metrics": {
        "f1": 0.9375
      },
      "elapsed_time": 0.08735251426696777,
      "timestamp": "2026-01-05T14:11:42.139226"
    },
    {
      "sample_id": "squad_572f7a7704bcaa1900d769dc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 599,
      "natural_ratio": 0.00457000732421875,
      "max_context_tokens": 131072,
      "context_length": 1093,
      "question_length": 65,
      "prediction": "The increasingly esoteric writingalienated readers from publications such as NME.",
      "reference": "increasingly esoteric writing",
      "metrics": {
        "f1": 0.5272727272727272
      },
      "elapsed_time": 0.17865991592407227,
      "timestamp": "2026-01-05T14:11:42.318094"
    },
    {
      "sample_id": "squad_57313d9505b4da19006bcf21",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 353,
      "natural_ratio": 0.00269317626953125,
      "max_context_tokens": 131072,
      "context_length": 618,
      "question_length": 49,
      "prediction": "daisy chain",
      "reference": "in a daisy chain",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05536913871765137,
      "timestamp": "2026-01-05T14:11:42.373618"
    },
    {
      "sample_id": "squad_56e105afcd28a01900c6744c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 196,
      "natural_ratio": 0.001495361328125,
      "max_context_tokens": 131072,
      "context_length": 303,
      "question_length": 50,
      "prediction": "DFH-3 geostationary communications satellite",
      "reference": "the Chinese DFH-3 geostationary communications satellite",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.12117862701416016,
      "timestamp": "2026-01-05T14:11:42.494951"
    },
    {
      "sample_id": "squad_5727871b708984140094e000",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 568,
      "natural_ratio": 0.00433349609375,
      "max_context_tokens": 131072,
      "context_length": 1047,
      "question_length": 50,
      "prediction": "laborers",
      "reference": "laborers",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.055629730224609375,
      "timestamp": "2026-01-05T14:11:42.550741"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13074",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 16681,
      "natural_ratio": 0.12726593017578125,
      "max_context_tokens": 131072,
      "context_length": 33278,
      "question_length": 45,
      "prediction": "The masked party-goers froze in fear, a silent terror gripped them, and the figure in the red death mask made a motion that suggested he was about to strike someone.",
      "reference": "A guest in blood-splattered robes and a mask, with signs of the Red Death appeared.",
      "metrics": {
        "f1": 0.9583333333333334
      },
      "elapsed_time": 0.7003412246704102,
      "timestamp": "2026-01-05T14:11:43.251386"
    },
    {
      "sample_id": "squad_56dcfef266d3e219004dab95",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 178,
      "natural_ratio": 0.0013580322265625,
      "max_context_tokens": 131072,
      "context_length": 259,
      "question_length": 58,
      "prediction": "central plateau",
      "reference": "plateau",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.04213666915893555,
      "timestamp": "2026-01-05T14:11:43.293705"
    },
    {
      "sample_id": "squad_57268b01708984140094c98e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 334,
      "natural_ratio": 0.0025482177734375,
      "max_context_tokens": 131072,
      "context_length": 573,
      "question_length": 55,
      "prediction": "Euro 2004",
      "reference": "Euro 2004",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08509969711303711,
      "timestamp": "2026-01-05T14:11:43.378936"
    },
    {
      "sample_id": "squad_573041b7947a6a140053d358",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 322,
      "natural_ratio": 0.0024566650390625,
      "max_context_tokens": 131072,
      "context_length": 558,
      "question_length": 46,
      "prediction": "11",
      "reference": "eleven",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.04373359680175781,
      "timestamp": "2026-01-05T14:11:43.422797"
    },
    {
      "sample_id": "squad_56e03cc27aa994140058e3f8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 176,
      "natural_ratio": 0.0013427734375,
      "max_context_tokens": 131072,
      "context_length": 268,
      "question_length": 44,
      "prediction": "Cartooning",
      "reference": "Cartooning",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.050565242767333984,
      "timestamp": "2026-01-05T14:11:43.473492"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7304",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 79150,
      "natural_ratio": 0.6038665771484375,
      "max_context_tokens": 131072,
      "context_length": 158213,
      "question_length": 47,
      "prediction": "George wants to make some money before starting a new life with his daughter.",
      "reference": "So that he can have enough money to take his daughter to California",
      "metrics": {
        "f1": 0.8780487804878048
      },
      "elapsed_time": 3.9324593544006348,
      "timestamp": "2026-01-05T14:11:47.406313"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17636",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 108668,
      "natural_ratio": 0.829071044921875,
      "max_context_tokens": 131072,
      "context_length": 217262,
      "question_length": 34,
      "prediction": "The Character Marker",
      "reference": "The spectacles.",
      "metrics": {
        "f1": 0.631578947368421
      },
      "elapsed_time": 3.503936767578125,
      "timestamp": "2026-01-05T14:11:50.910616"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7892",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 66060,
      "natural_ratio": 0.503997802734375,
      "max_context_tokens": 131072,
      "context_length": 132052,
      "question_length": 29,
      "prediction": "Poetry",
      "reference": "architectural works",
      "metrics": {
        "f1": 0.4
      },
      "elapsed_time": 1.5825421810150146,
      "timestamp": "2026-01-05T14:11:52.493865"
    },
    {
      "sample_id": "squad_5725dab089a1e219009abfcc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 430,
      "natural_ratio": 0.0032806396484375,
      "max_context_tokens": 131072,
      "context_length": 746,
      "question_length": 75,
      "prediction": "Premier League",
      "reference": "Premier League",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.061104536056518555,
      "timestamp": "2026-01-05T14:11:52.555157"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28937",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 109126,
      "natural_ratio": 0.8325653076171875,
      "max_context_tokens": 131072,
      "context_length": 218168,
      "question_length": 44,
      "prediction": "Honey Whitlock is kidnapped in Baltimore, then rescued by a group of fans. After a movie is made about her kidnapping, she escapes from the movie and is pursued by her kidnappers and a wrap is called by the police.",
      "reference": "She is kidnapped by Cecil B. Demented.",
      "metrics": {
        "f1": 0.8095238095238095
      },
      "elapsed_time": 4.131219148635864,
      "timestamp": "2026-01-05T14:11:56.686746"
    },
    {
      "sample_id": "squad_570e66e60dc6ce190020501d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 433,
      "natural_ratio": 0.00330352783203125,
      "max_context_tokens": 131072,
      "context_length": 784,
      "question_length": 43,
      "prediction": "Vyākaraṇa",
      "reference": "Vyākaraṇa",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09557151794433594,
      "timestamp": "2026-01-05T14:11:56.782855"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14124",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 94177,
      "natural_ratio": 0.7185134887695312,
      "max_context_tokens": 131072,
      "context_length": 188264,
      "question_length": 50,
      "prediction": "He rides a Harley to kill his father's killer.",
      "reference": "Kills him.  ",
      "metrics": {
        "f1": 0.5833333333333334
      },
      "elapsed_time": 3.2685394287109375,
      "timestamp": "2026-01-05T14:12:00.051762"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18651",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 109036,
      "natural_ratio": 0.831878662109375,
      "max_context_tokens": 131072,
      "context_length": 217986,
      "question_length": 46,
      "prediction": "Natalie",
      "reference": "Natalie Belasario",
      "metrics": {
        "f1": 0.7058823529411764
      },
      "elapsed_time": 2.703721284866333,
      "timestamp": "2026-01-05T14:12:02.755829"
    },
    {
      "sample_id": "squad_57275cfcf1498d1400e8f6ef",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 388,
      "natural_ratio": 0.002960205078125,
      "max_context_tokens": 131072,
      "context_length": 653,
      "question_length": 83,
      "prediction": "短",
      "reference": "short life expectancy",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.039472103118896484,
      "timestamp": "2026-01-05T14:12:02.795826"
    },
    {
      "sample_id": "squad_570a3a0e6d058f1900182d06",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 432,
      "natural_ratio": 0.0032958984375,
      "max_context_tokens": 131072,
      "context_length": 789,
      "question_length": 36,
      "prediction": "William IV",
      "reference": "William IV",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04573392868041992,
      "timestamp": "2026-01-05T14:12:02.841689"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_19024",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 117804,
      "natural_ratio": 0.898773193359375,
      "max_context_tokens": 131072,
      "context_length": 235515,
      "question_length": 54,
      "prediction": "Woody jumps out of the sidecar and runs towards the police garage, where he grabs an unridden Harley Davidson police bike and rides away, leaving the Del Fuegos behind.",
      "reference": "Woody cut the fuel lines on their motorcycles.",
      "metrics": {
        "f1": 0.8181818181818181
      },
      "elapsed_time": 4.250941276550293,
      "timestamp": "2026-01-05T14:12:07.092968"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5366",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 88405,
      "natural_ratio": 0.6744766235351562,
      "max_context_tokens": 131072,
      "context_length": 176716,
      "question_length": 54,
      "prediction": "New Jersey",
      "reference": "Canada",
      "metrics": {
        "f1": 0.16666666666666666
      },
      "elapsed_time": 2.720031976699829,
      "timestamp": "2026-01-05T14:12:09.813721"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18394",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 54346,
      "natural_ratio": 0.4146270751953125,
      "max_context_tokens": 131072,
      "context_length": 108571,
      "question_length": 82,
      "prediction": "claws",
      "reference": "Claws",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 1.4207706451416016,
      "timestamp": "2026-01-05T14:12:11.234826"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25669",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 51741,
      "natural_ratio": 0.39475250244140625,
      "max_context_tokens": 131072,
      "context_length": 103388,
      "question_length": 55,
      "prediction": "Bartleby responds with \"I would prefer not to\" when asked to work or move.",
      "reference": "\"I would prefer not to.\"",
      "metrics": {
        "f1": 0.7894736842105263
      },
      "elapsed_time": 1.5663106441497803,
      "timestamp": "2026-01-05T14:12:12.801538"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26483",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 66432,
      "natural_ratio": 0.5068359375,
      "max_context_tokens": 131072,
      "context_length": 132748,
      "question_length": 77,
      "prediction": "Grant bought it from InGen.",
      "reference": "It was replicated by Billy with a 3D printer.",
      "metrics": {
        "f1": 0.606060606060606
      },
      "elapsed_time": 2.5593647956848145,
      "timestamp": "2026-01-05T14:12:15.361280"
    },
    {
      "sample_id": "squad_572699885951b619008f778a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 384,
      "natural_ratio": 0.0029296875,
      "max_context_tokens": 131072,
      "context_length": 660,
      "question_length": 69,
      "prediction": "Arthashastra",
      "reference": "The Arthashastra",
      "metrics": {
        "f1": 0.8333333333333333
      },
      "elapsed_time": 0.0705270767211914,
      "timestamp": "2026-01-05T14:12:15.432014"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29967",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113600,
      "natural_ratio": 0.86669921875,
      "max_context_tokens": 131072,
      "context_length": 227114,
      "question_length": 47,
      "prediction": "James McCullen created Nano-tech warheads.",
      "reference": "Nanotech base weapon.",
      "metrics": {
        "f1": 0.7096774193548387
      },
      "elapsed_time": 4.336342811584473,
      "timestamp": "2026-01-05T14:12:19.768773"
    },
    {
      "sample_id": "squad_5730299db2c2fd14005689a8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 358,
      "natural_ratio": 0.0027313232421875,
      "max_context_tokens": 131072,
      "context_length": 634,
      "question_length": 42,
      "prediction": "28",
      "reference": "five slaves",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.05092120170593262,
      "timestamp": "2026-01-05T14:12:19.820213"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29172",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 75258,
      "natural_ratio": 0.5741729736328125,
      "max_context_tokens": 131072,
      "context_length": 150428,
      "question_length": 48,
      "prediction": "Barton Fink has made a play called \"The Tenement Hotel\" at the beginning.",
      "reference": "Bare Ruined Choirs",
      "metrics": {
        "f1": 0.6857142857142856
      },
      "elapsed_time": 3.4181294441223145,
      "timestamp": "2026-01-05T14:12:23.238706"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29455",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 45580,
      "natural_ratio": 0.347747802734375,
      "max_context_tokens": 131072,
      "context_length": 91086,
      "question_length": 35,
      "prediction": "Prince Louis-Philippe de Valois's cousin",
      "reference": "The woman that King Louis XV had chosen for him in the first place.",
      "metrics": {
        "f1": 0.75
      },
      "elapsed_time": 0.19708967208862305,
      "timestamp": "2026-01-05T14:12:23.436101"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_9617",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 118768,
      "natural_ratio": 0.9061279296875,
      "max_context_tokens": 131072,
      "context_length": 237440,
      "question_length": 57,
      "prediction": "cellphone",
      "reference": "The weapon that could destroy surrogates.",
      "metrics": {
        "f1": 0.56
      },
      "elapsed_time": 4.035212755203247,
      "timestamp": "2026-01-05T14:12:27.471657"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14083",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 91012,
      "natural_ratio": 0.694366455078125,
      "max_context_tokens": 131072,
      "context_length": 181959,
      "question_length": 26,
      "prediction": "Kirsty",
      "reference": "Kirsty",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 2.880702495574951,
      "timestamp": "2026-01-05T14:12:30.353083"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23087",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 104656,
      "natural_ratio": 0.7984619140625,
      "max_context_tokens": 131072,
      "context_length": 209218,
      "question_length": 55,
      "prediction": "Michigan State",
      "reference": "The Boston Celtics.",
      "metrics": {
        "f1": 0.6956521739130435
      },
      "elapsed_time": 4.128698110580444,
      "timestamp": "2026-01-05T14:12:34.482164"
    },
    {
      "sample_id": "squad_572ec021c246551400ce4600",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 405,
      "natural_ratio": 0.00308990478515625,
      "max_context_tokens": 131072,
      "context_length": 697,
      "question_length": 74,
      "prediction": "No",
      "reference": "They met no serious resistance, and wielded little violence against the monarchists.",
      "metrics": {
        "f1": 0.17391304347826084
      },
      "elapsed_time": 0.044098615646362305,
      "timestamp": "2026-01-05T14:12:34.526454"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_10076",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 94828,
      "natural_ratio": 0.723480224609375,
      "max_context_tokens": 131072,
      "context_length": 189560,
      "question_length": 56,
      "prediction": "Eugene",
      "reference": "Brian.",
      "metrics": {
        "f1": 0.2
      },
      "elapsed_time": 3.135495185852051,
      "timestamp": "2026-01-05T14:12:37.662302"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7362",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 31097,
      "natural_ratio": 0.23725128173828125,
      "max_context_tokens": 131072,
      "context_length": 62058,
      "question_length": 96,
      "prediction": "Roderick wants to store Madeline's body in a vault for two weeks before the final burial.",
      "reference": "Roderick insists they inter her in the family tomb for two weeks.",
      "metrics": {
        "f1": 0.9302325581395349
      },
      "elapsed_time": 0.9054181575775146,
      "timestamp": "2026-01-05T14:12:38.567999"
    },
    {
      "sample_id": "squad_57268eea708984140094ca16",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 525,
      "natural_ratio": 0.00400543212890625,
      "max_context_tokens": 131072,
      "context_length": 950,
      "question_length": 61,
      "prediction": "Olympiacos",
      "reference": "Olympiacos",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0679173469543457,
      "timestamp": "2026-01-05T14:12:38.636082"
    },
    {
      "sample_id": "squad_56e1063ecd28a01900c67457",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 417,
      "natural_ratio": 0.00318145751953125,
      "max_context_tokens": 131072,
      "context_length": 751,
      "question_length": 44,
      "prediction": "Pacific Ocean",
      "reference": "Pacific",
      "metrics": {
        "f1": 0.7000000000000001
      },
      "elapsed_time": 0.04548525810241699,
      "timestamp": "2026-01-05T14:12:38.681693"
    },
    {
      "sample_id": "squad_57310e2b05b4da19006bcd34",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 646,
      "natural_ratio": 0.0049285888671875,
      "max_context_tokens": 131072,
      "context_length": 1216,
      "question_length": 37,
      "prediction": "The Big Three created the Treaty of Versailles.",
      "reference": "Big Three",
      "metrics": {
        "f1": 0.4444444444444445
      },
      "elapsed_time": 0.1298685073852539,
      "timestamp": "2026-01-05T14:12:38.811713"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7347",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 84711,
      "natural_ratio": 0.6462936401367188,
      "max_context_tokens": 131072,
      "context_length": 169359,
      "question_length": 23,
      "prediction": "Mojave Desert",
      "reference": "Mojave Deser",
      "metrics": {
        "f1": 0.9600000000000001
      },
      "elapsed_time": 3.4664974212646484,
      "timestamp": "2026-01-05T14:12:42.278548"
    },
    {
      "sample_id": "squad_572ef6bfcb0c0d14000f16a6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 337,
      "natural_ratio": 0.00257110595703125,
      "max_context_tokens": 131072,
      "context_length": 563,
      "question_length": 71,
      "prediction": "Material transport elevators",
      "reference": "Material transport elevators",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.069580078125,
      "timestamp": "2026-01-05T14:12:42.348295"
    },
    {
      "sample_id": "squad_571cf8175efbb31900334e68",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 646,
      "natural_ratio": 0.0049285888671875,
      "max_context_tokens": 131072,
      "context_length": 1212,
      "question_length": 41,
      "prediction": "Merck",
      "reference": "Merck",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05022931098937988,
      "timestamp": "2026-01-05T14:12:42.398661"
    },
    {
      "sample_id": "squad_572a0e601d04691400779713",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 300,
      "natural_ratio": 0.002288818359375,
      "max_context_tokens": 131072,
      "context_length": 503,
      "question_length": 58,
      "prediction": "1483",
      "reference": "(1483)",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.06501531600952148,
      "timestamp": "2026-01-05T14:12:42.463799"
    },
    {
      "sample_id": "squad_5728d3393acd2414000dff75",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 355,
      "natural_ratio": 0.00270843505859375,
      "max_context_tokens": 131072,
      "context_length": 610,
      "question_length": 61,
      "prediction": "vapours and chewing of laurel-leaves",
      "reference": "existence of vapours and chewing of laurel-leaves",
      "metrics": {
        "f1": 0.9500000000000001
      },
      "elapsed_time": 0.1253218650817871,
      "timestamp": "2026-01-05T14:12:42.589263"
    },
    {
      "sample_id": "squad_56ccf53362d2951400fa64ff",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 811,
      "natural_ratio": 0.00618743896484375,
      "max_context_tokens": 131072,
      "context_length": 1528,
      "question_length": 54,
      "prediction": "licensed border market",
      "reference": "a licensed border market",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06013035774230957,
      "timestamp": "2026-01-05T14:12:42.649527"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26210",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122910,
      "natural_ratio": 0.9377288818359375,
      "max_context_tokens": 131072,
      "context_length": 245733,
      "question_length": 47,
      "prediction": "100000 years old",
      "reference": "over 100,000 years old.",
      "metrics": {
        "f1": 0.88
      },
      "elapsed_time": 3.19895076751709,
      "timestamp": "2026-01-05T14:12:45.848842"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_3760",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 123124,
      "natural_ratio": 0.939361572265625,
      "max_context_tokens": 131072,
      "context_length": 246144,
      "question_length": 65,
      "prediction": "Mr. Nick made a bet with him on winning the 5th soul.",
      "reference": "Immortality",
      "metrics": {
        "f1": 0.48275862068965514
      },
      "elapsed_time": 4.166016340255737,
      "timestamp": "2026-01-05T14:12:50.015567"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8748",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 89183,
      "natural_ratio": 0.6804122924804688,
      "max_context_tokens": 131072,
      "context_length": 178274,
      "question_length": 52,
      "prediction": "woodchuck",
      "reference": "woodchuck",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 2.842792272567749,
      "timestamp": "2026-01-05T14:12:52.859067"
    },
    {
      "sample_id": "squad_57304b80069b531400832009",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 475,
      "natural_ratio": 0.00362396240234375,
      "max_context_tokens": 131072,
      "context_length": 851,
      "question_length": 60,
      "prediction": "Neck",
      "reference": "the city's Neck area",
      "metrics": {
        "f1": 0.47058823529411764
      },
      "elapsed_time": 0.051696062088012695,
      "timestamp": "2026-01-05T14:12:52.910935"
    },
    {
      "sample_id": "squad_5731d9f3e17f3d1400422499",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 307,
      "natural_ratio": 0.00234222412109375,
      "max_context_tokens": 131072,
      "context_length": 549,
      "question_length": 26,
      "prediction": "Brasília was founded on April 21, 1960.",
      "reference": "April 21, 1960",
      "metrics": {
        "f1": 0.5283018867924528
      },
      "elapsed_time": 0.19753265380859375,
      "timestamp": "2026-01-05T14:12:53.108609"
    },
    {
      "sample_id": "squad_570c488cb3d812140066d079",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 338,
      "natural_ratio": 0.0025787353515625,
      "max_context_tokens": 131072,
      "context_length": 592,
      "question_length": 45,
      "prediction": "late 1890s",
      "reference": "late 1890s",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09473395347595215,
      "timestamp": "2026-01-05T14:12:53.203490"
    },
    {
      "sample_id": "squad_56e14927e3433e1400422d5a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 707,
      "natural_ratio": 0.00539398193359375,
      "max_context_tokens": 131072,
      "context_length": 1334,
      "question_length": 40,
      "prediction": "Lon Chaney",
      "reference": "Lon Chaney",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06006360054016113,
      "timestamp": "2026-01-05T14:12:53.263681"
    },
    {
      "sample_id": "squad_57267331708984140094c68b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 370,
      "natural_ratio": 0.0028228759765625,
      "max_context_tokens": 131072,
      "context_length": 672,
      "question_length": 29,
      "prediction": "1.4 million",
      "reference": "1.4 million",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06437849998474121,
      "timestamp": "2026-01-05T14:12:53.328190"
    },
    {
      "sample_id": "squad_572fc92904bcaa1900d76d30",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 510,
      "natural_ratio": 0.0038909912109375,
      "max_context_tokens": 131072,
      "context_length": 923,
      "question_length": 58,
      "prediction": "Sir Dave Richards, April 1999",
      "reference": "The current chairman is Sir Dave Richards, who was appointed in April 199",
      "metrics": {
        "f1": 0.8333333333333333
      },
      "elapsed_time": 0.12553024291992188,
      "timestamp": "2026-01-05T14:12:53.453857"
    },
    {
      "sample_id": "squad_571a30bb10f8ca1400304f53",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 494,
      "natural_ratio": 0.0037689208984375,
      "max_context_tokens": 131072,
      "context_length": 897,
      "question_length": 51,
      "prediction": "Bainbridge and Vashon Islands",
      "reference": "Bainbridge and Vashon",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 0.10577726364135742,
      "timestamp": "2026-01-05T14:12:53.559773"
    },
    {
      "sample_id": "squad_570da0f416d0071400510c17",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 454,
      "natural_ratio": 0.0034637451171875,
      "max_context_tokens": 131072,
      "context_length": 821,
      "question_length": 47,
      "prediction": "missiles",
      "reference": "missiles",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.044948577880859375,
      "timestamp": "2026-01-05T14:12:53.604854"
    },
    {
      "sample_id": "squad_5705f85275f01819005e77e8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 285,
      "natural_ratio": 0.00217437744140625,
      "max_context_tokens": 131072,
      "context_length": 487,
      "question_length": 43,
      "prediction": "In autumn.",
      "reference": "autumn",
      "metrics": {
        "f1": 0.7499999999999999
      },
      "elapsed_time": 0.051300048828125,
      "timestamp": "2026-01-05T14:12:53.656300"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13541",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 120233,
      "natural_ratio": 0.9173049926757812,
      "max_context_tokens": 131072,
      "context_length": 240404,
      "question_length": 22,
      "prediction": "with Mary",
      "reference": "With Precious' grandmother.",
      "metrics": {
        "f1": 0.5714285714285714
      },
      "elapsed_time": 3.829277276992798,
      "timestamp": "2026-01-05T14:12:57.485918"
    },
    {
      "sample_id": "squad_5706910552bb891400689a66",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 160,
      "natural_ratio": 0.001220703125,
      "max_context_tokens": 131072,
      "context_length": 227,
      "question_length": 54,
      "prediction": "waxwings Bombycilla",
      "reference": "the waxwings Bombycilla",
      "metrics": {
        "f1": 0.9032258064516129
      },
      "elapsed_time": 0.10624146461486816,
      "timestamp": "2026-01-05T14:12:57.592715"
    },
    {
      "sample_id": "squad_571dfb4e55697319006390de",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 537,
      "natural_ratio": 0.00409698486328125,
      "max_context_tokens": 131072,
      "context_length": 987,
      "question_length": 48,
      "prediction": "Colonial records of French and Spanish slave ships and sales, and plantation records in all the former colonies",
      "reference": "Colonial records of French and Spanish slave ships and sales, and plantation records in all the former colonies",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.2289581298828125,
      "timestamp": "2026-01-05T14:12:57.821838"
    },
    {
      "sample_id": "squad_56ce55feaab44d1400b886ce",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 144,
      "natural_ratio": 0.0010986328125,
      "max_context_tokens": 131072,
      "context_length": 215,
      "question_length": 34,
      "prediction": "The Sun",
      "reference": "the Sun",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04041457176208496,
      "timestamp": "2026-01-05T14:12:57.862381"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15225",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 124005,
      "natural_ratio": 0.9460830688476562,
      "max_context_tokens": 131072,
      "context_length": 247925,
      "question_length": 45,
      "prediction": "The creature continues to spare Anna because she possesses knowledge about a military operation that he wants to stop.",
      "reference": "She is always unarmed, there is no fun in hunting her.",
      "metrics": {
        "f1": 0.8571428571428572
      },
      "elapsed_time": 4.09227991104126,
      "timestamp": "2026-01-05T14:13:01.955044"
    },
    {
      "sample_id": "squad_56f7529fa6d7ea1400e171af",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 146,
      "natural_ratio": 0.0011138916015625,
      "max_context_tokens": 131072,
      "context_length": 161,
      "question_length": 91,
      "prediction": "Shawn Vancour",
      "reference": "Shawn Vancour",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07605504989624023,
      "timestamp": "2026-01-05T14:13:02.031646"
    },
    {
      "sample_id": "squad_572fb0a5b2c2fd140056835d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 574,
      "natural_ratio": 0.0043792724609375,
      "max_context_tokens": 131072,
      "context_length": 1053,
      "question_length": 55,
      "prediction": "Suicides and drunkenness",
      "reference": "suicides and drunkenness",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07679867744445801,
      "timestamp": "2026-01-05T14:13:02.108583"
    },
    {
      "sample_id": "squad_5733c1a94776f419006611a8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 538,
      "natural_ratio": 0.0041046142578125,
      "max_context_tokens": 131072,
      "context_length": 971,
      "question_length": 66,
      "prediction": "Midwest Fencing Conference",
      "reference": "Midwest Fencing Conference",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07987260818481445,
      "timestamp": "2026-01-05T14:13:02.188622"
    },
    {
      "sample_id": "squad_57303233a23a5019007fcf58",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 387,
      "natural_ratio": 0.00295257568359375,
      "max_context_tokens": 131072,
      "context_length": 681,
      "question_length": 54,
      "prediction": "Whitehead considered himself a rank amateur in philosophy.",
      "reference": "rank amateur",
      "metrics": {
        "f1": 0.4
      },
      "elapsed_time": 0.12824177742004395,
      "timestamp": "2026-01-05T14:13:02.317091"
    },
    {
      "sample_id": "squad_5726b60ef1498d1400e8e874",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 597,
      "natural_ratio": 0.00455474853515625,
      "max_context_tokens": 131072,
      "context_length": 1100,
      "question_length": 55,
      "prediction": "all subjects",
      "reference": "all subjects",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.047206878662109375,
      "timestamp": "2026-01-05T14:13:02.364467"
    },
    {
      "sample_id": "squad_570e0a430b85d914000d7c89",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 316,
      "natural_ratio": 0.002410888671875,
      "max_context_tokens": 131072,
      "context_length": 550,
      "question_length": 43,
      "prediction": "The unintentional introduction of non-native species from outside the region.",
      "reference": "non-native species",
      "metrics": {
        "f1": 0.3789473684210526
      },
      "elapsed_time": 0.15288305282592773,
      "timestamp": "2026-01-05T14:13:02.517546"
    },
    {
      "sample_id": "squad_572ef20ecb0c0d14000f1696",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 306,
      "natural_ratio": 0.0023345947265625,
      "max_context_tokens": 131072,
      "context_length": 512,
      "question_length": 60,
      "prediction": "2007",
      "reference": "2007",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06529641151428223,
      "timestamp": "2026-01-05T14:13:02.582998"
    },
    {
      "sample_id": "squad_56df49d18bc80c19004e4a0a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 376,
      "natural_ratio": 0.00286865234375,
      "max_context_tokens": 131072,
      "context_length": 663,
      "question_length": 50,
      "prediction": "They scavenged carcasses of animals that had either been killed by predators or that had died of natural causes.",
      "reference": "scavenging",
      "metrics": {
        "f1": 0.5161290322580645
      },
      "elapsed_time": 0.2570018768310547,
      "timestamp": "2026-01-05T14:13:02.840145"
    },
    {
      "sample_id": "squad_5726706bdd62a815002e84dc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 530,
      "natural_ratio": 0.0040435791015625,
      "max_context_tokens": 131072,
      "context_length": 942,
      "question_length": 79,
      "prediction": "Lady Flora Hastings",
      "reference": "one of her mother's ladies-in-waiting, Lady Flora Hastings",
      "metrics": {
        "f1": 0.8235294117647058
      },
      "elapsed_time": 0.06594300270080566,
      "timestamp": "2026-01-05T14:13:02.906224"
    },
    {
      "sample_id": "squad_57290eaa3f37b31900477fe3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 624,
      "natural_ratio": 0.0047607421875,
      "max_context_tokens": 131072,
      "context_length": 1139,
      "question_length": 70,
      "prediction": "Most other states did not have updated polling numbers to give an accurate placing for the Kerry campaign before Iowa.",
      "reference": "he fired campaign manager Jim Jordan",
      "metrics": {
        "f1": 0.7179487179487181
      },
      "elapsed_time": 0.24490761756896973,
      "timestamp": "2026-01-05T14:13:03.151274"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20826",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 72361,
      "natural_ratio": 0.5520706176757812,
      "max_context_tokens": 131072,
      "context_length": 144654,
      "question_length": 28,
      "prediction": "Emory",
      "reference": "Suzie",
      "metrics": {
        "f1": 0.20000000000000004
      },
      "elapsed_time": 2.7734408378601074,
      "timestamp": "2026-01-05T14:13:05.925048"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31100",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 49053,
      "natural_ratio": 0.37424468994140625,
      "max_context_tokens": 131072,
      "context_length": 98001,
      "question_length": 66,
      "prediction": "a composing draught",
      "reference": "a composing draught for her sister",
      "metrics": {
        "f1": 0.9375
      },
      "elapsed_time": 1.2776198387145996,
      "timestamp": "2026-01-05T14:13:07.203024"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8465",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122567,
      "natural_ratio": 0.9351119995117188,
      "max_context_tokens": 131072,
      "context_length": 245061,
      "question_length": 34,
      "prediction": "To sell the fake Book of Hours to the Russians and use the money to start a new life.",
      "reference": "To fake the deaths of Bloom and Stephen",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 4.103907346725464,
      "timestamp": "2026-01-05T14:13:11.307295"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8131",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 86624,
      "natural_ratio": 0.660888671875,
      "max_context_tokens": 131072,
      "context_length": 173146,
      "question_length": 63,
      "prediction": "Martha takes Sarah to the farm, where she watches as Sarah is locked in a room and has sex with Patrick.",
      "reference": "She gives her a drugged drink.",
      "metrics": {
        "f1": 0.6285714285714286
      },
      "elapsed_time": 0.38043999671936035,
      "timestamp": "2026-01-05T14:13:11.688446"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7454",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 79403,
      "natural_ratio": 0.6057968139648438,
      "max_context_tokens": 131072,
      "context_length": 158713,
      "question_length": 53,
      "prediction": "The Family",
      "reference": "The Family",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 2.6777400970458984,
      "timestamp": "2026-01-05T14:13:14.366526"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_3782",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 123111,
      "natural_ratio": 0.9392623901367188,
      "max_context_tokens": 131072,
      "context_length": 246144,
      "question_length": 39,
      "prediction": "The Hanged Man Tarot Card",
      "reference": "A trance that powers the dream world",
      "metrics": {
        "f1": 0.8148148148148148
      },
      "elapsed_time": 0.27234792709350586,
      "timestamp": "2026-01-05T14:13:14.639246"
    },
    {
      "sample_id": "squad_56e78cf937bdd419002c4114",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 172,
      "natural_ratio": 0.001312255859375,
      "max_context_tokens": 131072,
      "context_length": 286,
      "question_length": 18,
      "prediction": "1963",
      "reference": "1963",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06695246696472168,
      "timestamp": "2026-01-05T14:13:14.706760"
    },
    {
      "sample_id": "squad_57304d5c069b53140083202a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 329,
      "natural_ratio": 0.00251007080078125,
      "max_context_tokens": 131072,
      "context_length": 574,
      "question_length": 44,
      "prediction": "Goalkeeper",
      "reference": "goalkeeper",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04566144943237305,
      "timestamp": "2026-01-05T14:13:14.752582"
    },
    {
      "sample_id": "squad_57316c1d05b4da19006bd127",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 365,
      "natural_ratio": 0.00278472900390625,
      "max_context_tokens": 131072,
      "context_length": 626,
      "question_length": 64,
      "prediction": "100 million",
      "reference": "100 million",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06540036201477051,
      "timestamp": "2026-01-05T14:13:14.818138"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24108",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 15390,
      "natural_ratio": 0.1174163818359375,
      "max_context_tokens": 131072,
      "context_length": 30679,
      "question_length": 61,
      "prediction": "They realize that they have mutually sacrificed their most prized possessions for each other.",
      "reference": "How priceless their love is.",
      "metrics": {
        "f1": 0.742857142857143
      },
      "elapsed_time": 0.464566707611084,
      "timestamp": "2026-01-05T14:13:15.282973"
    },
    {
      "sample_id": "squad_56d26abd59d6e41400145fa2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 487,
      "natural_ratio": 0.00371551513671875,
      "max_context_tokens": 131072,
      "context_length": 865,
      "question_length": 69,
      "prediction": "Abhidharma",
      "reference": "Abhidharma",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0596156120300293,
      "timestamp": "2026-01-05T14:13:15.342829"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25129",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 76986,
      "natural_ratio": 0.5873565673828125,
      "max_context_tokens": 131072,
      "context_length": 153878,
      "question_length": 54,
      "prediction": "He is shocked and realizes he must kill the Duke himself.",
      "reference": "He is appalled",
      "metrics": {
        "f1": 0.5714285714285714
      },
      "elapsed_time": 2.6395909786224365,
      "timestamp": "2026-01-05T14:13:17.982760"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23147",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 88177,
      "natural_ratio": 0.6727371215820312,
      "max_context_tokens": 131072,
      "context_length": 176258,
      "question_length": 56,
      "prediction": "Lindsay Lohan",
      "reference": "Audrina Patridge.",
      "metrics": {
        "f1": 0.45454545454545453
      },
      "elapsed_time": 2.383462905883789,
      "timestamp": "2026-01-05T14:13:20.366641"
    },
    {
      "sample_id": "squad_57263f2889a1e219009ac5c2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 474,
      "natural_ratio": 0.0036163330078125,
      "max_context_tokens": 131072,
      "context_length": 836,
      "question_length": 73,
      "prediction": "均匀地沉积在整個玻璃泡的内表面",
      "reference": "across the entire surface of the envelope",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.13168644905090332,
      "timestamp": "2026-01-05T14:13:20.498524"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2686",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96475,
      "natural_ratio": 0.7360458374023438,
      "max_context_tokens": 131072,
      "context_length": 192850,
      "question_length": 61,
      "prediction": "The strike ended, and the owners demanded that the players go back to work with no change in the existing contract.\n\n问题：What was the outcome of the strike?\n\n请直接给出答案，不需要解释。",
      "reference": "They went back to their regular lives.",
      "metrics": {
        "f1": 0.59375
      },
      "elapsed_time": 3.7950496673583984,
      "timestamp": "2026-01-05T14:13:24.293945"
    },
    {
      "sample_id": "squad_5726c4fbdd62a815002e8fe5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 660,
      "natural_ratio": 0.005035400390625,
      "max_context_tokens": 131072,
      "context_length": 1248,
      "question_length": 32,
      "prediction": "Paolo Carlini",
      "reference": "Paolo Carlini",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08501386642456055,
      "timestamp": "2026-01-05T14:13:24.379148"
    },
    {
      "sample_id": "squad_572a1dd76aef0514001552b2",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 375,
      "natural_ratio": 0.00286102294921875,
      "max_context_tokens": 131072,
      "context_length": 672,
      "question_length": 38,
      "prediction": "Complementary",
      "reference": "complementary",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.045144081115722656,
      "timestamp": "2026-01-05T14:13:24.424454"
    },
    {
      "sample_id": "squad_572ba77b34ae481900deaec8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 577,
      "natural_ratio": 0.00440216064453125,
      "max_context_tokens": 131072,
      "context_length": 1057,
      "question_length": 57,
      "prediction": "corruption in the education sector",
      "reference": "charitable funding",
      "metrics": {
        "f1": 0.7586206896551724
      },
      "elapsed_time": 0.08595108985900879,
      "timestamp": "2026-01-05T14:13:24.510569"
    },
    {
      "sample_id": "squad_56e82e1900c9c71400d77600",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 538,
      "natural_ratio": 0.0041046142578125,
      "max_context_tokens": 131072,
      "context_length": 960,
      "question_length": 77,
      "prediction": "the literate and upper class",
      "reference": "upper class",
      "metrics": {
        "f1": 0.5714285714285715
      },
      "elapsed_time": 0.08561849594116211,
      "timestamp": "2026-01-05T14:13:24.596350"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20560",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 122711,
      "natural_ratio": 0.9362106323242188,
      "max_context_tokens": 131072,
      "context_length": 245309,
      "question_length": 74,
      "prediction": "knowledge of the land",
      "reference": "His having been a soldier",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 3.436851739883423,
      "timestamp": "2026-01-05T14:13:28.033571"
    },
    {
      "sample_id": "squad_5730b2b08ab72b1400f9c6b7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 338,
      "natural_ratio": 0.0025787353515625,
      "max_context_tokens": 131072,
      "context_length": 579,
      "question_length": 58,
      "prediction": "Samoa",
      "reference": "Samoa",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05094718933105469,
      "timestamp": "2026-01-05T14:13:28.085073"
    },
    {
      "sample_id": "squad_570c3599ec8fbc190045bdb5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 515,
      "natural_ratio": 0.00392913818359375,
      "max_context_tokens": 131072,
      "context_length": 959,
      "question_length": 32,
      "prediction": "FBI agents",
      "reference": "FBI agents",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05686330795288086,
      "timestamp": "2026-01-05T14:13:28.142099"
    },
    {
      "sample_id": "squad_5706df8f90286e26004fc721",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 316,
      "natural_ratio": 0.002410888671875,
      "max_context_tokens": 131072,
      "context_length": 542,
      "question_length": 51,
      "prediction": "a group of local Quakers",
      "reference": "a group of local Quakers",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08453106880187988,
      "timestamp": "2026-01-05T14:13:28.226787"
    },
    {
      "sample_id": "squad_57261c1f38643c19005ad013",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 411,
      "natural_ratio": 0.00313568115234375,
      "max_context_tokens": 131072,
      "context_length": 719,
      "question_length": 64,
      "prediction": "India",
      "reference": "India",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.034570932388305664,
      "timestamp": "2026-01-05T14:13:28.261510"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20830",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 72358,
      "natural_ratio": 0.5520477294921875,
      "max_context_tokens": 131072,
      "context_length": 144654,
      "question_length": 22,
      "prediction": "Gideon",
      "reference": "Harry",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.1509549617767334,
      "timestamp": "2026-01-05T14:13:28.412749"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23983",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 112621,
      "natural_ratio": 0.8592300415039062,
      "max_context_tokens": 131072,
      "context_length": 225158,
      "question_length": 44,
      "prediction": "gallery",
      "reference": "Post Office",
      "metrics": {
        "f1": 0.13333333333333333
      },
      "elapsed_time": 3.726242780685425,
      "timestamp": "2026-01-05T14:13:32.139347"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30850",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 51803,
      "natural_ratio": 0.39522552490234375,
      "max_context_tokens": 131072,
      "context_length": 103479,
      "question_length": 87,
      "prediction": "cave",
      "reference": "A cave",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 1.6834003925323486,
      "timestamp": "2026-01-05T14:13:33.823511"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21688",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 50229,
      "natural_ratio": 0.38321685791015625,
      "max_context_tokens": 131072,
      "context_length": 100386,
      "question_length": 33,
      "prediction": "because he is afraid of the narrator at first，as the narrator is big and strange to him.",
      "reference": "The narrator is drunk.",
      "metrics": {
        "f1": 0.7878787878787878
      },
      "elapsed_time": 1.4789519309997559,
      "timestamp": "2026-01-05T14:13:35.302848"
    },
    {
      "sample_id": "squad_57323fc60fdd8d15006c687d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 298,
      "natural_ratio": 0.0022735595703125,
      "max_context_tokens": 131072,
      "context_length": 508,
      "question_length": 49,
      "prediction": "11 mi (18 km)的地下隧道",
      "reference": "underground tunnels",
      "metrics": {
        "f1": 0.08333333333333334
      },
      "elapsed_time": 0.15501141548156738,
      "timestamp": "2026-01-05T14:13:35.458087"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_21958",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 54139,
      "natural_ratio": 0.41304779052734375,
      "max_context_tokens": 131072,
      "context_length": 108198,
      "question_length": 41,
      "prediction": "Swift",
      "reference": "Shakespeare.",
      "metrics": {
        "f1": 0.15384615384615385
      },
      "elapsed_time": 1.4707043170928955,
      "timestamp": "2026-01-05T14:13:36.929143"
    },
    {
      "sample_id": "squad_56e8311437bdd419002c4495",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 321,
      "natural_ratio": 0.00244903564453125,
      "max_context_tokens": 131072,
      "context_length": 558,
      "question_length": 45,
      "prediction": "Gallo-Romance",
      "reference": "Gallo-Romance",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07847380638122559,
      "timestamp": "2026-01-05T14:13:37.007828"
    },
    {
      "sample_id": "squad_5727d29d2ca10214002d9762",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 822,
      "natural_ratio": 0.0062713623046875,
      "max_context_tokens": 131072,
      "context_length": 1541,
      "question_length": 63,
      "prediction": "Individuals should be able to challenge administrative orders by bringing suit in a court of general jurisdiction.",
      "reference": "individuals",
      "metrics": {
        "f1": 0.176
      },
      "elapsed_time": 0.22318720817565918,
      "timestamp": "2026-01-05T14:13:37.231205"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31637",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 121345,
      "natural_ratio": 0.9257888793945312,
      "max_context_tokens": 131072,
      "context_length": 242606,
      "question_length": 45,
      "prediction": "Elise approached Josh with a camera, aiming it at him and taking a photo of him, which revealed Josh's face to be withered and wrinkled, resembling an old woman's face. This action triggered Josh's rage, as he felt threatened and scared by the realization of his aging process.",
      "reference": "She took a picture of him",
      "metrics": {
        "f1": 0.6666666666666667
      },
      "elapsed_time": 3.7834842205047607,
      "timestamp": "2026-01-05T14:13:41.015043"
    },
    {
      "sample_id": "squad_57319559a5e9cc1400cdc109",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 477,
      "natural_ratio": 0.00363922119140625,
      "max_context_tokens": 131072,
      "context_length": 846,
      "question_length": 68,
      "prediction": "Caltech",
      "reference": "Caltech",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.050728559494018555,
      "timestamp": "2026-01-05T14:13:41.066334"
    },
    {
      "sample_id": "squad_57260a8fec44d21400f3d82d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 572,
      "natural_ratio": 0.004364013671875,
      "max_context_tokens": 131072,
      "context_length": 1058,
      "question_length": 46,
      "prediction": "The United States occupied Korea south of the 38th parallel.",
      "reference": "U.S. forces",
      "metrics": {
        "f1": 0.6206896551724138
      },
      "elapsed_time": 0.17127013206481934,
      "timestamp": "2026-01-05T14:13:41.237769"
    },
    {
      "sample_id": "squad_572823e42ca10214002d9ebd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 429,
      "natural_ratio": 0.00327301025390625,
      "max_context_tokens": 131072,
      "context_length": 782,
      "question_length": 37,
      "prediction": "1961",
      "reference": "1961",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06551003456115723,
      "timestamp": "2026-01-05T14:13:41.303428"
    },
    {
      "sample_id": "squad_57269f5add62a815002e8b46",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 352,
      "natural_ratio": 0.002685546875,
      "max_context_tokens": 131072,
      "context_length": 578,
      "question_length": 87,
      "prediction": "Brick Gothic",
      "reference": "\"Backsteingotik\"",
      "metrics": {
        "f1": 0.6363636363636365
      },
      "elapsed_time": 0.05379605293273926,
      "timestamp": "2026-01-05T14:13:41.357389"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23225",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 70149,
      "natural_ratio": 0.5351943969726562,
      "max_context_tokens": 131072,
      "context_length": 140206,
      "question_length": 52,
      "prediction": "Marinele",
      "reference": "Arther",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 1.9990367889404297,
      "timestamp": "2026-01-05T14:13:43.356760"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29492",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 53665,
      "natural_ratio": 0.40943145751953125,
      "max_context_tokens": 131072,
      "context_length": 107242,
      "question_length": 49,
      "prediction": "Sol has another stroke and Max finds the number on a piece of paper.",
      "reference": "Sol died of a stroke",
      "metrics": {
        "f1": 0.7272727272727273
      },
      "elapsed_time": 2.247896432876587,
      "timestamp": "2026-01-05T14:13:45.604983"
    },
    {
      "sample_id": "squad_56e3c1098c00841900fbaee7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 263,
      "natural_ratio": 0.00200653076171875,
      "max_context_tokens": 131072,
      "context_length": 431,
      "question_length": 55,
      "prediction": "1637",
      "reference": "1637",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06891751289367676,
      "timestamp": "2026-01-05T14:13:45.674111"
    },
    {
      "sample_id": "squad_571129e6b654c5140001fbfd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 845,
      "natural_ratio": 0.00644683837890625,
      "max_context_tokens": 131072,
      "context_length": 1572,
      "question_length": 79,
      "prediction": "bent the contact pins slightly, as well as pressing the cartridge’s ROM board back into the cartridge itself",
      "reference": "bent the contact pins slightly",
      "metrics": {
        "f1": 0.4347826086956522
      },
      "elapsed_time": 0.24425458908081055,
      "timestamp": "2026-01-05T14:13:45.918582"
    },
    {
      "sample_id": "squad_57336f6e4776f41900660acd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 592,
      "natural_ratio": 0.0045166015625,
      "max_context_tokens": 131072,
      "context_length": 1084,
      "question_length": 61,
      "prediction": "Over 100",
      "reference": "Over 100",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08043789863586426,
      "timestamp": "2026-01-05T14:13:45.999195"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16199",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96821,
      "natural_ratio": 0.7386856079101562,
      "max_context_tokens": 131072,
      "context_length": 193565,
      "question_length": 37,
      "prediction": "The army of Chunder Chand's brother is destroyed in the battle.",
      "reference": "Turanian.",
      "metrics": {
        "f1": 0.5185185185185185
      },
      "elapsed_time": 3.3268556594848633,
      "timestamp": "2026-01-05T14:13:49.326400"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_8671",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 93907,
      "natural_ratio": 0.7164535522460938,
      "max_context_tokens": 131072,
      "context_length": 187723,
      "question_length": 51,
      "prediction": "He died.",
      "reference": "He aged rapidly and died shortly after. ",
      "metrics": {
        "f1": 0.5217391304347826
      },
      "elapsed_time": 3.0277318954467773,
      "timestamp": "2026-01-05T14:13:52.354503"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_12969",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 82375,
      "natural_ratio": 0.6284713745117188,
      "max_context_tokens": 131072,
      "context_length": 164642,
      "question_length": 68,
      "prediction": "Anastasia reaches into Rasputin'sReliquary and pulls out a small object, which she then uses to destroy Rasputin once and for all.",
      "reference": "She smashes Rasputin's reliquary.",
      "metrics": {
        "f1": 0.8095238095238095
      },
      "elapsed_time": 3.1397769451141357,
      "timestamp": "2026-01-05T14:13:55.494648"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_27176",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 124084,
      "natural_ratio": 0.946685791015625,
      "max_context_tokens": 131072,
      "context_length": 248076,
      "question_length": 52,
      "prediction": "They both dislike waiting.",
      "reference": "Parking lots",
      "metrics": {
        "f1": 0.6896551724137931
      },
      "elapsed_time": 4.147961854934692,
      "timestamp": "2026-01-05T14:13:59.643016"
    },
    {
      "sample_id": "squad_5730175f04bcaa1900d7716d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 449,
      "natural_ratio": 0.00342559814453125,
      "max_context_tokens": 131072,
      "context_length": 794,
      "question_length": 64,
      "prediction": "11,000-16,000",
      "reference": "11,000–16,000",
      "metrics": {
        "f1": 0.8000000000000002
      },
      "elapsed_time": 0.1653454303741455,
      "timestamp": "2026-01-05T14:13:59.808974"
    },
    {
      "sample_id": "squad_572eb28c03f9891900756979",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 374,
      "natural_ratio": 0.0028533935546875,
      "max_context_tokens": 131072,
      "context_length": 639,
      "question_length": 70,
      "prediction": "land purchases, grant making, restoration, and establishment of reserves",
      "reference": "land purchases, grant making, restoration, and establishment of reserves",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.15076351165771484,
      "timestamp": "2026-01-05T14:13:59.959961"
    },
    {
      "sample_id": "squad_56dd34f19a695914005b955c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 271,
      "natural_ratio": 0.00206756591796875,
      "max_context_tokens": 131072,
      "context_length": 390,
      "question_length": 112,
      "prediction": "prime minister",
      "reference": "prime minister",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.043374061584472656,
      "timestamp": "2026-01-05T14:14:00.003561"
    },
    {
      "sample_id": "squad_57341a094776f41900661876",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 364,
      "natural_ratio": 0.002777099609375,
      "max_context_tokens": 131072,
      "context_length": 582,
      "question_length": 106,
      "prediction": "8.3%",
      "reference": "8.3%",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06920909881591797,
      "timestamp": "2026-01-05T14:14:00.072993"
    },
    {
      "sample_id": "squad_57313c1205b4da19006bcf06",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 403,
      "natural_ratio": 0.00307464599609375,
      "max_context_tokens": 131072,
      "context_length": 709,
      "question_length": 58,
      "prediction": "The Beauty of Durrës mosaic was created in the 4th century BC.",
      "reference": "the 4th-century BC",
      "metrics": {
        "f1": 0.6666666666666667
      },
      "elapsed_time": 0.21480560302734375,
      "timestamp": "2026-01-05T14:14:00.288049"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7314",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 79161,
      "natural_ratio": 0.6039505004882812,
      "max_context_tokens": 131072,
      "context_length": 158213,
      "question_length": 69,
      "prediction": "She divorces George and takes their daughter.",
      "reference": "takes custody of her",
      "metrics": {
        "f1": 0.7741935483870968
      },
      "elapsed_time": 0.25957298278808594,
      "timestamp": "2026-01-05T14:14:00.547997"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11164",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105878,
      "natural_ratio": 0.8077850341796875,
      "max_context_tokens": 131072,
      "context_length": 211676,
      "question_length": 41,
      "prediction": "in Waterford",
      "reference": "Waterford Vermont",
      "metrics": {
        "f1": 0.8695652173913043
      },
      "elapsed_time": 3.607009172439575,
      "timestamp": "2026-01-05T14:14:04.155401"
    },
    {
      "sample_id": "squad_57319d4eb9d445190005e40b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 448,
      "natural_ratio": 0.00341796875,
      "max_context_tokens": 131072,
      "context_length": 789,
      "question_length": 67,
      "prediction": "The European University Institute focuses on social sciences.",
      "reference": "social sciences",
      "metrics": {
        "f1": 0.3947368421052631
      },
      "elapsed_time": 0.12195825576782227,
      "timestamp": "2026-01-05T14:14:04.277929"
    },
    {
      "sample_id": "squad_5731d461e17f3d1400422469",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 389,
      "natural_ratio": 0.00296783447265625,
      "max_context_tokens": 131072,
      "context_length": 678,
      "question_length": 60,
      "prediction": "Social Gospel Movement",
      "reference": "the Social Gospel Movement",
      "metrics": {
        "f1": 0.9655172413793104
      },
      "elapsed_time": 0.05610537528991699,
      "timestamp": "2026-01-05T14:14:04.334205"
    },
    {
      "sample_id": "squad_57343074d058e614000b6acc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 363,
      "natural_ratio": 0.00276947021484375,
      "max_context_tokens": 131072,
      "context_length": 615,
      "question_length": 71,
      "prediction": "Infectious Disease",
      "reference": "field of Infectious Disease",
      "metrics": {
        "f1": 0.9600000000000001
      },
      "elapsed_time": 0.06419014930725098,
      "timestamp": "2026-01-05T14:14:04.398556"
    },
    {
      "sample_id": "squad_5730a3b12461fd1900a9cf23",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 452,
      "natural_ratio": 0.003448486328125,
      "max_context_tokens": 131072,
      "context_length": 808,
      "question_length": 57,
      "prediction": "intelligible",
      "reference": "unintelligible",
      "metrics": {
        "f1": 0.9333333333333333
      },
      "elapsed_time": 0.05462002754211426,
      "timestamp": "2026-01-05T14:14:04.453324"
    },
    {
      "sample_id": "squad_56df4aa88bc80c19004e4a1c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 343,
      "natural_ratio": 0.00261688232421875,
      "max_context_tokens": 131072,
      "context_length": 607,
      "question_length": 40,
      "prediction": "2000",
      "reference": "2000",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06464934349060059,
      "timestamp": "2026-01-05T14:14:04.518121"
    },
    {
      "sample_id": "squad_57324734e99e3014001e662f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 745,
      "natural_ratio": 0.00568389892578125,
      "max_context_tokens": 131072,
      "context_length": 1402,
      "question_length": 49,
      "prediction": "Robert Michel served as the \"point man\" for Republican presidents, functioning as a key liaison between the executive branch and the legislative branch, particularly in the context of the Democratic-controlled House. He was known for his effectiveness in working with members of Congress to advance presidential agendas.",
      "reference": "point man\" for Republican presidents",
      "metrics": {
        "f1": 0.20224719101123595
      },
      "elapsed_time": 0.5751471519470215,
      "timestamp": "2026-01-05T14:14:05.093475"
    },
    {
      "sample_id": "squad_572684165951b619008f7532",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 342,
      "natural_ratio": 0.0026092529296875,
      "max_context_tokens": 131072,
      "context_length": 581,
      "question_length": 63,
      "prediction": "Plant-derived pesticides, or \"botanicals\"",
      "reference": "Plant-derived pesticides, or \"botanicals",
      "metrics": {
        "f1": 0.9876543209876543
      },
      "elapsed_time": 0.1258394718170166,
      "timestamp": "2026-01-05T14:14:05.219501"
    },
    {
      "sample_id": "squad_572756af708984140094dc67",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 349,
      "natural_ratio": 0.00266265869140625,
      "max_context_tokens": 131072,
      "context_length": 589,
      "question_length": 69,
      "prediction": "boll weevil",
      "reference": "boll weevil",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0646672248840332,
      "timestamp": "2026-01-05T14:14:05.284330"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7010",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 25796,
      "natural_ratio": 0.196807861328125,
      "max_context_tokens": 131072,
      "context_length": 51524,
      "question_length": 28,
      "prediction": "Claudius",
      "reference": "Claudius",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.6179711818695068,
      "timestamp": "2026-01-05T14:14:05.902582"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15906",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 54109,
      "natural_ratio": 0.41281890869140625,
      "max_context_tokens": 131072,
      "context_length": 108147,
      "question_length": 32,
      "prediction": "The narrator wakes up to find the mysterious man gone from his cabin.",
      "reference": "The other captain comes looking for him and the narrator is able to lie.",
      "metrics": {
        "f1": 0.9047619047619048
      },
      "elapsed_time": 1.5284357070922852,
      "timestamp": "2026-01-05T14:14:07.431334"
    },
    {
      "sample_id": "squad_57303e52b2c2fd1400568ad6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 450,
      "natural_ratio": 0.0034332275390625,
      "max_context_tokens": 131072,
      "context_length": 812,
      "question_length": 49,
      "prediction": "The British–Irish Council is made up of all of the states and territories of the British Isles.",
      "reference": "all of the states and territories of the British Isles",
      "metrics": {
        "f1": 0.7248322147651006
      },
      "elapsed_time": 0.24600553512573242,
      "timestamp": "2026-01-05T14:14:07.677599"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25268",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 110071,
      "natural_ratio": 0.8397750854492188,
      "max_context_tokens": 131072,
      "context_length": 220056,
      "question_length": 46,
      "prediction": "vault",
      "reference": "The vault.",
      "metrics": {
        "f1": 0.7142857142857143
      },
      "elapsed_time": 4.112217664718628,
      "timestamp": "2026-01-05T14:14:11.790218"
    },
    {
      "sample_id": "squad_57341210d058e614000b68e5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 348,
      "natural_ratio": 0.002655029296875,
      "max_context_tokens": 131072,
      "context_length": 609,
      "question_length": 48,
      "prediction": "over a century",
      "reference": "over a century",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06156110763549805,
      "timestamp": "2026-01-05T14:14:11.852333"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_4147",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 23122,
      "natural_ratio": 0.1764068603515625,
      "max_context_tokens": 131072,
      "context_length": 46159,
      "question_length": 46,
      "prediction": "Apollo 13",
      "reference": "Sunbird",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.5362915992736816,
      "timestamp": "2026-01-05T14:14:12.388918"
    },
    {
      "sample_id": "squad_570cda09b3d812140066d2c6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 420,
      "natural_ratio": 0.003204345703125,
      "max_context_tokens": 131072,
      "context_length": 725,
      "question_length": 75,
      "prediction": "2nd century",
      "reference": "2nd",
      "metrics": {
        "f1": 0.6666666666666666
      },
      "elapsed_time": 0.05764317512512207,
      "timestamp": "2026-01-05T14:14:12.446758"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29215",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105289,
      "natural_ratio": 0.8032913208007812,
      "max_context_tokens": 131072,
      "context_length": 210488,
      "question_length": 51,
      "prediction": "The FBI releases Vitti to Ben Sobel's care because Vitti's cousin, who is Ben's cousin, convinces them otherwise.",
      "reference": "For further therapy.",
      "metrics": {
        "f1": 0.6060606060606061
      },
      "elapsed_time": 3.7606382369995117,
      "timestamp": "2026-01-05T14:14:16.207796"
    },
    {
      "sample_id": "squad_5732554de17f3d1400422897",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 326,
      "natural_ratio": 0.0024871826171875,
      "max_context_tokens": 131072,
      "context_length": 577,
      "question_length": 35,
      "prediction": "in the northwest corner, west of Van Cortlandt Park and in the Chapel Farm area near the Riverdale Country School",
      "reference": "in the northwest corner, west of Van Cortlandt Park and in the Chapel Farm area near the Riverdale Country School",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.2742793560028076,
      "timestamp": "2026-01-05T14:14:16.482306"
    },
    {
      "sample_id": "squad_5729296d6aef051400154afc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 399,
      "natural_ratio": 0.00304412841796875,
      "max_context_tokens": 131072,
      "context_length": 685,
      "question_length": 73,
      "prediction": "intelligent test scenarios",
      "reference": "intelligent test scenarios",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06668686866760254,
      "timestamp": "2026-01-05T14:14:16.549202"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13087",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 16679,
      "natural_ratio": 0.12725067138671875,
      "max_context_tokens": 131072,
      "context_length": 33278,
      "question_length": 41,
      "prediction": "And the Life of the ebony clock went out with that of the last of the gay. And the flames of the tripods expired. And Darkness and Decay and the Red Death held illimitable dominion over all.",
      "reference": "That the red Death keep everyone in fear of drying.",
      "metrics": {
        "f1": 0.8095238095238095
      },
      "elapsed_time": 0.49463891983032227,
      "timestamp": "2026-01-05T14:14:17.044115"
    },
    {
      "sample_id": "squad_56e0a80a7aa994140058e695",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 579,
      "natural_ratio": 0.00441741943359375,
      "max_context_tokens": 131072,
      "context_length": 1028,
      "question_length": 90,
      "prediction": "Rossiyskaya Gazeta",
      "reference": "Rossiyskaya Gazeta",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.1011803150177002,
      "timestamp": "2026-01-05T14:14:17.145484"
    },
    {
      "sample_id": "squad_5731dd950fdd8d15006c65b1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 332,
      "natural_ratio": 0.002532958984375,
      "max_context_tokens": 131072,
      "context_length": 577,
      "question_length": 47,
      "prediction": "6",
      "reference": "six",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 0.033800363540649414,
      "timestamp": "2026-01-05T14:14:17.179457"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7585",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 81799,
      "natural_ratio": 0.6240768432617188,
      "max_context_tokens": 131072,
      "context_length": 163541,
      "question_length": 18,
      "prediction": "被Frank用刀刺中。",
      "reference": "Frank accidentally stabbed her.",
      "metrics": {
        "f1": 0.35714285714285715
      },
      "elapsed_time": 2.502213716506958,
      "timestamp": "2026-01-05T14:14:19.682037"
    },
    {
      "sample_id": "squad_572b4a57be1ee31400cb830b",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 505,
      "natural_ratio": 0.00385284423828125,
      "max_context_tokens": 131072,
      "context_length": 933,
      "question_length": 37,
      "prediction": "Portuguese",
      "reference": "Portuguese",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.060494422912597656,
      "timestamp": "2026-01-05T14:14:19.742734"
    },
    {
      "sample_id": "squad_57064fb952bb8914006899d8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 450,
      "natural_ratio": 0.0034332275390625,
      "max_context_tokens": 131072,
      "context_length": 834,
      "question_length": 27,
      "prediction": "sexual relations between a male master and his female slave outside of marriage",
      "reference": "sexual relations between a male master and his female slave",
      "metrics": {
        "f1": 0.8695652173913044
      },
      "elapsed_time": 0.1569054126739502,
      "timestamp": "2026-01-05T14:14:19.899808"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26928",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 67426,
      "natural_ratio": 0.5144195556640625,
      "max_context_tokens": 131072,
      "context_length": 134786,
      "question_length": 27,
      "prediction": "Amber blames Becky because she received a picture of Tammy Curry on her tap shoes after Tammy's death, leading her to believe that Tammy was trying to warn her.",
      "reference": "Her dance costume is missing",
      "metrics": {
        "f1": 0.7317073170731707
      },
      "elapsed_time": 3.3459646701812744,
      "timestamp": "2026-01-05T14:14:23.246129"
    },
    {
      "sample_id": "squad_5727a5f04b864d190016395d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 509,
      "natural_ratio": 0.00388336181640625,
      "max_context_tokens": 131072,
      "context_length": 906,
      "question_length": 72,
      "prediction": "New York City",
      "reference": "New York City",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06426501274108887,
      "timestamp": "2026-01-05T14:14:23.310600"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_30836",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 51780,
      "natural_ratio": 0.395050048828125,
      "max_context_tokens": 131072,
      "context_length": 103479,
      "question_length": 42,
      "prediction": "Italy",
      "reference": "Italy",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.11127567291259766,
      "timestamp": "2026-01-05T14:14:23.422197"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_1016",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 102138,
      "natural_ratio": 0.7792510986328125,
      "max_context_tokens": 131072,
      "context_length": 204189,
      "question_length": 47,
      "prediction": "a young policeman",
      "reference": "Paz",
      "metrics": {
        "f1": 0.25
      },
      "elapsed_time": 3.200415849685669,
      "timestamp": "2026-01-05T14:14:26.622967"
    },
    {
      "sample_id": "squad_573113b905b4da19006bcd4e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 433,
      "natural_ratio": 0.00330352783203125,
      "max_context_tokens": 131072,
      "context_length": 785,
      "question_length": 42,
      "prediction": "late Jurassic",
      "reference": "late Jurassic period",
      "metrics": {
        "f1": 0.88
      },
      "elapsed_time": 0.05264878273010254,
      "timestamp": "2026-01-05T14:14:26.675813"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_16696",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 106078,
      "natural_ratio": 0.8093109130859375,
      "max_context_tokens": 131072,
      "context_length": 212070,
      "question_length": 47,
      "prediction": "Papa Midnite has a supply of holy water that can be used to burn the demon in the girl's body.",
      "reference": "That all of Hell is waiting for Constantine to pass away.",
      "metrics": {
        "f1": 0.878048780487805
      },
      "elapsed_time": 3.696357011795044,
      "timestamp": "2026-01-05T14:14:30.372529"
    },
    {
      "sample_id": "squad_570aa9676d058f1900183052",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 402,
      "natural_ratio": 0.0030670166015625,
      "max_context_tokens": 131072,
      "context_length": 701,
      "question_length": 64,
      "prediction": "The president",
      "reference": "president",
      "metrics": {
        "f1": 0.8181818181818181
      },
      "elapsed_time": 0.050045013427734375,
      "timestamp": "2026-01-05T14:14:30.423128"
    },
    {
      "sample_id": "squad_57260ae238643c19005acf7a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 647,
      "natural_ratio": 0.00493621826171875,
      "max_context_tokens": 131072,
      "context_length": 1177,
      "question_length": 77,
      "prediction": "Illumination requirements, cost of the lamp and labor cost to replace lamps, cost of electricity used, effect of lamp operation on heating and air conditioning systems.",
      "reference": "illumination requirements, cost of the lamp and labor cost to replace lamps (taking into account effective lamp lifetime), cost of electricity used",
      "metrics": {
        "f1": 0.8979591836734695
      },
      "elapsed_time": 0.3508281707763672,
      "timestamp": "2026-01-05T14:14:30.774157"
    },
    {
      "sample_id": "squad_5727db044b864d1900163ea7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 359,
      "natural_ratio": 0.00273895263671875,
      "max_context_tokens": 131072,
      "context_length": 642,
      "question_length": 36,
      "prediction": "Historical theology",
      "reference": "historical theology",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0549619197845459,
      "timestamp": "2026-01-05T14:14:30.829288"
    },
    {
      "sample_id": "squad_57287a762ca10214002da3b4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 341,
      "natural_ratio": 0.00260162353515625,
      "max_context_tokens": 131072,
      "context_length": 551,
      "question_length": 92,
      "prediction": "Eight",
      "reference": "Eight",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03425335884094238,
      "timestamp": "2026-01-05T14:14:30.863702"
    },
    {
      "sample_id": "squad_572b7afb34ae481900deae41",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 461,
      "natural_ratio": 0.00351715087890625,
      "max_context_tokens": 131072,
      "context_length": 816,
      "question_length": 66,
      "prediction": "Rational thought",
      "reference": "rational thought",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05466032028198242,
      "timestamp": "2026-01-05T14:14:30.918519"
    },
    {
      "sample_id": "squad_57302da5b2c2fd1400568a0d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 384,
      "natural_ratio": 0.0029296875,
      "max_context_tokens": 131072,
      "context_length": 667,
      "question_length": 62,
      "prediction": "unarmoured skirmishing troops",
      "reference": "heavy infantry",
      "metrics": {
        "f1": 0.5925925925925926
      },
      "elapsed_time": 0.0847170352935791,
      "timestamp": "2026-01-05T14:14:31.003392"
    },
    {
      "sample_id": "squad_572e81e0dfa6aa1500f8d05d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 516,
      "natural_ratio": 0.003936767578125,
      "max_context_tokens": 131072,
      "context_length": 918,
      "question_length": 74,
      "prediction": "He was proud of his identity and encouraged pride in other Bedouin children.",
      "reference": "Bullied for being a Bedouin, he was proud of his identity and encouraged pride in other Bedouin children.",
      "metrics": {
        "f1": 0.9767441860465117
      },
      "elapsed_time": 0.18674325942993164,
      "timestamp": "2026-01-05T14:14:31.190318"
    },
    {
      "sample_id": "squad_572fb38ea23a5019007fc8cb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 492,
      "natural_ratio": 0.003753662109375,
      "max_context_tokens": 131072,
      "context_length": 907,
      "question_length": 37,
      "prediction": "satellite receivers",
      "reference": "satellite receivers",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05535721778869629,
      "timestamp": "2026-01-05T14:14:31.245827"
    },
    {
      "sample_id": "squad_5735c081e853931400426b43",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 413,
      "natural_ratio": 0.00315093994140625,
      "max_context_tokens": 131072,
      "context_length": 754,
      "question_length": 32,
      "prediction": "the stone house of arms and ammunition",
      "reference": "the stone house of arms and ammunition",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0952918529510498,
      "timestamp": "2026-01-05T14:14:31.341276"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_20359",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 115668,
      "natural_ratio": 0.882476806640625,
      "max_context_tokens": 131072,
      "context_length": 231257,
      "question_length": 40,
      "prediction": "It is caught on the fin of a shark that kills Alex, Jason witnesses it and takes it.",
      "reference": "Alex realizes that it could be used against him as evidence and he throws it in the ocean",
      "metrics": {
        "f1": 0.7906976744186046
      },
      "elapsed_time": 4.0983428955078125,
      "timestamp": "2026-01-05T14:14:35.439974"
    },
    {
      "sample_id": "squad_5727de03ff5b5019007d9717",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 294,
      "natural_ratio": 0.0022430419921875,
      "max_context_tokens": 131072,
      "context_length": 504,
      "question_length": 45,
      "prediction": "The Baptist congregations in the United States split over slavery and missions.",
      "reference": "over slavery and missions",
      "metrics": {
        "f1": 0.48076923076923084
      },
      "elapsed_time": 0.16844630241394043,
      "timestamp": "2026-01-05T14:14:35.608991"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2168",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 103316,
      "natural_ratio": 0.788238525390625,
      "max_context_tokens": 131072,
      "context_length": 206512,
      "question_length": 80,
      "prediction": "trumpet",
      "reference": "Trumpet",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 3.1060678958892822,
      "timestamp": "2026-01-05T14:14:38.715425"
    },
    {
      "sample_id": "squad_572e81e0dfa6aa1500f8d05e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 526,
      "natural_ratio": 0.0040130615234375,
      "max_context_tokens": 131072,
      "context_length": 918,
      "question_length": 95,
      "prediction": "Gaddafi was popular at school; some friends made there received significant jobs in his later administration, most notably his best friend Abdul Salam Jalloud.",
      "reference": "Gaddafi was popular at school; some friends made there received significant jobs in his later administration, most notably his best friend Abdul Salam Jalloud.",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.34101104736328125,
      "timestamp": "2026-01-05T14:14:39.056665"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25018",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 81139,
      "natural_ratio": 0.6190414428710938,
      "max_context_tokens": 131072,
      "context_length": 162192,
      "question_length": 46,
      "prediction": "Latin",
      "reference": "Spanish.",
      "metrics": {
        "f1": 0.5
      },
      "elapsed_time": 3.4192581176757812,
      "timestamp": "2026-01-05T14:14:42.476276"
    },
    {
      "sample_id": "squad_57303a4904bcaa1900d773d8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 314,
      "natural_ratio": 0.0023956298828125,
      "max_context_tokens": 131072,
      "context_length": 533,
      "question_length": 56,
      "prediction": "2005",
      "reference": "2005",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07052183151245117,
      "timestamp": "2026-01-05T14:14:42.547006"
    },
    {
      "sample_id": "squad_5733af354776f4190066101e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 491,
      "natural_ratio": 0.00374603271484375,
      "max_context_tokens": 131072,
      "context_length": 879,
      "question_length": 63,
      "prediction": "humanities, social, and natural sciences",
      "reference": "humanities, social, and natural sciences",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.1062319278717041,
      "timestamp": "2026-01-05T14:14:42.653422"
    },
    {
      "sample_id": "squad_57061f8275f01819005e79ba",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 450,
      "natural_ratio": 0.0034332275390625,
      "max_context_tokens": 131072,
      "context_length": 810,
      "question_length": 51,
      "prediction": "1993",
      "reference": "1993",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06591367721557617,
      "timestamp": "2026-01-05T14:14:42.719512"
    },
    {
      "sample_id": "squad_57315bfaa5e9cc1400cdbf01",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 508,
      "natural_ratio": 0.003875732421875,
      "max_context_tokens": 131072,
      "context_length": 902,
      "question_length": 74,
      "prediction": "Bastille",
      "reference": "Bastille",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05555367469787598,
      "timestamp": "2026-01-05T14:14:42.775220"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24781",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 15313,
      "natural_ratio": 0.11682891845703125,
      "max_context_tokens": 131072,
      "context_length": 30565,
      "question_length": 22,
      "prediction": "She is described as a plump and strapping woman wearing a short shirt (cutty sark) that is valued by Tam.",
      "reference": "As someone to be feared.",
      "metrics": {
        "f1": 0.631578947368421
      },
      "elapsed_time": 0.593040943145752,
      "timestamp": "2026-01-05T14:14:43.368471"
    },
    {
      "sample_id": "squad_5726a6b4f1498d1400e8e623",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 354,
      "natural_ratio": 0.0027008056640625,
      "max_context_tokens": 131072,
      "context_length": 630,
      "question_length": 38,
      "prediction": "General Saw Maung",
      "reference": "General Saw Maung",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06555843353271484,
      "timestamp": "2026-01-05T14:14:43.434209"
    },
    {
      "sample_id": "squad_56de33fc4396321400ee2696",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 252,
      "natural_ratio": 0.001922607421875,
      "max_context_tokens": 131072,
      "context_length": 379,
      "question_length": 86,
      "prediction": "Montesquieu",
      "reference": "Montesquieu",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06102561950683594,
      "timestamp": "2026-01-05T14:14:43.495384"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15159",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 51379,
      "natural_ratio": 0.39199066162109375,
      "max_context_tokens": 131072,
      "context_length": 102680,
      "question_length": 38,
      "prediction": "Dora Forbes (Mr. Paraday's rival) lost the unpublished Paraday work.",
      "reference": "Lady Augusta",
      "metrics": {
        "f1": 0.47058823529411764
      },
      "elapsed_time": 1.616339921951294,
      "timestamp": "2026-01-05T14:14:45.112112"
    },
    {
      "sample_id": "squad_5732a702328d981900601fe0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 454,
      "natural_ratio": 0.0034637451171875,
      "max_context_tokens": 131072,
      "context_length": 793,
      "question_length": 76,
      "prediction": "Australia and Antarctica",
      "reference": "Antarctica",
      "metrics": {
        "f1": 0.5882352941176471
      },
      "elapsed_time": 0.06207418441772461,
      "timestamp": "2026-01-05T14:14:45.174375"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29144",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 74721,
      "natural_ratio": 0.5700759887695312,
      "max_context_tokens": 131072,
      "context_length": 149353,
      "question_length": 49,
      "prediction": "Because Lamira has rejected him and married Clément.",
      "reference": "The woman he loves is marrying a rich, old man.",
      "metrics": {
        "f1": 0.7368421052631577
      },
      "elapsed_time": 2.7014386653900146,
      "timestamp": "2026-01-05T14:14:47.876162"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_29649",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105859,
      "natural_ratio": 0.8076400756835938,
      "max_context_tokens": 131072,
      "context_length": 211651,
      "question_length": 27,
      "prediction": "Beowulf",
      "reference": "Beowulf. ",
      "metrics": {
        "f1": 0.9333333333333333
      },
      "elapsed_time": 3.88071608543396,
      "timestamp": "2026-01-05T14:14:51.757236"
    },
    {
      "sample_id": "squad_5725fbcbec44d21400f3d7c7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 740,
      "natural_ratio": 0.005645751953125,
      "max_context_tokens": 131072,
      "context_length": 1384,
      "question_length": 57,
      "prediction": "146 BC",
      "reference": "146 BC",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07740092277526855,
      "timestamp": "2026-01-05T14:14:51.835188"
    },
    {
      "sample_id": "squad_56e7287e37bdd419002c3dd3",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 684,
      "natural_ratio": 0.005218505859375,
      "max_context_tokens": 131072,
      "context_length": 1217,
      "question_length": 111,
      "prediction": "1895",
      "reference": "1895",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07067680358886719,
      "timestamp": "2026-01-05T14:14:51.906023"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_23123",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 88167,
      "natural_ratio": 0.6726608276367188,
      "max_context_tokens": 131072,
      "context_length": 176258,
      "question_length": 37,
      "prediction": "in school",
      "reference": "Indian Hills High School",
      "metrics": {
        "f1": 0.8421052631578948
      },
      "elapsed_time": 0.15622353553771973,
      "timestamp": "2026-01-05T14:14:52.062584"
    },
    {
      "sample_id": "squad_56e1963acd28a01900c679e9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 376,
      "natural_ratio": 0.00286865234375,
      "max_context_tokens": 131072,
      "context_length": 664,
      "question_length": 49,
      "prediction": "in an excited form",
      "reference": "excited form",
      "metrics": {
        "f1": 0.8
      },
      "elapsed_time": 0.06672525405883789,
      "timestamp": "2026-01-05T14:14:52.129470"
    },
    {
      "sample_id": "squad_57307b79069b531400832116",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 370,
      "natural_ratio": 0.0028228759765625,
      "max_context_tokens": 131072,
      "context_length": 627,
      "question_length": 74,
      "prediction": "They established the fledgling state, raised funds, and increased awareness for Greece in other countries.",
      "reference": "important in establishing the fledgling state, raising funds and awareness abroad",
      "metrics": {
        "f1": 0.8780487804878048
      },
      "elapsed_time": 0.22689414024353027,
      "timestamp": "2026-01-05T14:14:52.356532"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11181",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 105883,
      "natural_ratio": 0.8078231811523438,
      "max_context_tokens": 131072,
      "context_length": 211676,
      "question_length": 50,
      "prediction": "She believes that the movie is about purity and her religion bars her from stripping.",
      "reference": "She wanted to get paid an extra $800,000",
      "metrics": {
        "f1": 0.6500000000000001
      },
      "elapsed_time": 0.3166012763977051,
      "timestamp": "2026-01-05T14:14:52.673434"
    },
    {
      "sample_id": "squad_5732b191d6dcfa19001e8a7e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 533,
      "natural_ratio": 0.00406646728515625,
      "max_context_tokens": 131072,
      "context_length": 881,
      "question_length": 146,
      "prediction": "September 10, 2003",
      "reference": "September 10, 2003",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.1295914649963379,
      "timestamp": "2026-01-05T14:14:52.803563"
    },
    {
      "sample_id": "squad_56f799d2a6d7ea1400e17260",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 380,
      "natural_ratio": 0.002899169921875,
      "max_context_tokens": 131072,
      "context_length": 687,
      "question_length": 33,
      "prediction": "Glencoe and Sea Nymph",
      "reference": "Glencoe and Sea Nymph",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09517168998718262,
      "timestamp": "2026-01-05T14:14:52.898895"
    },
    {
      "sample_id": "squad_56e3c47839bdeb14003478fa",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 272,
      "natural_ratio": 0.0020751953125,
      "max_context_tokens": 131072,
      "context_length": 440,
      "question_length": 64,
      "prediction": "Age of Enlightenment",
      "reference": "Age of Enlightenment",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0508275032043457,
      "timestamp": "2026-01-05T14:14:52.949879"
    },
    {
      "sample_id": "squad_5728ec04ff5b5019007da956",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 475,
      "natural_ratio": 0.00362396240234375,
      "max_context_tokens": 131072,
      "context_length": 872,
      "question_length": 38,
      "prediction": "Heian period",
      "reference": "early Heian period",
      "metrics": {
        "f1": 0.9090909090909091
      },
      "elapsed_time": 0.055383920669555664,
      "timestamp": "2026-01-05T14:14:53.005414"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_14413",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 77349,
      "natural_ratio": 0.5901260375976562,
      "max_context_tokens": 131072,
      "context_length": 154641,
      "question_length": 17,
      "prediction": "Beatrice is the niece of Leonato.",
      "reference": "Leonato's niece",
      "metrics": {
        "f1": 0.7692307692307692
      },
      "elapsed_time": 2.558358669281006,
      "timestamp": "2026-01-05T14:14:55.564115"
    },
    {
      "sample_id": "squad_5728027a4b864d190016420a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 346,
      "natural_ratio": 0.0026397705078125,
      "max_context_tokens": 131072,
      "context_length": 597,
      "question_length": 56,
      "prediction": "The numbers in the names of the encodings indicate the number of bits per code value for UTF encodings or the number of bytes per code value for UCS encodings.",
      "reference": "the number of bits per code value (for UTF encodings) or the number of bytes per code value",
      "metrics": {
        "f1": 0.9333333333333332
      },
      "elapsed_time": 0.38074803352355957,
      "timestamp": "2026-01-05T14:14:55.945123"
    },
    {
      "sample_id": "squad_572857772ca10214002da2a4",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 328,
      "natural_ratio": 0.00250244140625,
      "max_context_tokens": 131072,
      "context_length": 587,
      "question_length": 30,
      "prediction": "In Deism, God is transcendent and does not intervene in the world. God is not anthropomorphic and does not answer prayers or perform miracles. God may or may not be aware of humanity.",
      "reference": "God exists, but does not intervene in the world beyond what was necessary to create it",
      "metrics": {
        "f1": 0.888888888888889
      },
      "elapsed_time": 0.4301595687866211,
      "timestamp": "2026-01-05T14:14:56.375525"
    },
    {
      "sample_id": "squad_56e12dc2e3433e1400422c8c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 786,
      "natural_ratio": 0.0059967041015625,
      "max_context_tokens": 131072,
      "context_length": 1469,
      "question_length": 63,
      "prediction": "1965",
      "reference": "1965",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07138633728027344,
      "timestamp": "2026-01-05T14:14:56.447084"
    },
    {
      "sample_id": "squad_5731dd77e17f3d14004224c6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 777,
      "natural_ratio": 0.00592803955078125,
      "max_context_tokens": 131072,
      "context_length": 1452,
      "question_length": 63,
      "prediction": "bar-tailed godwit",
      "reference": "bar-tailed godwit",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08266210556030273,
      "timestamp": "2026-01-05T14:14:56.529916"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_7628",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 77860,
      "natural_ratio": 0.594024658203125,
      "max_context_tokens": 131072,
      "context_length": 155635,
      "question_length": 45,
      "prediction": "Dawson",
      "reference": "Matthew Hale.",
      "metrics": {
        "f1": 0.26666666666666666
      },
      "elapsed_time": 3.374398946762085,
      "timestamp": "2026-01-05T14:14:59.904662"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_17643",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 70379,
      "natural_ratio": 0.5369491577148438,
      "max_context_tokens": 131072,
      "context_length": 140636,
      "question_length": 83,
      "prediction": "they pay for the books they stole and agree to stay off drugs",
      "reference": "He entered drug rehab.",
      "metrics": {
        "f1": 0.7333333333333334
      },
      "elapsed_time": 2.2046430110931396,
      "timestamp": "2026-01-05T14:15:02.109670"
    },
    {
      "sample_id": "squad_5726a39d5951b619008f789d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 775,
      "natural_ratio": 0.00591278076171875,
      "max_context_tokens": 131072,
      "context_length": 1458,
      "question_length": 52,
      "prediction": "1812",
      "reference": "1812",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07512640953063965,
      "timestamp": "2026-01-05T14:15:02.185005"
    },
    {
      "sample_id": "squad_57302226947a6a140053d16a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 360,
      "natural_ratio": 0.00274658203125,
      "max_context_tokens": 131072,
      "context_length": 614,
      "question_length": 66,
      "prediction": "1903",
      "reference": "1903",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06549215316772461,
      "timestamp": "2026-01-05T14:15:02.250663"
    },
    {
      "sample_id": "squad_572fb814a23a5019007fc8dd",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 420,
      "natural_ratio": 0.003204345703125,
      "max_context_tokens": 131072,
      "context_length": 749,
      "question_length": 51,
      "prediction": "A database administrator can restore a database that has been corrupted.",
      "reference": "database administrator",
      "metrics": {
        "f1": 0.46808510638297873
      },
      "elapsed_time": 0.14609503746032715,
      "timestamp": "2026-01-05T14:15:02.396916"
    },
    {
      "sample_id": "squad_57336c1cd058e614000b5ac1",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 428,
      "natural_ratio": 0.003265380859375,
      "max_context_tokens": 131072,
      "context_length": 776,
      "question_length": 41,
      "prediction": "French",
      "reference": "French",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03500223159790039,
      "timestamp": "2026-01-05T14:15:02.432063"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_11954",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 63253,
      "natural_ratio": 0.48258209228515625,
      "max_context_tokens": 131072,
      "context_length": 126395,
      "question_length": 71,
      "prediction": "Cassandra has been chosen as a prize for king Agamemnon.",
      "reference": "Cassandra will become a concubine of Agamemnon",
      "metrics": {
        "f1": 0.7777777777777778
      },
      "elapsed_time": 2.0931484699249268,
      "timestamp": "2026-01-05T14:15:04.525532"
    },
    {
      "sample_id": "squad_56e7af3b37bdd419002c433a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 269,
      "natural_ratio": 0.00205230712890625,
      "max_context_tokens": 131072,
      "context_length": 441,
      "question_length": 57,
      "prediction": "Independence Day",
      "reference": "Independence Day",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.058840274810791016,
      "timestamp": "2026-01-05T14:15:04.584594"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13470",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96974,
      "natural_ratio": 0.7398529052734375,
      "max_context_tokens": 131072,
      "context_length": 193877,
      "question_length": 31,
      "prediction": "Christiania",
      "reference": "Kristiania",
      "metrics": {
        "f1": 0.7999999999999999
      },
      "elapsed_time": 3.6568973064422607,
      "timestamp": "2026-01-05T14:15:08.241855"
    },
    {
      "sample_id": "squad_57325d96b9d445190005eab7",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 306,
      "natural_ratio": 0.0023345947265625,
      "max_context_tokens": 131072,
      "context_length": 508,
      "question_length": 65,
      "prediction": "8.2%",
      "reference": "8.2%",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.07205963134765625,
      "timestamp": "2026-01-05T14:15:08.314109"
    },
    {
      "sample_id": "squad_5727740ff1498d1400e8f88d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 738,
      "natural_ratio": 0.0056304931640625,
      "max_context_tokens": 131072,
      "context_length": 1367,
      "question_length": 70,
      "prediction": "John Ray",
      "reference": "John Ray",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05077481269836426,
      "timestamp": "2026-01-05T14:15:08.365054"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_28168",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 53138,
      "natural_ratio": 0.4054107666015625,
      "max_context_tokens": 131072,
      "context_length": 106158,
      "question_length": 78,
      "prediction": "7 years",
      "reference": "Seven years.",
      "metrics": {
        "f1": 0.75
      },
      "elapsed_time": 1.6675360202789307,
      "timestamp": "2026-01-05T14:15:10.032918"
    },
    {
      "sample_id": "squad_5726f9ba708984140094d772",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 316,
      "natural_ratio": 0.002410888671875,
      "max_context_tokens": 131072,
      "context_length": 525,
      "question_length": 67,
      "prediction": "When other sources are scant.",
      "reference": "when other sources may be scant",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 0.08590388298034668,
      "timestamp": "2026-01-05T14:15:10.119045"
    },
    {
      "sample_id": "squad_5727b28f2ca10214002d9432",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 551,
      "natural_ratio": 0.00420379638671875,
      "max_context_tokens": 131072,
      "context_length": 993,
      "question_length": 69,
      "prediction": "120",
      "reference": "120,000 characters",
      "metrics": {
        "f1": 0.4
      },
      "elapsed_time": 0.056687116622924805,
      "timestamp": "2026-01-05T14:15:10.175898"
    },
    {
      "sample_id": "squad_570b85a5ec8fbc190045ba5d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 397,
      "natural_ratio": 0.00302886962890625,
      "max_context_tokens": 131072,
      "context_length": 695,
      "question_length": 60,
      "prediction": "grazers",
      "reference": "development of grazers",
      "metrics": {
        "f1": 0.5454545454545454
      },
      "elapsed_time": 0.04459500312805176,
      "timestamp": "2026-01-05T14:15:10.220650"
    },
    {
      "sample_id": "squad_56cbd2f96d243a140015ed72",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 523,
      "natural_ratio": 0.00399017333984375,
      "max_context_tokens": 131072,
      "context_length": 952,
      "question_length": 55,
      "prediction": "1835",
      "reference": "1835",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06596183776855469,
      "timestamp": "2026-01-05T14:15:10.286763"
    },
    {
      "sample_id": "squad_57314737497a881900248d3f",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 511,
      "natural_ratio": 0.00389862060546875,
      "max_context_tokens": 131072,
      "context_length": 919,
      "question_length": 63,
      "prediction": "early model F-16 aircraft converted to QF-16 configuration",
      "reference": "early model F-16 aircraft converted to QF-16 configuration",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.1771836280822754,
      "timestamp": "2026-01-05T14:15:10.464098"
    },
    {
      "sample_id": "squad_56d37f7059d6e41400146559",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 279,
      "natural_ratio": 0.00212860107421875,
      "max_context_tokens": 131072,
      "context_length": 467,
      "question_length": 51,
      "prediction": "2013",
      "reference": "2013",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06380534172058105,
      "timestamp": "2026-01-05T14:15:10.528053"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_5129",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 75154,
      "natural_ratio": 0.5733795166015625,
      "max_context_tokens": 131072,
      "context_length": 150201,
      "question_length": 67,
      "prediction": "Kobo",
      "reference": "Sugai",
      "metrics": {
        "f1": 0.0
      },
      "elapsed_time": 3.081946611404419,
      "timestamp": "2026-01-05T14:15:13.610348"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_2683",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 96465,
      "natural_ratio": 0.7359695434570312,
      "max_context_tokens": 131072,
      "context_length": 192850,
      "question_length": 40,
      "prediction": "one half left",
      "reference": "He has heart and Martel doesn't.",
      "metrics": {
        "f1": 0.6956521739130435
      },
      "elapsed_time": 0.1943514347076416,
      "timestamp": "2026-01-05T14:15:13.805050"
    },
    {
      "sample_id": "squad_57321d1eb9d445190005e81c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 328,
      "natural_ratio": 0.00250244140625,
      "max_context_tokens": 131072,
      "context_length": 562,
      "question_length": 54,
      "prediction": "3.16 million",
      "reference": "3.16 million",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0787513256072998,
      "timestamp": "2026-01-05T14:15:13.883993"
    },
    {
      "sample_id": "squad_5726768af1498d1400e8e082",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 293,
      "natural_ratio": 0.00223541259765625,
      "max_context_tokens": 131072,
      "context_length": 512,
      "question_length": 35,
      "prediction": "April 1855",
      "reference": "April 1855",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.10153603553771973,
      "timestamp": "2026-01-05T14:15:13.985748"
    },
    {
      "sample_id": "squad_5705edea52bb8914006896d6",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 208,
      "natural_ratio": 0.0015869140625,
      "max_context_tokens": 131072,
      "context_length": 300,
      "question_length": 77,
      "prediction": "WHO（World Health Organization）",
      "reference": "The World Health Organization",
      "metrics": {
        "f1": 0.9333333333333333
      },
      "elapsed_time": 0.0822303295135498,
      "timestamp": "2026-01-05T14:15:14.068159"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_25562",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 113661,
      "natural_ratio": 0.8671646118164062,
      "max_context_tokens": 131072,
      "context_length": 227197,
      "question_length": 86,
      "prediction": "Erik was trying to create superior soldiers by making changes to embryos that would reduce fear and pity, increase muscle strength, and heighten senses.",
      "reference": "Super-soldiers",
      "metrics": {
        "f1": 0.5294117647058825
      },
      "elapsed_time": 4.267576456069946,
      "timestamp": "2026-01-05T14:15:18.336119"
    },
    {
      "sample_id": "squad_572816beff5b5019007d9ce8",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 1093,
      "natural_ratio": 0.00833892822265625,
      "max_context_tokens": 131072,
      "context_length": 2064,
      "question_length": 83,
      "prediction": "Joseph P. Kerwin",
      "reference": "Joseph P. Kerwin",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.09406518936157227,
      "timestamp": "2026-01-05T14:15:18.430750"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26009",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 71251,
      "natural_ratio": 0.5436019897460938,
      "max_context_tokens": 131072,
      "context_length": 142429,
      "question_length": 33,
      "prediction": "Medea takes advantage of the distraction caused by the arrival of the Corinthian messenger to enter the house, where she kills her two children.",
      "reference": "With a knife.",
      "metrics": {
        "f1": 0.6285714285714286
      },
      "elapsed_time": 2.4322335720062256,
      "timestamp": "2026-01-05T14:15:20.863476"
    },
    {
      "sample_id": "squad_57336bbb4776f41900660a8d",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 401,
      "natural_ratio": 0.00305938720703125,
      "max_context_tokens": 131072,
      "context_length": 691,
      "question_length": 72,
      "prediction": "$500 million",
      "reference": "$500 million",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.08000612258911133,
      "timestamp": "2026-01-05T14:15:20.943703"
    },
    {
      "sample_id": "squad_56e6fd9cde9d371400068113",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 185,
      "natural_ratio": 0.00141143798828125,
      "max_context_tokens": 131072,
      "context_length": 290,
      "question_length": 41,
      "prediction": "New Zealand",
      "reference": "New Zealand",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.041683197021484375,
      "timestamp": "2026-01-05T14:15:20.985549"
    },
    {
      "sample_id": "squad_57337520d058e614000b5b4c",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 602,
      "natural_ratio": 0.0045928955078125,
      "max_context_tokens": 131072,
      "context_length": 1074,
      "question_length": 91,
      "prediction": "serious",
      "reference": "serious",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03583526611328125,
      "timestamp": "2026-01-05T14:15:21.021544"
    },
    {
      "sample_id": "squad_57340136d058e614000b6785",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 444,
      "natural_ratio": 0.003387451171875,
      "max_context_tokens": 131072,
      "context_length": 795,
      "question_length": 54,
      "prediction": "intent",
      "reference": "intent",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03470444679260254,
      "timestamp": "2026-01-05T14:15:21.056429"
    },
    {
      "sample_id": "squad_5727ce204b864d1900163d8a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 602,
      "natural_ratio": 0.0045928955078125,
      "max_context_tokens": 131072,
      "context_length": 1124,
      "question_length": 41,
      "prediction": "to promote evolution and naturalism through the journal Nature",
      "reference": "as part of his efforts to secularise and professionalise science",
      "metrics": {
        "f1": 0.823529411764706
      },
      "elapsed_time": 0.12625718116760254,
      "timestamp": "2026-01-05T14:15:21.182848"
    },
    {
      "sample_id": "squad_57318f8ae6313a140071d09e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 296,
      "natural_ratio": 0.00225830078125,
      "max_context_tokens": 131072,
      "context_length": 508,
      "question_length": 45,
      "prediction": "Gaddafi's father",
      "reference": "His father",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 0.06391334533691406,
      "timestamp": "2026-01-05T14:15:21.246911"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_15976",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 91697,
      "natural_ratio": 0.6995925903320312,
      "max_context_tokens": 131072,
      "context_length": 183312,
      "question_length": 42,
      "prediction": "from his mother",
      "reference": "In his dreams",
      "metrics": {
        "f1": 0.7
      },
      "elapsed_time": 2.6571033000946045,
      "timestamp": "2026-01-05T14:15:23.904365"
    },
    {
      "sample_id": "squad_5705f3a375f01819005e77c5",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 124,
      "natural_ratio": 0.000946044921875,
      "max_context_tokens": 131072,
      "context_length": 175,
      "question_length": 34,
      "prediction": "several years",
      "reference": "several years",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05540204048156738,
      "timestamp": "2026-01-05T14:15:23.959970"
    },
    {
      "sample_id": "squad_5726c8615951b619008f7dfb",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 345,
      "natural_ratio": 0.00263214111328125,
      "max_context_tokens": 131072,
      "context_length": 584,
      "question_length": 66,
      "prediction": "Charlemagne",
      "reference": "Charlemagne",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05594325065612793,
      "timestamp": "2026-01-05T14:15:24.016097"
    },
    {
      "sample_id": "squad_56cdd4d762d2951400fa68cc",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 425,
      "natural_ratio": 0.00324249267578125,
      "max_context_tokens": 131072,
      "context_length": 771,
      "question_length": 39,
      "prediction": "The Longmen Shan Fault System is located in the eastern border of the Tibetan Plateau.",
      "reference": "the eastern border of the Tibetan Plateau",
      "metrics": {
        "f1": 0.6456692913385828
      },
      "elapsed_time": 0.2100679874420166,
      "timestamp": "2026-01-05T14:15:24.226355"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_18882",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 13898,
      "natural_ratio": 0.1060333251953125,
      "max_context_tokens": 131072,
      "context_length": 27719,
      "question_length": 38,
      "prediction": "Squirrel",
      "reference": "A squirrel",
      "metrics": {
        "f1": 0.8750000000000001
      },
      "elapsed_time": 0.3028430938720703,
      "timestamp": "2026-01-05T14:15:24.529491"
    },
    {
      "sample_id": "squad_5725c34889a1e219009abe0e",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 321,
      "natural_ratio": 0.00244903564453125,
      "max_context_tokens": 131072,
      "context_length": 561,
      "question_length": 41,
      "prediction": "Israel",
      "reference": "Israel",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.035566091537475586,
      "timestamp": "2026-01-05T14:15:24.565293"
    },
    {
      "sample_id": "squad_572ef06703f9891900756af9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 489,
      "natural_ratio": 0.00373077392578125,
      "max_context_tokens": 131072,
      "context_length": 866,
      "question_length": 72,
      "prediction": "Monogamous marriages",
      "reference": "Arranged marriages",
      "metrics": {
        "f1": 0.8571428571428572
      },
      "elapsed_time": 0.06648850440979004,
      "timestamp": "2026-01-05T14:15:24.631958"
    },
    {
      "sample_id": "squad_573243d40fdd8d15006c68aa",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 370,
      "natural_ratio": 0.0028228759765625,
      "max_context_tokens": 131072,
      "context_length": 654,
      "question_length": 47,
      "prediction": "Bagratid Dynasty",
      "reference": "Bagratid Dynasty",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.0649716854095459,
      "timestamp": "2026-01-05T14:15:24.697099"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13185",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 59546,
      "natural_ratio": 0.4542999267578125,
      "max_context_tokens": 131072,
      "context_length": 119028,
      "question_length": 24,
      "prediction": "Margit regrets that she is married to Bengt and wishes she could marry Gudmund instead.",
      "reference": "She regrets marrying  Bengt Gauteson",
      "metrics": {
        "f1": 0.8571428571428571
      },
      "elapsed_time": 2.010831117630005,
      "timestamp": "2026-01-05T14:15:26.708284"
    },
    {
      "sample_id": "squad_572941266aef051400154c0a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 351,
      "natural_ratio": 0.00267791748046875,
      "max_context_tokens": 131072,
      "context_length": 563,
      "question_length": 99,
      "prediction": "International Energy Agency",
      "reference": "International Energy Agency",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05942702293395996,
      "timestamp": "2026-01-05T14:15:26.767934"
    },
    {
      "sample_id": "squad_5731eb9bb9d445190005e69a",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 384,
      "natural_ratio": 0.0029296875,
      "max_context_tokens": 131072,
      "context_length": 697,
      "question_length": 31,
      "prediction": "Raphael Lemkin",
      "reference": "Raphael Lemkin",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06645894050598145,
      "timestamp": "2026-01-05T14:15:26.834570"
    },
    {
      "sample_id": "squad_56de244f4396321400ee25f0",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 392,
      "natural_ratio": 0.00299072265625,
      "max_context_tokens": 131072,
      "context_length": 683,
      "question_length": 62,
      "prediction": "three",
      "reference": "three",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.03502631187438965,
      "timestamp": "2026-01-05T14:15:26.869758"
    },
    {
      "sample_id": "squad_572f9463947a6a140053ca88",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 422,
      "natural_ratio": 0.0032196044921875,
      "max_context_tokens": 131072,
      "context_length": 742,
      "question_length": 62,
      "prediction": "19%",
      "reference": "19%",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05538344383239746,
      "timestamp": "2026-01-05T14:15:26.925296"
    },
    {
      "sample_id": "squad_56bec3303aeaaa14008c9393",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 376,
      "natural_ratio": 0.00286865234375,
      "max_context_tokens": 131072,
      "context_length": 632,
      "question_length": 80,
      "prediction": "L.A. Reid",
      "reference": "L.A. Reid",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.06480193138122559,
      "timestamp": "2026-01-05T14:15:26.990247"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_24903",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 114975,
      "natural_ratio": 0.8771896362304688,
      "max_context_tokens": 131072,
      "context_length": 229858,
      "question_length": 53,
      "prediction": "Maya becomes a soldier-bee.",
      "reference": "teaching",
      "metrics": {
        "f1": 0.3478260869565218
      },
      "elapsed_time": 4.105494499206543,
      "timestamp": "2026-01-05T14:15:31.096110"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_13488",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 80726,
      "natural_ratio": 0.6158905029296875,
      "max_context_tokens": 131072,
      "context_length": 161328,
      "question_length": 84,
      "prediction": "regretting her past behavior towards Leonard Charteris",
      "reference": "Not having been brave enough to kill Charteris.",
      "metrics": {
        "f1": 0.8333333333333334
      },
      "elapsed_time": 2.7921416759490967,
      "timestamp": "2026-01-05T14:15:33.889005"
    },
    {
      "sample_id": "squad_5727fb69ff5b5019007d99ec",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 377,
      "natural_ratio": 0.00287628173828125,
      "max_context_tokens": 131072,
      "context_length": 676,
      "question_length": 39,
      "prediction": "Prince Richard, Duke of Gloucester attended Eton.",
      "reference": "Prince Richard",
      "metrics": {
        "f1": 0.4444444444444445
      },
      "elapsed_time": 0.14084672927856445,
      "timestamp": "2026-01-05T14:15:34.030062"
    },
    {
      "sample_id": "squad_57346018879d6814001ca583",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 601,
      "natural_ratio": 0.00458526611328125,
      "max_context_tokens": 131072,
      "context_length": 1104,
      "question_length": 59,
      "prediction": "Managing populations that might exceed the carrying capacity of their habitat and threaten the well-being of other species, or, in some instances, damage human health or safety.",
      "reference": "managing populations",
      "metrics": {
        "f1": 0.20304568527918782
      },
      "elapsed_time": 0.34952712059020996,
      "timestamp": "2026-01-05T14:15:34.379778"
    },
    {
      "sample_id": "squad_570bdec2ec8fbc190045bbae",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 423,
      "natural_ratio": 0.00322723388671875,
      "max_context_tokens": 131072,
      "context_length": 746,
      "question_length": 60,
      "prediction": "Biodiversity physics",
      "reference": "Contemporary biodiversity physics",
      "metrics": {
        "f1": 0.9032258064516129
      },
      "elapsed_time": 0.06522464752197266,
      "timestamp": "2026-01-05T14:15:34.445174"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_26429",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 97592,
      "natural_ratio": 0.74456787109375,
      "max_context_tokens": 131072,
      "context_length": 195071,
      "question_length": 74,
      "prediction": "Phelps",
      "reference": "A flight attendant.",
      "metrics": {
        "f1": 0.35294117647058826
      },
      "elapsed_time": 3.1976280212402344,
      "timestamp": "2026-01-05T14:15:37.643175"
    },
    {
      "sample_id": "squad_5732a8641d5d2e14009ff888",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 437,
      "natural_ratio": 0.00333404541015625,
      "max_context_tokens": 131072,
      "context_length": 705,
      "question_length": 130,
      "prediction": "Governments",
      "reference": "Governments",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.05040478706359863,
      "timestamp": "2026-01-05T14:15:37.693769"
    },
    {
      "sample_id": "squad_572ee52903f9891900756ac9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 339,
      "natural_ratio": 0.00258636474609375,
      "max_context_tokens": 131072,
      "context_length": 570,
      "question_length": 69,
      "prediction": "saturation",
      "reference": "saturation",
      "metrics": {
        "f1": 1.0
      },
      "elapsed_time": 0.04492473602294922,
      "timestamp": "2026-01-05T14:15:37.738856"
    },
    {
      "sample_id": "narrativeqa_narrativeqa_31130",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 70901,
      "natural_ratio": 0.5409317016601562,
      "max_context_tokens": 131072,
      "context_length": 141730,
      "question_length": 32,
      "prediction": "She is a prostitute.",
      "reference": "She is a prostitute",
      "metrics": {
        "f1": 0.9743589743589743
      },
      "elapsed_time": 3.3906619548797607,
      "timestamp": "2026-01-05T14:15:41.129864"
    },
    {
      "sample_id": "squad_57361c88012e2f140011a1a9",
      "model": "qwen2.5-7b",
      "model_key": "qwen2.5-7b",
      "dataset": "mixed",
      "task_type": "reading_comprehension",
      "natural_tokens": 477,
      "natural_ratio": 0.00363922119140625,
      "max_context_tokens": 131072,
      "context_length": 875,
      "question_length": 40,
      "prediction": "Over $700 million",
      "reference": "$700 million",
      "metrics": {
        "f1": 0.8275862068965517
      },
      "elapsed_time": 0.09177708625793457,
      "timestamp": "2026-01-05T14:15:41.221843"
    }
  ]
}