[
    {
        "step": 0,
        "reward": -3.7819061279296875,
        "gold_reward": -3.13934326171875,
        "kl_divergence": 0.0,
        "mean_generated_length": 133.28125
    },
    {
        "step": 1,
        "reward": -3.4259490966796875,
        "gold_reward": -2.917926788330078,
        "kl_divergence": 0.0,
        "mean_generated_length": 127.515625
    },
    {
        "step": 2,
        "reward": -3.4664077758789062,
        "gold_reward": -2.9809436798095703,
        "kl_divergence": -0.08627334237098694,
        "mean_generated_length": 91.171875
    },
    {
        "step": 3,
        "reward": -3.8388671875,
        "gold_reward": -3.3630523681640625,
        "kl_divergence": -0.008064902387559414,
        "mean_generated_length": 120.953125
    },
    {
        "step": 4,
        "reward": -3.5845985412597656,
        "gold_reward": -3.2526092529296875,
        "kl_divergence": -0.06082535535097122,
        "mean_generated_length": 119.265625
    },
    {
        "step": 5,
        "reward": -4.557525634765625,
        "gold_reward": -3.255859375,
        "kl_divergence": 0.1372203379869461,
        "mean_generated_length": 113.546875
    },
    {
        "step": 6,
        "reward": -3.432647705078125,
        "gold_reward": -3.2862777709960938,
        "kl_divergence": 0.08660878241062164,
        "mean_generated_length": 129.1875
    },
    {
        "step": 7,
        "reward": -3.0281219482421875,
        "gold_reward": -2.8107986450195312,
        "kl_divergence": 0.1807321012020111,
        "mean_generated_length": 127.4375
    },
    {
        "step": 8,
        "reward": -2.7110748291015625,
        "gold_reward": -3.0869598388671875,
        "kl_divergence": 0.39324185252189636,
        "mean_generated_length": 143.453125
    },
    {
        "step": 9,
        "reward": -2.930020809173584,
        "gold_reward": -2.4538726806640625,
        "kl_divergence": 0.6049928665161133,
        "mean_generated_length": 124.46875
    },
    {
        "step": 10,
        "reward": -2.330535888671875,
        "gold_reward": -2.1860713958740234,
        "kl_divergence": 1.675398349761963,
        "mean_generated_length": 140.390625
    },
    {
        "step": 11,
        "reward": -2.2979812622070312,
        "gold_reward": -2.9038848876953125,
        "kl_divergence": 2.316223382949829,
        "mean_generated_length": 159.484375
    },
    {
        "step": 12,
        "reward": -3.2076759338378906,
        "gold_reward": -2.737720489501953,
        "kl_divergence": 1.3816548585891724,
        "mean_generated_length": 128.4375
    },
    {
        "step": 13,
        "reward": -2.492095947265625,
        "gold_reward": -2.2181625366210938,
        "kl_divergence": 1.7763816118240356,
        "mean_generated_length": 105.515625
    },
    {
        "step": 14,
        "reward": -2.335224151611328,
        "gold_reward": -2.67950439453125,
        "kl_divergence": 2.291472911834717,
        "mean_generated_length": 128.21875
    },
    {
        "step": 15,
        "reward": -2.675076961517334,
        "gold_reward": -2.6155242919921875,
        "kl_divergence": 3.1427435874938965,
        "mean_generated_length": 127.296875
    },
    {
        "step": 16,
        "reward": -3.1741714477539062,
        "gold_reward": -2.253955841064453,
        "kl_divergence": 3.383388042449951,
        "mean_generated_length": 130.984375
    },
    {
        "step": 17,
        "reward": -2.9069671630859375,
        "gold_reward": -2.817628860473633,
        "kl_divergence": 3.881498336791992,
        "mean_generated_length": 117.796875
    },
    {
        "step": 18,
        "reward": -3.29400634765625,
        "gold_reward": -3.2388839721679688,
        "kl_divergence": 3.676805019378662,
        "mean_generated_length": 134.5
    },
    {
        "step": 19,
        "reward": -3.0439453125,
        "gold_reward": -3.02679443359375,
        "kl_divergence": 4.072471618652344,
        "mean_generated_length": 113.4375
    },
    {
        "step": 20,
        "reward": -3.478729248046875,
        "gold_reward": -3.0325927734375,
        "kl_divergence": 5.733091354370117,
        "mean_generated_length": 125.5625
    },
    {
        "step": 21,
        "reward": -1.8622827529907227,
        "gold_reward": -2.468087673187256,
        "kl_divergence": 6.052178382873535,
        "mean_generated_length": 122.90625
    },
    {
        "step": 22,
        "reward": -3.10406494140625,
        "gold_reward": -2.462902069091797,
        "kl_divergence": 6.517963409423828,
        "mean_generated_length": 105.09375
    },
    {
        "step": 23,
        "reward": -2.0672874450683594,
        "gold_reward": -2.2985000610351562,
        "kl_divergence": 8.645731925964355,
        "mean_generated_length": 131.359375
    },
    {
        "step": 24,
        "reward": -2.4981689453125,
        "gold_reward": -2.658447265625,
        "kl_divergence": 9.764017105102539,
        "mean_generated_length": 124.65625
    },
    {
        "step": 25,
        "reward": -3.3456382751464844,
        "gold_reward": -2.9535980224609375,
        "kl_divergence": 10.649444580078125,
        "mean_generated_length": 155.21875
    },
    {
        "step": 26,
        "reward": -2.42437744140625,
        "gold_reward": -2.9234390258789062,
        "kl_divergence": 12.814057350158691,
        "mean_generated_length": 161.703125
    },
    {
        "step": 27,
        "reward": -2.644794464111328,
        "gold_reward": -2.838418960571289,
        "kl_divergence": 18.941852569580078,
        "mean_generated_length": 218.640625
    },
    {
        "step": 28,
        "reward": -2.3230743408203125,
        "gold_reward": -2.4548540115356445,
        "kl_divergence": 16.09139060974121,
        "mean_generated_length": 152.078125
    },
    {
        "step": 29,
        "reward": -1.975107192993164,
        "gold_reward": -2.90313720703125,
        "kl_divergence": 18.922719955444336,
        "mean_generated_length": 193.15625
    },
    {
        "step": 30,
        "reward": -2.550395965576172,
        "gold_reward": -2.2968482971191406,
        "kl_divergence": 16.52853012084961,
        "mean_generated_length": 158.09375
    },
    {
        "step": 31,
        "reward": -2.9636096954345703,
        "gold_reward": -2.8855133056640625,
        "kl_divergence": 13.21922492980957,
        "mean_generated_length": 157.484375
    },
    {
        "step": 32,
        "reward": -2.35150146484375,
        "gold_reward": -2.4213333129882812,
        "kl_divergence": 13.15125846862793,
        "mean_generated_length": 150.765625
    },
    {
        "step": 33,
        "reward": -2.41961669921875,
        "gold_reward": -2.59429931640625,
        "kl_divergence": 16.485668182373047,
        "mean_generated_length": 162.578125
    },
    {
        "step": 34,
        "reward": -2.58135986328125,
        "gold_reward": -2.339946746826172,
        "kl_divergence": 13.846458435058594,
        "mean_generated_length": 166.625
    },
    {
        "step": 35,
        "reward": -2.6552622318267822,
        "gold_reward": -2.49237060546875,
        "kl_divergence": 15.193044662475586,
        "mean_generated_length": 140.578125
    },
    {
        "step": 36,
        "reward": -2.246372699737549,
        "gold_reward": -3.2494468688964844,
        "kl_divergence": 17.016490936279297,
        "mean_generated_length": 203.53125
    },
    {
        "step": 37,
        "reward": -1.8732070922851562,
        "gold_reward": -2.6150283813476562,
        "kl_divergence": 16.127071380615234,
        "mean_generated_length": 138.03125
    },
    {
        "step": 38,
        "reward": -2.3031005859375,
        "gold_reward": -3.03265380859375,
        "kl_divergence": 19.10108184814453,
        "mean_generated_length": 178.875
    },
    {
        "step": 39,
        "reward": -2.477275848388672,
        "gold_reward": -2.7885913848876953,
        "kl_divergence": 16.003679275512695,
        "mean_generated_length": 156.859375
    },
    {
        "step": 40,
        "reward": -2.466708183288574,
        "gold_reward": -2.8779945373535156,
        "kl_divergence": 19.855377197265625,
        "mean_generated_length": 166.390625
    },
    {
        "step": 41,
        "reward": -2.48748779296875,
        "gold_reward": -2.6227264404296875,
        "kl_divergence": 18.854351043701172,
        "mean_generated_length": 173.796875
    },
    {
        "step": 42,
        "reward": -2.0295562744140625,
        "gold_reward": -2.8962249755859375,
        "kl_divergence": 20.824443817138672,
        "mean_generated_length": 177.609375
    },
    {
        "step": 43,
        "reward": -2.5588302612304688,
        "gold_reward": -2.863006591796875,
        "kl_divergence": 20.184429168701172,
        "mean_generated_length": 192.453125
    },
    {
        "step": 44,
        "reward": -1.7890625,
        "gold_reward": -2.627349853515625,
        "kl_divergence": 21.256427764892578,
        "mean_generated_length": 175.5
    },
    {
        "step": 45,
        "reward": -2.1365737915039062,
        "gold_reward": -2.78448486328125,
        "kl_divergence": 23.665485382080078,
        "mean_generated_length": 200.078125
    },
    {
        "step": 46,
        "reward": -2.138263702392578,
        "gold_reward": -2.6315841674804688,
        "kl_divergence": 25.5360107421875,
        "mean_generated_length": 187.4375
    },
    {
        "step": 47,
        "reward": -1.9115447998046875,
        "gold_reward": -2.81439208984375,
        "kl_divergence": 24.76249885559082,
        "mean_generated_length": 197.609375
    },
    {
        "step": 48,
        "reward": -1.8065738677978516,
        "gold_reward": -2.713681221008301,
        "kl_divergence": 28.282176971435547,
        "mean_generated_length": 195.734375
    },
    {
        "step": 49,
        "reward": -2.101306915283203,
        "gold_reward": -2.549406051635742,
        "kl_divergence": 29.771589279174805,
        "mean_generated_length": 220.59375
    },
    {
        "step": 50,
        "reward": -1.5717124938964844,
        "gold_reward": -2.3031692504882812,
        "kl_divergence": 27.792377471923828,
        "mean_generated_length": 191.1875
    },
    {
        "step": 51,
        "reward": -2.3083572387695312,
        "gold_reward": -3.177642822265625,
        "kl_divergence": 33.497467041015625,
        "mean_generated_length": 222.0
    },
    {
        "step": 52,
        "reward": -1.9841690063476562,
        "gold_reward": -2.9136857986450195,
        "kl_divergence": 30.89424705505371,
        "mean_generated_length": 212.140625
    },
    {
        "step": 53,
        "reward": -1.9603118896484375,
        "gold_reward": -2.666323661804199,
        "kl_divergence": 31.144466400146484,
        "mean_generated_length": 243.71875
    },
    {
        "step": 54,
        "reward": -2.3349380493164062,
        "gold_reward": -3.18994140625,
        "kl_divergence": 34.53404998779297,
        "mean_generated_length": 238.0
    },
    {
        "step": 55,
        "reward": -1.8691911697387695,
        "gold_reward": -2.677919387817383,
        "kl_divergence": 31.160350799560547,
        "mean_generated_length": 213.984375
    },
    {
        "step": 56,
        "reward": -1.6426239013671875,
        "gold_reward": -2.624847412109375,
        "kl_divergence": 40.392608642578125,
        "mean_generated_length": 244.078125
    },
    {
        "step": 57,
        "reward": -1.991424560546875,
        "gold_reward": -3.005645751953125,
        "kl_divergence": 40.270999908447266,
        "mean_generated_length": 254.515625
    },
    {
        "step": 58,
        "reward": -2.171895980834961,
        "gold_reward": -2.7065868377685547,
        "kl_divergence": 43.630794525146484,
        "mean_generated_length": 279.796875
    },
    {
        "step": 59,
        "reward": -1.9009246826171875,
        "gold_reward": -3.0012340545654297,
        "kl_divergence": 38.39411163330078,
        "mean_generated_length": 245.65625
    },
    {
        "step": 60,
        "reward": -1.94122314453125,
        "gold_reward": -3.1592764854431152,
        "kl_divergence": 50.543052673339844,
        "mean_generated_length": 290.953125
    },
    {
        "step": 61,
        "reward": -2.0306854248046875,
        "gold_reward": -3.085906982421875,
        "kl_divergence": 48.39229965209961,
        "mean_generated_length": 298.796875
    },
    {
        "step": 62,
        "reward": -2.2376174926757812,
        "gold_reward": -3.22894287109375,
        "kl_divergence": 49.64299011230469,
        "mean_generated_length": 307.671875
    },
    {
        "step": 63,
        "reward": -2.145599365234375,
        "gold_reward": -3.3089599609375,
        "kl_divergence": 45.84682083129883,
        "mean_generated_length": 265.6875
    },
    {
        "step": 64,
        "reward": -2.4530792236328125,
        "gold_reward": -2.929290771484375,
        "kl_divergence": 51.345054626464844,
        "mean_generated_length": 299.953125
    },
    {
        "step": 65,
        "reward": -2.0204410552978516,
        "gold_reward": -3.22613525390625,
        "kl_divergence": 55.68486404418945,
        "mean_generated_length": 292.015625
    },
    {
        "step": 66,
        "reward": -1.640625,
        "gold_reward": -2.805964469909668,
        "kl_divergence": 53.392662048339844,
        "mean_generated_length": 289.375
    },
    {
        "step": 67,
        "reward": -1.6122856140136719,
        "gold_reward": -3.158905029296875,
        "kl_divergence": 52.93033218383789,
        "mean_generated_length": 304.234375
    },
    {
        "step": 68,
        "reward": -0.638031005859375,
        "gold_reward": -2.3922882080078125,
        "kl_divergence": 52.887428283691406,
        "mean_generated_length": 273.15625
    },
    {
        "step": 69,
        "reward": -2.0117530822753906,
        "gold_reward": -2.799346923828125,
        "kl_divergence": 54.534420013427734,
        "mean_generated_length": 277.984375
    },
    {
        "step": 70,
        "reward": -0.925201416015625,
        "gold_reward": -2.9441146850585938,
        "kl_divergence": 48.97609329223633,
        "mean_generated_length": 254.5
    },
    {
        "step": 71,
        "reward": -1.6512718200683594,
        "gold_reward": -2.73028564453125,
        "kl_divergence": 51.091224670410156,
        "mean_generated_length": 241.015625
    },
    {
        "step": 72,
        "reward": -1.5623931884765625,
        "gold_reward": -2.34344482421875,
        "kl_divergence": 51.67050552368164,
        "mean_generated_length": 203.328125
    },
    {
        "step": 73,
        "reward": -1.5220718383789062,
        "gold_reward": -2.47943115234375,
        "kl_divergence": 44.345741271972656,
        "mean_generated_length": 185.421875
    },
    {
        "step": 74,
        "reward": -1.7876358032226562,
        "gold_reward": -2.9503135681152344,
        "kl_divergence": 43.320953369140625,
        "mean_generated_length": 201.78125
    },
    {
        "step": 75,
        "reward": -1.3238334655761719,
        "gold_reward": -2.6178207397460938,
        "kl_divergence": 40.23743438720703,
        "mean_generated_length": 170.25
    },
    {
        "step": 76,
        "reward": -1.8671417236328125,
        "gold_reward": -2.5118560791015625,
        "kl_divergence": 44.68070602416992,
        "mean_generated_length": 188.671875
    },
    {
        "step": 77,
        "reward": -1.3358478546142578,
        "gold_reward": -1.9157752990722656,
        "kl_divergence": 45.423683166503906,
        "mean_generated_length": 210.1875
    },
    {
        "step": 78,
        "reward": 0.280517578125,
        "gold_reward": -1.203125,
        "kl_divergence": 47.19940185546875,
        "mean_generated_length": 184.125
    },
    {
        "step": 79,
        "reward": -2.4111328125,
        "gold_reward": -2.4592132568359375,
        "kl_divergence": 47.37804412841797,
        "mean_generated_length": 182.046875
    },
    {
        "step": 80,
        "reward": -1.1906623840332031,
        "gold_reward": -2.183818817138672,
        "kl_divergence": 44.97309494018555,
        "mean_generated_length": 184.109375
    },
    {
        "step": 81,
        "reward": -1.0930404663085938,
        "gold_reward": -2.0026168823242188,
        "kl_divergence": 41.08612823486328,
        "mean_generated_length": 153.640625
    },
    {
        "step": 82,
        "reward": -1.6489944458007812,
        "gold_reward": -2.439617156982422,
        "kl_divergence": 53.087646484375,
        "mean_generated_length": 192.890625
    },
    {
        "step": 83,
        "reward": -1.4398956298828125,
        "gold_reward": -2.590972900390625,
        "kl_divergence": 47.4095458984375,
        "mean_generated_length": 170.71875
    },
    {
        "step": 84,
        "reward": -2.5660324096679688,
        "gold_reward": -2.83447265625,
        "kl_divergence": 45.95956802368164,
        "mean_generated_length": 166.203125
    },
    {
        "step": 85,
        "reward": -1.8493576049804688,
        "gold_reward": -3.063385009765625,
        "kl_divergence": 50.094078063964844,
        "mean_generated_length": 182.21875
    },
    {
        "step": 86,
        "reward": -1.3493499755859375,
        "gold_reward": -2.086578369140625,
        "kl_divergence": 45.286800384521484,
        "mean_generated_length": 169.59375
    },
    {
        "step": 87,
        "reward": -0.5391902923583984,
        "gold_reward": -2.280242919921875,
        "kl_divergence": 51.12921142578125,
        "mean_generated_length": 179.859375
    },
    {
        "step": 88,
        "reward": -1.2601470947265625,
        "gold_reward": -2.3301239013671875,
        "kl_divergence": 54.04246520996094,
        "mean_generated_length": 181.625
    },
    {
        "step": 89,
        "reward": -0.9102973937988281,
        "gold_reward": -1.73492431640625,
        "kl_divergence": 52.334991455078125,
        "mean_generated_length": 174.140625
    },
    {
        "step": 90,
        "reward": -0.8742446899414062,
        "gold_reward": -2.5911178588867188,
        "kl_divergence": 59.401458740234375,
        "mean_generated_length": 187.109375
    },
    {
        "step": 91,
        "reward": -1.0162334442138672,
        "gold_reward": -2.363300323486328,
        "kl_divergence": 66.63932800292969,
        "mean_generated_length": 201.9375
    },
    {
        "step": 92,
        "reward": -0.01212310791015625,
        "gold_reward": -1.3643324375152588,
        "kl_divergence": 57.31805419921875,
        "mean_generated_length": 172.328125
    },
    {
        "step": 93,
        "reward": -1.0262832641601562,
        "gold_reward": -1.9601669311523438,
        "kl_divergence": 67.82662200927734,
        "mean_generated_length": 216.28125
    },
    {
        "step": 94,
        "reward": -0.38341522216796875,
        "gold_reward": -2.167938232421875,
        "kl_divergence": 72.77650451660156,
        "mean_generated_length": 203.1875
    },
    {
        "step": 95,
        "reward": -0.4308586120605469,
        "gold_reward": -2.0290260314941406,
        "kl_divergence": 79.55181884765625,
        "mean_generated_length": 240.703125
    },
    {
        "step": 96,
        "reward": -0.5057373046875,
        "gold_reward": -2.0589466094970703,
        "kl_divergence": 78.48040008544922,
        "mean_generated_length": 232.484375
    },
    {
        "step": 97,
        "reward": -1.2461423873901367,
        "gold_reward": -2.46087646484375,
        "kl_divergence": 71.45631408691406,
        "mean_generated_length": 242.53125
    },
    {
        "step": 98,
        "reward": -0.20853710174560547,
        "gold_reward": -2.1938133239746094,
        "kl_divergence": 85.95745849609375,
        "mean_generated_length": 235.578125
    },
    {
        "step": 99,
        "reward": -0.6850926876068115,
        "gold_reward": -2.1996026039123535,
        "kl_divergence": 93.15638732910156,
        "mean_generated_length": 251.046875
    },
    {
        "step": 100,
        "reward": -0.25841522216796875,
        "gold_reward": -2.440277099609375,
        "kl_divergence": 90.23043060302734,
        "mean_generated_length": 251.4375
    },
    {
        "step": 101,
        "reward": 0.10866546630859375,
        "gold_reward": -1.832275390625,
        "kl_divergence": 90.978759765625,
        "mean_generated_length": 245.765625
    },
    {
        "step": 102,
        "reward": -0.3407554626464844,
        "gold_reward": -1.8447113037109375,
        "kl_divergence": 81.83366394042969,
        "mean_generated_length": 234.9375
    },
    {
        "step": 103,
        "reward": -0.659088134765625,
        "gold_reward": -2.251983642578125,
        "kl_divergence": 80.39339447021484,
        "mean_generated_length": 234.75
    },
    {
        "step": 104,
        "reward": 0.016744613647460938,
        "gold_reward": -1.9423446655273438,
        "kl_divergence": 96.29122924804688,
        "mean_generated_length": 259.78125
    },
    {
        "step": 105,
        "reward": -0.8868255615234375,
        "gold_reward": -2.429107666015625,
        "kl_divergence": 85.73666381835938,
        "mean_generated_length": 226.734375
    },
    {
        "step": 106,
        "reward": -0.977508544921875,
        "gold_reward": -2.6514244079589844,
        "kl_divergence": 88.22383117675781,
        "mean_generated_length": 234.234375
    },
    {
        "step": 107,
        "reward": -0.1094512939453125,
        "gold_reward": -2.08612060546875,
        "kl_divergence": 86.43896484375,
        "mean_generated_length": 212.03125
    },
    {
        "step": 108,
        "reward": -0.4498443603515625,
        "gold_reward": -2.4271240234375,
        "kl_divergence": 85.67219543457031,
        "mean_generated_length": 219.0625
    },
    {
        "step": 109,
        "reward": -0.35485267639160156,
        "gold_reward": -1.806492805480957,
        "kl_divergence": 89.12002563476562,
        "mean_generated_length": 213.875
    },
    {
        "step": 110,
        "reward": 0.2747631072998047,
        "gold_reward": -1.9865875244140625,
        "kl_divergence": 83.48920440673828,
        "mean_generated_length": 214.984375
    },
    {
        "step": 111,
        "reward": -0.6998977661132812,
        "gold_reward": -2.157958984375,
        "kl_divergence": 73.26374053955078,
        "mean_generated_length": 193.890625
    },
    {
        "step": 112,
        "reward": 0.44042325019836426,
        "gold_reward": -1.7349786758422852,
        "kl_divergence": 71.45101165771484,
        "mean_generated_length": 183.890625
    },
    {
        "step": 113,
        "reward": -0.29451847076416016,
        "gold_reward": -1.5683631896972656,
        "kl_divergence": 77.83120727539062,
        "mean_generated_length": 197.015625
    },
    {
        "step": 114,
        "reward": -0.0665740966796875,
        "gold_reward": -1.6485099792480469,
        "kl_divergence": 73.87471771240234,
        "mean_generated_length": 181.453125
    },
    {
        "step": 115,
        "reward": -0.26416015625,
        "gold_reward": -2.5449185371398926,
        "kl_divergence": 71.1187744140625,
        "mean_generated_length": 204.96875
    },
    {
        "step": 116,
        "reward": -0.3592700958251953,
        "gold_reward": -2.3404693603515625,
        "kl_divergence": 71.73320770263672,
        "mean_generated_length": 184.15625
    },
    {
        "step": 117,
        "reward": -0.4043083190917969,
        "gold_reward": -2.563873291015625,
        "kl_divergence": 77.8588638305664,
        "mean_generated_length": 198.828125
    },
    {
        "step": 118,
        "reward": -0.29481983184814453,
        "gold_reward": -2.3596630096435547,
        "kl_divergence": 73.9090576171875,
        "mean_generated_length": 178.640625
    },
    {
        "step": 119,
        "reward": -0.7920265197753906,
        "gold_reward": -2.2755889892578125,
        "kl_divergence": 83.22232818603516,
        "mean_generated_length": 196.5625
    },
    {
        "step": 120,
        "reward": -0.5926437377929688,
        "gold_reward": -2.1797027587890625,
        "kl_divergence": 80.39546203613281,
        "mean_generated_length": 196.1875
    },
    {
        "step": 121,
        "reward": 0.191497802734375,
        "gold_reward": -1.9774647951126099,
        "kl_divergence": 81.5051040649414,
        "mean_generated_length": 200.203125
    },
    {
        "step": 122,
        "reward": -0.4511117935180664,
        "gold_reward": -2.126953125,
        "kl_divergence": 78.09161376953125,
        "mean_generated_length": 190.078125
    },
    {
        "step": 123,
        "reward": 0.14480364322662354,
        "gold_reward": -2.1149063110351562,
        "kl_divergence": 79.84701538085938,
        "mean_generated_length": 191.0625
    },
    {
        "step": 124,
        "reward": 0.19193267822265625,
        "gold_reward": -2.3695144653320312,
        "kl_divergence": 81.3416519165039,
        "mean_generated_length": 185.125
    },
    {
        "step": 125,
        "reward": -0.25498485565185547,
        "gold_reward": -2.018665313720703,
        "kl_divergence": 82.572998046875,
        "mean_generated_length": 177.4375
    },
    {
        "step": 126,
        "reward": 0.009052276611328125,
        "gold_reward": -2.11065673828125,
        "kl_divergence": 77.86406707763672,
        "mean_generated_length": 176.234375
    },
    {
        "step": 127,
        "reward": 0.07164764404296875,
        "gold_reward": -1.7317657470703125,
        "kl_divergence": 82.2120590209961,
        "mean_generated_length": 170.6875
    },
    {
        "step": 128,
        "reward": -0.4375762939453125,
        "gold_reward": -2.0599822998046875,
        "kl_divergence": 78.46863555908203,
        "mean_generated_length": 179.109375
    },
    {
        "step": 129,
        "reward": 0.2513427734375,
        "gold_reward": -1.3447113037109375,
        "kl_divergence": 70.24967956542969,
        "mean_generated_length": 157.109375
    },
    {
        "step": 130,
        "reward": -0.41198158264160156,
        "gold_reward": -1.8790779113769531,
        "kl_divergence": 77.84026336669922,
        "mean_generated_length": 163.859375
    },
    {
        "step": 131,
        "reward": -0.49295806884765625,
        "gold_reward": -2.5269775390625,
        "kl_divergence": 78.80377960205078,
        "mean_generated_length": 165.734375
    },
    {
        "step": 132,
        "reward": -1.0188217163085938,
        "gold_reward": -2.164813995361328,
        "kl_divergence": 75.71223449707031,
        "mean_generated_length": 168.609375
    },
    {
        "step": 133,
        "reward": -0.15866506099700928,
        "gold_reward": -2.501624584197998,
        "kl_divergence": 69.06425476074219,
        "mean_generated_length": 156.890625
    },
    {
        "step": 134,
        "reward": 0.2412548065185547,
        "gold_reward": -1.8227920532226562,
        "kl_divergence": 76.18132781982422,
        "mean_generated_length": 170.125
    },
    {
        "step": 135,
        "reward": -0.7201919555664062,
        "gold_reward": -2.25335693359375,
        "kl_divergence": 73.62548065185547,
        "mean_generated_length": 165.625
    },
    {
        "step": 136,
        "reward": -0.3922996520996094,
        "gold_reward": -1.7792930603027344,
        "kl_divergence": 82.04087829589844,
        "mean_generated_length": 154.6875
    },
    {
        "step": 137,
        "reward": 0.2225494384765625,
        "gold_reward": -1.3513622283935547,
        "kl_divergence": 76.02734375,
        "mean_generated_length": 158.171875
    },
    {
        "step": 138,
        "reward": -0.35519617795944214,
        "gold_reward": -2.1542587280273438,
        "kl_divergence": 75.37236022949219,
        "mean_generated_length": 153.640625
    },
    {
        "step": 139,
        "reward": -0.4247264862060547,
        "gold_reward": -1.6397819519042969,
        "kl_divergence": 75.16484069824219,
        "mean_generated_length": 152.375
    },
    {
        "step": 140,
        "reward": 0.6669654846191406,
        "gold_reward": -1.8119430541992188,
        "kl_divergence": 86.63627624511719,
        "mean_generated_length": 178.671875
    },
    {
        "step": 141,
        "reward": 0.18283438682556152,
        "gold_reward": -1.6264724731445312,
        "kl_divergence": 75.70399475097656,
        "mean_generated_length": 157.125
    },
    {
        "step": 142,
        "reward": -0.3766899108886719,
        "gold_reward": -2.108579635620117,
        "kl_divergence": 77.9445571899414,
        "mean_generated_length": 155.109375
    },
    {
        "step": 143,
        "reward": -0.3152923583984375,
        "gold_reward": -1.6045303344726562,
        "kl_divergence": 86.47161865234375,
        "mean_generated_length": 167.5
    },
    {
        "step": 144,
        "reward": 0.5034332275390625,
        "gold_reward": -2.09429931640625,
        "kl_divergence": 83.455078125,
        "mean_generated_length": 165.6875
    },
    {
        "step": 145,
        "reward": 0.388427734375,
        "gold_reward": -1.7212142944335938,
        "kl_divergence": 86.21295166015625,
        "mean_generated_length": 167.8125
    },
    {
        "step": 146,
        "reward": 0.5609540939331055,
        "gold_reward": -1.9829212427139282,
        "kl_divergence": 94.4969482421875,
        "mean_generated_length": 190.65625
    },
    {
        "step": 147,
        "reward": 1.6770477294921875,
        "gold_reward": -1.0826187133789062,
        "kl_divergence": 89.84625244140625,
        "mean_generated_length": 160.25
    },
    {
        "step": 148,
        "reward": 0.9715347290039062,
        "gold_reward": -1.5969886779785156,
        "kl_divergence": 98.7895278930664,
        "mean_generated_length": 181.5625
    },
    {
        "step": 149,
        "reward": 1.6161861419677734,
        "gold_reward": -1.7794206142425537,
        "kl_divergence": 96.19320678710938,
        "mean_generated_length": 169.765625
    },
    {
        "step": 150,
        "reward": 0.5773124694824219,
        "gold_reward": -1.636758804321289,
        "kl_divergence": 100.8565444946289,
        "mean_generated_length": 178.359375
    },
    {
        "step": 151,
        "reward": 0.6121368408203125,
        "gold_reward": -1.861288070678711,
        "kl_divergence": 98.11969757080078,
        "mean_generated_length": 170.359375
    },
    {
        "step": 152,
        "reward": 0.2298736572265625,
        "gold_reward": -1.487701416015625,
        "kl_divergence": 103.2221450805664,
        "mean_generated_length": 173.34375
    },
    {
        "step": 153,
        "reward": 0.795989990234375,
        "gold_reward": -2.0505332946777344,
        "kl_divergence": 107.543212890625,
        "mean_generated_length": 200.25
    },
    {
        "step": 154,
        "reward": 1.6601715087890625,
        "gold_reward": -1.70355224609375,
        "kl_divergence": 109.69497680664062,
        "mean_generated_length": 181.609375
    },
    {
        "step": 155,
        "reward": 0.9975814819335938,
        "gold_reward": -1.901275634765625,
        "kl_divergence": 117.4106674194336,
        "mean_generated_length": 196.734375
    },
    {
        "step": 156,
        "reward": 2.0386886596679688,
        "gold_reward": -1.0738983154296875,
        "kl_divergence": 120.74749755859375,
        "mean_generated_length": 214.640625
    },
    {
        "step": 157,
        "reward": 0.9664306640625,
        "gold_reward": -0.5869140625,
        "kl_divergence": 114.34153747558594,
        "mean_generated_length": 200.375
    },
    {
        "step": 158,
        "reward": 1.5731658935546875,
        "gold_reward": -1.3445053100585938,
        "kl_divergence": 129.21815490722656,
        "mean_generated_length": 210.046875
    },
    {
        "step": 159,
        "reward": 1.5452537536621094,
        "gold_reward": -1.234710693359375,
        "kl_divergence": 119.90299224853516,
        "mean_generated_length": 200.953125
    },
    {
        "step": 160,
        "reward": 2.000762939453125,
        "gold_reward": -0.9800949096679688,
        "kl_divergence": 119.7022933959961,
        "mean_generated_length": 186.671875
    },
    {
        "step": 161,
        "reward": 1.9204559326171875,
        "gold_reward": -1.1936187744140625,
        "kl_divergence": 130.4779052734375,
        "mean_generated_length": 215.453125
    },
    {
        "step": 162,
        "reward": 2.1000213623046875,
        "gold_reward": -1.2742729187011719,
        "kl_divergence": 130.16683959960938,
        "mean_generated_length": 198.390625
    },
    {
        "step": 163,
        "reward": 0.9483261108398438,
        "gold_reward": -1.6820297241210938,
        "kl_divergence": 128.0819091796875,
        "mean_generated_length": 205.28125
    },
    {
        "step": 164,
        "reward": 1.5963761806488037,
        "gold_reward": -1.9044189453125,
        "kl_divergence": 123.95337677001953,
        "mean_generated_length": 208.265625
    },
    {
        "step": 165,
        "reward": 0.9479827880859375,
        "gold_reward": -1.4831931591033936,
        "kl_divergence": 127.25546264648438,
        "mean_generated_length": 210.59375
    },
    {
        "step": 166,
        "reward": 2.362194061279297,
        "gold_reward": -1.4132614135742188,
        "kl_divergence": 131.56878662109375,
        "mean_generated_length": 214.671875
    },
    {
        "step": 167,
        "reward": 1.9558353424072266,
        "gold_reward": -1.2816638946533203,
        "kl_divergence": 134.9258575439453,
        "mean_generated_length": 219.390625
    },
    {
        "step": 168,
        "reward": 2.1696929931640625,
        "gold_reward": -0.6043224334716797,
        "kl_divergence": 124.27301025390625,
        "mean_generated_length": 196.109375
    },
    {
        "step": 169,
        "reward": 2.505207061767578,
        "gold_reward": -1.2725868225097656,
        "kl_divergence": 133.09378051757812,
        "mean_generated_length": 205.40625
    },
    {
        "step": 170,
        "reward": 1.7966437339782715,
        "gold_reward": -1.5088043212890625,
        "kl_divergence": 135.76397705078125,
        "mean_generated_length": 219.578125
    },
    {
        "step": 171,
        "reward": 2.185596466064453,
        "gold_reward": -0.4868602752685547,
        "kl_divergence": 127.42538452148438,
        "mean_generated_length": 190.75
    },
    {
        "step": 172,
        "reward": 1.6444244384765625,
        "gold_reward": -1.33294677734375,
        "kl_divergence": 145.12338256835938,
        "mean_generated_length": 221.28125
    },
    {
        "step": 173,
        "reward": 2.5575408935546875,
        "gold_reward": -1.0358600616455078,
        "kl_divergence": 148.89779663085938,
        "mean_generated_length": 202.328125
    },
    {
        "step": 174,
        "reward": 1.585479736328125,
        "gold_reward": -1.3101882934570312,
        "kl_divergence": 151.60140991210938,
        "mean_generated_length": 232.828125
    },
    {
        "step": 175,
        "reward": 2.6872806549072266,
        "gold_reward": -1.0225067138671875,
        "kl_divergence": 142.03201293945312,
        "mean_generated_length": 220.875
    },
    {
        "step": 176,
        "reward": 2.099578857421875,
        "gold_reward": -1.3815040588378906,
        "kl_divergence": 147.36761474609375,
        "mean_generated_length": 232.046875
    },
    {
        "step": 177,
        "reward": 3.04400634765625,
        "gold_reward": -1.503533124923706,
        "kl_divergence": 154.0433349609375,
        "mean_generated_length": 219.421875
    },
    {
        "step": 178,
        "reward": 2.263427734375,
        "gold_reward": -1.0050735473632812,
        "kl_divergence": 163.38211059570312,
        "mean_generated_length": 241.390625
    },
    {
        "step": 179,
        "reward": 3.090972900390625,
        "gold_reward": -1.4579315185546875,
        "kl_divergence": 138.13075256347656,
        "mean_generated_length": 203.296875
    },
    {
        "step": 180,
        "reward": 2.06927490234375,
        "gold_reward": -1.1420555114746094,
        "kl_divergence": 148.4680938720703,
        "mean_generated_length": 212.921875
    },
    {
        "step": 181,
        "reward": 3.0132293701171875,
        "gold_reward": -1.002058982849121,
        "kl_divergence": 141.3360137939453,
        "mean_generated_length": 210.4375
    },
    {
        "step": 182,
        "reward": 2.1469497680664062,
        "gold_reward": -0.8863029479980469,
        "kl_divergence": 132.4537353515625,
        "mean_generated_length": 189.90625
    },
    {
        "step": 183,
        "reward": 1.9239349365234375,
        "gold_reward": -1.2359695434570312,
        "kl_divergence": 147.6609344482422,
        "mean_generated_length": 217.640625
    },
    {
        "step": 184,
        "reward": 1.8456296920776367,
        "gold_reward": -1.0992279052734375,
        "kl_divergence": 139.63511657714844,
        "mean_generated_length": 189.0
    },
    {
        "step": 185,
        "reward": 2.3528060913085938,
        "gold_reward": -1.5164642333984375,
        "kl_divergence": 132.1162567138672,
        "mean_generated_length": 191.5625
    },
    {
        "step": 186,
        "reward": 2.0794525146484375,
        "gold_reward": -1.0885543823242188,
        "kl_divergence": 125.41217041015625,
        "mean_generated_length": 184.203125
    },
    {
        "step": 187,
        "reward": 2.4678955078125,
        "gold_reward": -1.3523712158203125,
        "kl_divergence": 129.28558349609375,
        "mean_generated_length": 179.296875
    },
    {
        "step": 188,
        "reward": 1.7630290985107422,
        "gold_reward": -0.6541290283203125,
        "kl_divergence": 127.81633758544922,
        "mean_generated_length": 174.6875
    },
    {
        "step": 189,
        "reward": 1.7859764099121094,
        "gold_reward": -1.4942283630371094,
        "kl_divergence": 120.66053009033203,
        "mean_generated_length": 179.09375
    },
    {
        "step": 190,
        "reward": 1.7560615539550781,
        "gold_reward": -1.1102142333984375,
        "kl_divergence": 123.1972885131836,
        "mean_generated_length": 177.796875
    },
    {
        "step": 191,
        "reward": 2.601409912109375,
        "gold_reward": -0.7494029998779297,
        "kl_divergence": 119.4699478149414,
        "mean_generated_length": 169.546875
    },
    {
        "step": 192,
        "reward": 2.4713706970214844,
        "gold_reward": -0.83056640625,
        "kl_divergence": 132.92234802246094,
        "mean_generated_length": 195.90625
    },
    {
        "step": 193,
        "reward": 2.196319580078125,
        "gold_reward": -0.7210159301757812,
        "kl_divergence": 129.44017028808594,
        "mean_generated_length": 178.515625
    },
    {
        "step": 194,
        "reward": 2.94183349609375,
        "gold_reward": -1.5356979370117188,
        "kl_divergence": 134.0646514892578,
        "mean_generated_length": 193.40625
    },
    {
        "step": 195,
        "reward": 2.2979297637939453,
        "gold_reward": -1.34295654296875,
        "kl_divergence": 135.71322631835938,
        "mean_generated_length": 181.890625
    },
    {
        "step": 196,
        "reward": 1.9607086181640625,
        "gold_reward": -1.5424575805664062,
        "kl_divergence": 136.12890625,
        "mean_generated_length": 192.0625
    },
    {
        "step": 197,
        "reward": 2.0003128051757812,
        "gold_reward": -1.1274566650390625,
        "kl_divergence": 122.62837982177734,
        "mean_generated_length": 176.8125
    },
    {
        "step": 198,
        "reward": 1.619415283203125,
        "gold_reward": -1.283583641052246,
        "kl_divergence": 129.9088897705078,
        "mean_generated_length": 174.21875
    },
    {
        "step": 199,
        "reward": 1.8158111572265625,
        "gold_reward": -1.4582653045654297,
        "kl_divergence": 133.53208923339844,
        "mean_generated_length": 188.140625
    },
    {
        "step": 200,
        "reward": 2.2314682006835938,
        "gold_reward": -1.0975003242492676,
        "kl_divergence": 129.34303283691406,
        "mean_generated_length": 193.421875
    },
    {
        "step": 201,
        "reward": 2.175593376159668,
        "gold_reward": -1.0813369750976562,
        "kl_divergence": 136.10943603515625,
        "mean_generated_length": 202.140625
    },
    {
        "step": 202,
        "reward": 2.31854248046875,
        "gold_reward": -1.26349937915802,
        "kl_divergence": 133.58872985839844,
        "mean_generated_length": 185.359375
    },
    {
        "step": 203,
        "reward": 2.650177001953125,
        "gold_reward": -1.2541160583496094,
        "kl_divergence": 139.1068115234375,
        "mean_generated_length": 195.609375
    },
    {
        "step": 204,
        "reward": 2.6360931396484375,
        "gold_reward": -1.0901329517364502,
        "kl_divergence": 143.5594940185547,
        "mean_generated_length": 191.734375
    },
    {
        "step": 205,
        "reward": 2.5067977905273438,
        "gold_reward": -1.0294442176818848,
        "kl_divergence": 142.38336181640625,
        "mean_generated_length": 186.015625
    },
    {
        "step": 206,
        "reward": 2.938323974609375,
        "gold_reward": -0.7598648071289062,
        "kl_divergence": 157.5435791015625,
        "mean_generated_length": 200.796875
    },
    {
        "step": 207,
        "reward": 2.576122283935547,
        "gold_reward": -1.1015472412109375,
        "kl_divergence": 142.1717987060547,
        "mean_generated_length": 201.671875
    },
    {
        "step": 208,
        "reward": 2.6665420532226562,
        "gold_reward": -0.3285102844238281,
        "kl_divergence": 137.5343780517578,
        "mean_generated_length": 183.109375
    },
    {
        "step": 209,
        "reward": 2.7330095767974854,
        "gold_reward": -0.7996063232421875,
        "kl_divergence": 140.0809783935547,
        "mean_generated_length": 199.390625
    },
    {
        "step": 210,
        "reward": 2.6996774673461914,
        "gold_reward": -1.206831932067871,
        "kl_divergence": 149.985595703125,
        "mean_generated_length": 187.15625
    },
    {
        "step": 211,
        "reward": 1.9857635498046875,
        "gold_reward": -1.3674736022949219,
        "kl_divergence": 140.89894104003906,
        "mean_generated_length": 195.0625
    },
    {
        "step": 212,
        "reward": 2.64093017578125,
        "gold_reward": -1.33685302734375,
        "kl_divergence": 133.3843994140625,
        "mean_generated_length": 182.484375
    },
    {
        "step": 213,
        "reward": 2.415964126586914,
        "gold_reward": -1.0248146057128906,
        "kl_divergence": 143.77890014648438,
        "mean_generated_length": 199.4375
    },
    {
        "step": 214,
        "reward": 2.3836212158203125,
        "gold_reward": -0.8927040100097656,
        "kl_divergence": 143.37091064453125,
        "mean_generated_length": 198.359375
    },
    {
        "step": 215,
        "reward": 2.5923919677734375,
        "gold_reward": -0.6181259155273438,
        "kl_divergence": 140.6875762939453,
        "mean_generated_length": 185.609375
    },
    {
        "step": 216,
        "reward": 2.831096649169922,
        "gold_reward": -0.712646484375,
        "kl_divergence": 142.84555053710938,
        "mean_generated_length": 197.359375
    },
    {
        "step": 217,
        "reward": 2.3993988037109375,
        "gold_reward": -0.9022083282470703,
        "kl_divergence": 139.53846740722656,
        "mean_generated_length": 194.734375
    },
    {
        "step": 218,
        "reward": 2.6106834411621094,
        "gold_reward": -0.71881103515625,
        "kl_divergence": 144.61676025390625,
        "mean_generated_length": 192.53125
    },
    {
        "step": 219,
        "reward": 2.7614083290100098,
        "gold_reward": -0.976071834564209,
        "kl_divergence": 148.13015747070312,
        "mean_generated_length": 204.765625
    },
    {
        "step": 220,
        "reward": 2.2662906646728516,
        "gold_reward": -1.0226850509643555,
        "kl_divergence": 140.7206268310547,
        "mean_generated_length": 187.9375
    },
    {
        "step": 221,
        "reward": 1.8648130893707275,
        "gold_reward": -1.3272323608398438,
        "kl_divergence": 143.4012451171875,
        "mean_generated_length": 188.15625
    },
    {
        "step": 222,
        "reward": 1.989621639251709,
        "gold_reward": -0.8003597259521484,
        "kl_divergence": 132.9607391357422,
        "mean_generated_length": 180.890625
    },
    {
        "step": 223,
        "reward": 2.3898162841796875,
        "gold_reward": -1.2594337463378906,
        "kl_divergence": 133.97744750976562,
        "mean_generated_length": 186.109375
    },
    {
        "step": 224,
        "reward": 2.5114541053771973,
        "gold_reward": -0.6968507766723633,
        "kl_divergence": 132.4486083984375,
        "mean_generated_length": 184.109375
    },
    {
        "step": 225,
        "reward": 3.165740966796875,
        "gold_reward": -1.3287277221679688,
        "kl_divergence": 149.25120544433594,
        "mean_generated_length": 208.171875
    },
    {
        "step": 226,
        "reward": 3.3830642700195312,
        "gold_reward": -0.5806732177734375,
        "kl_divergence": 138.66049194335938,
        "mean_generated_length": 177.03125
    },
    {
        "step": 227,
        "reward": 2.237006187438965,
        "gold_reward": -1.2123782634735107,
        "kl_divergence": 148.24459838867188,
        "mean_generated_length": 194.03125
    },
    {
        "step": 228,
        "reward": 2.4596633911132812,
        "gold_reward": -1.425018310546875,
        "kl_divergence": 135.74832153320312,
        "mean_generated_length": 177.640625
    },
    {
        "step": 229,
        "reward": 2.1393890380859375,
        "gold_reward": -1.3033599853515625,
        "kl_divergence": 149.85633850097656,
        "mean_generated_length": 201.28125
    },
    {
        "step": 230,
        "reward": 2.32073974609375,
        "gold_reward": -1.2820816040039062,
        "kl_divergence": 142.91461181640625,
        "mean_generated_length": 196.921875
    },
    {
        "step": 231,
        "reward": 1.9859619140625,
        "gold_reward": -0.8750841617584229,
        "kl_divergence": 147.33151245117188,
        "mean_generated_length": 202.515625
    },
    {
        "step": 232,
        "reward": 2.5035552978515625,
        "gold_reward": -1.3996200561523438,
        "kl_divergence": 145.94537353515625,
        "mean_generated_length": 213.828125
    },
    {
        "step": 233,
        "reward": 2.7308807373046875,
        "gold_reward": -1.2937164306640625,
        "kl_divergence": 141.56854248046875,
        "mean_generated_length": 196.890625
    },
    {
        "step": 234,
        "reward": 2.158416748046875,
        "gold_reward": -1.1578178405761719,
        "kl_divergence": 145.05755615234375,
        "mean_generated_length": 208.546875
    },
    {
        "step": 235,
        "reward": 2.851374626159668,
        "gold_reward": -0.9185051918029785,
        "kl_divergence": 144.24777221679688,
        "mean_generated_length": 210.46875
    },
    {
        "step": 236,
        "reward": 2.88134765625,
        "gold_reward": -0.857513427734375,
        "kl_divergence": 195.36334228515625,
        "mean_generated_length": 254.375
    }
]