[
    {
        "step": 0,
        "reward": -3.7819061279296875,
        "gold_reward": -3.13934326171875,
        "kl_divergence": 0.0,
        "mean_generated_length": 133.28125
    },
    {
        "step": 1,
        "reward": -3.4259490966796875,
        "gold_reward": -2.917926788330078,
        "kl_divergence": 0.0,
        "mean_generated_length": 127.515625
    },
    {
        "step": 2,
        "reward": -3.362152099609375,
        "gold_reward": -3.0334415435791016,
        "kl_divergence": -0.06499310582876205,
        "mean_generated_length": 89.453125
    },
    {
        "step": 3,
        "reward": -4.011146545410156,
        "gold_reward": -3.1636962890625,
        "kl_divergence": 0.05410473793745041,
        "mean_generated_length": 141.796875
    },
    {
        "step": 4,
        "reward": -3.1777725219726562,
        "gold_reward": -2.945404052734375,
        "kl_divergence": -0.07184137403964996,
        "mean_generated_length": 126.1875
    },
    {
        "step": 5,
        "reward": -4.1084747314453125,
        "gold_reward": -3.1265411376953125,
        "kl_divergence": 0.014171566814184189,
        "mean_generated_length": 119.09375
    },
    {
        "step": 6,
        "reward": -2.7798919677734375,
        "gold_reward": -3.2289657592773438,
        "kl_divergence": 0.19619637727737427,
        "mean_generated_length": 136.5
    },
    {
        "step": 7,
        "reward": -3.7107086181640625,
        "gold_reward": -2.837432861328125,
        "kl_divergence": 0.1866658329963684,
        "mean_generated_length": 129.296875
    },
    {
        "step": 8,
        "reward": -2.7465591430664062,
        "gold_reward": -3.1210575103759766,
        "kl_divergence": 0.5856812596321106,
        "mean_generated_length": 153.84375
    },
    {
        "step": 9,
        "reward": -2.7139015197753906,
        "gold_reward": -2.580841064453125,
        "kl_divergence": 1.1961768865585327,
        "mean_generated_length": 142.734375
    },
    {
        "step": 10,
        "reward": -2.037586212158203,
        "gold_reward": -1.7804317474365234,
        "kl_divergence": 1.4198359251022339,
        "mean_generated_length": 139.15625
    },
    {
        "step": 11,
        "reward": -2.3923606872558594,
        "gold_reward": -3.0848846435546875,
        "kl_divergence": 1.7001953125,
        "mean_generated_length": 150.546875
    },
    {
        "step": 12,
        "reward": -2.7394018173217773,
        "gold_reward": -2.464508056640625,
        "kl_divergence": 2.841367483139038,
        "mean_generated_length": 159.765625
    },
    {
        "step": 13,
        "reward": -2.2020416259765625,
        "gold_reward": -2.0994415283203125,
        "kl_divergence": 2.9929888248443604,
        "mean_generated_length": 137.8125
    },
    {
        "step": 14,
        "reward": -1.8809890747070312,
        "gold_reward": -2.22802734375,
        "kl_divergence": 4.610948085784912,
        "mean_generated_length": 174.390625
    },
    {
        "step": 15,
        "reward": -2.1933212280273438,
        "gold_reward": -2.491729736328125,
        "kl_divergence": 5.243877410888672,
        "mean_generated_length": 171.28125
    },
    {
        "step": 16,
        "reward": -2.4634971618652344,
        "gold_reward": -2.1302261352539062,
        "kl_divergence": 6.973215103149414,
        "mean_generated_length": 174.9375
    },
    {
        "step": 17,
        "reward": -1.1274662017822266,
        "gold_reward": -2.068359375,
        "kl_divergence": 8.27392578125,
        "mean_generated_length": 189.203125
    },
    {
        "step": 18,
        "reward": -2.4135055541992188,
        "gold_reward": -2.6407241821289062,
        "kl_divergence": 10.21733283996582,
        "mean_generated_length": 196.984375
    },
    {
        "step": 19,
        "reward": -1.8634605407714844,
        "gold_reward": -2.4521751403808594,
        "kl_divergence": 11.341580390930176,
        "mean_generated_length": 208.28125
    },
    {
        "step": 20,
        "reward": -2.5771560668945312,
        "gold_reward": -2.6423959732055664,
        "kl_divergence": 14.357635498046875,
        "mean_generated_length": 234.015625
    },
    {
        "step": 21,
        "reward": -1.5815505981445312,
        "gold_reward": -2.46405029296875,
        "kl_divergence": 18.070518493652344,
        "mean_generated_length": 231.9375
    },
    {
        "step": 22,
        "reward": -1.7590179443359375,
        "gold_reward": -2.1793463230133057,
        "kl_divergence": 19.538564682006836,
        "mean_generated_length": 232.46875
    },
    {
        "step": 23,
        "reward": -1.160247802734375,
        "gold_reward": -2.1263275146484375,
        "kl_divergence": 22.203569412231445,
        "mean_generated_length": 243.59375
    },
    {
        "step": 24,
        "reward": -1.550079345703125,
        "gold_reward": -2.5511474609375,
        "kl_divergence": 27.717927932739258,
        "mean_generated_length": 251.484375
    },
    {
        "step": 25,
        "reward": -1.344085693359375,
        "gold_reward": -2.5193099975585938,
        "kl_divergence": 33.122283935546875,
        "mean_generated_length": 292.09375
    },
    {
        "step": 26,
        "reward": -1.5054397583007812,
        "gold_reward": -2.66387939453125,
        "kl_divergence": 32.312713623046875,
        "mean_generated_length": 282.125
    },
    {
        "step": 27,
        "reward": -1.7863082885742188,
        "gold_reward": -2.8335304260253906,
        "kl_divergence": 36.52188491821289,
        "mean_generated_length": 320.140625
    },
    {
        "step": 28,
        "reward": -1.2134628295898438,
        "gold_reward": -2.646820068359375,
        "kl_divergence": 41.714576721191406,
        "mean_generated_length": 281.921875
    },
    {
        "step": 29,
        "reward": -1.4321441650390625,
        "gold_reward": -2.578948974609375,
        "kl_divergence": 44.443973541259766,
        "mean_generated_length": 320.875
    },
    {
        "step": 30,
        "reward": -1.8914718627929688,
        "gold_reward": -2.5095977783203125,
        "kl_divergence": 52.73116683959961,
        "mean_generated_length": 330.5625
    },
    {
        "step": 31,
        "reward": -1.2435660362243652,
        "gold_reward": -2.7027359008789062,
        "kl_divergence": 45.111873626708984,
        "mean_generated_length": 309.390625
    },
    {
        "step": 32,
        "reward": -1.7840728759765625,
        "gold_reward": -2.4190673828125,
        "kl_divergence": 51.12362289428711,
        "mean_generated_length": 293.265625
    },
    {
        "step": 33,
        "reward": -0.8274612426757812,
        "gold_reward": -2.478792190551758,
        "kl_divergence": 49.1498908996582,
        "mean_generated_length": 309.984375
    },
    {
        "step": 34,
        "reward": -1.1648616790771484,
        "gold_reward": -2.5179481506347656,
        "kl_divergence": 55.4298095703125,
        "mean_generated_length": 324.171875
    },
    {
        "step": 35,
        "reward": -1.8104171752929688,
        "gold_reward": -2.468830108642578,
        "kl_divergence": 59.64366149902344,
        "mean_generated_length": 348.609375
    },
    {
        "step": 36,
        "reward": -1.2756080627441406,
        "gold_reward": -3.0219955444335938,
        "kl_divergence": 45.940731048583984,
        "mean_generated_length": 317.359375
    },
    {
        "step": 37,
        "reward": -1.7125015258789062,
        "gold_reward": -2.5202865600585938,
        "kl_divergence": 56.00823211669922,
        "mean_generated_length": 294.25
    },
    {
        "step": 38,
        "reward": -1.9674301147460938,
        "gold_reward": -2.762939453125,
        "kl_divergence": 57.53638458251953,
        "mean_generated_length": 324.703125
    },
    {
        "step": 39,
        "reward": -1.7243719100952148,
        "gold_reward": -2.6807098388671875,
        "kl_divergence": 53.23240661621094,
        "mean_generated_length": 314.484375
    },
    {
        "step": 40,
        "reward": -1.3525724411010742,
        "gold_reward": -2.8457489013671875,
        "kl_divergence": 55.082275390625,
        "mean_generated_length": 316.84375
    },
    {
        "step": 41,
        "reward": -1.393951416015625,
        "gold_reward": -2.271728515625,
        "kl_divergence": 54.87323760986328,
        "mean_generated_length": 316.171875
    },
    {
        "step": 42,
        "reward": -1.71881103515625,
        "gold_reward": -2.5066423416137695,
        "kl_divergence": 51.633602142333984,
        "mean_generated_length": 290.0625
    },
    {
        "step": 43,
        "reward": -1.6153526306152344,
        "gold_reward": -2.6076736450195312,
        "kl_divergence": 47.767513275146484,
        "mean_generated_length": 292.703125
    },
    {
        "step": 44,
        "reward": -1.1065216064453125,
        "gold_reward": -2.298879623413086,
        "kl_divergence": 47.66518020629883,
        "mean_generated_length": 276.734375
    },
    {
        "step": 45,
        "reward": -0.9610137939453125,
        "gold_reward": -2.4284591674804688,
        "kl_divergence": 41.67965316772461,
        "mean_generated_length": 255.3125
    },
    {
        "step": 46,
        "reward": -1.0923385620117188,
        "gold_reward": -2.2120513916015625,
        "kl_divergence": 49.561378479003906,
        "mean_generated_length": 257.53125
    },
    {
        "step": 47,
        "reward": -1.1249456405639648,
        "gold_reward": -2.3770370483398438,
        "kl_divergence": 40.720176696777344,
        "mean_generated_length": 238.96875
    },
    {
        "step": 48,
        "reward": -0.5675811767578125,
        "gold_reward": -1.6178741455078125,
        "kl_divergence": 42.72877883911133,
        "mean_generated_length": 233.34375
    },
    {
        "step": 49,
        "reward": -1.5546398162841797,
        "gold_reward": -2.19873046875,
        "kl_divergence": 40.42212677001953,
        "mean_generated_length": 246.234375
    },
    {
        "step": 50,
        "reward": -0.46677398681640625,
        "gold_reward": -1.4597015380859375,
        "kl_divergence": 37.645259857177734,
        "mean_generated_length": 207.6875
    },
    {
        "step": 51,
        "reward": -0.981048583984375,
        "gold_reward": -2.255584716796875,
        "kl_divergence": 42.463680267333984,
        "mean_generated_length": 254.84375
    },
    {
        "step": 52,
        "reward": -0.5739288330078125,
        "gold_reward": -2.100322723388672,
        "kl_divergence": 40.147098541259766,
        "mean_generated_length": 232.734375
    },
    {
        "step": 53,
        "reward": -1.4530717134475708,
        "gold_reward": -2.3147506713867188,
        "kl_divergence": 39.87523651123047,
        "mean_generated_length": 237.0
    },
    {
        "step": 54,
        "reward": -1.3909893035888672,
        "gold_reward": -2.534597396850586,
        "kl_divergence": 37.18370056152344,
        "mean_generated_length": 228.640625
    },
    {
        "step": 55,
        "reward": -0.2645416259765625,
        "gold_reward": -1.8304901123046875,
        "kl_divergence": 43.17755889892578,
        "mean_generated_length": 239.546875
    },
    {
        "step": 56,
        "reward": -1.2626152038574219,
        "gold_reward": -2.417581558227539,
        "kl_divergence": 42.49889373779297,
        "mean_generated_length": 236.9375
    },
    {
        "step": 57,
        "reward": -1.3262062072753906,
        "gold_reward": -2.0815582275390625,
        "kl_divergence": 45.65571212768555,
        "mean_generated_length": 231.734375
    },
    {
        "step": 58,
        "reward": -0.854034423828125,
        "gold_reward": -1.770843505859375,
        "kl_divergence": 40.8856201171875,
        "mean_generated_length": 219.609375
    },
    {
        "step": 59,
        "reward": -1.006744384765625,
        "gold_reward": -1.5936203002929688,
        "kl_divergence": 42.371849060058594,
        "mean_generated_length": 221.1875
    },
    {
        "step": 60,
        "reward": -0.5455780029296875,
        "gold_reward": -1.8613700866699219,
        "kl_divergence": 41.29511642456055,
        "mean_generated_length": 210.46875
    },
    {
        "step": 61,
        "reward": -0.8385066986083984,
        "gold_reward": -1.84185791015625,
        "kl_divergence": 42.91746520996094,
        "mean_generated_length": 225.640625
    },
    {
        "step": 62,
        "reward": -0.5455169677734375,
        "gold_reward": -2.2052459716796875,
        "kl_divergence": 44.203983306884766,
        "mean_generated_length": 226.296875
    },
    {
        "step": 63,
        "reward": -1.319894790649414,
        "gold_reward": -2.1762847900390625,
        "kl_divergence": 43.57958221435547,
        "mean_generated_length": 225.78125
    },
    {
        "step": 64,
        "reward": -0.5642762184143066,
        "gold_reward": -1.4124984741210938,
        "kl_divergence": 48.255126953125,
        "mean_generated_length": 221.46875
    },
    {
        "step": 65,
        "reward": -0.9563617706298828,
        "gold_reward": -2.1276626586914062,
        "kl_divergence": 47.72521209716797,
        "mean_generated_length": 238.890625
    },
    {
        "step": 66,
        "reward": -1.11395263671875,
        "gold_reward": -2.1253585815429688,
        "kl_divergence": 46.760215759277344,
        "mean_generated_length": 227.0625
    },
    {
        "step": 67,
        "reward": -0.8853950500488281,
        "gold_reward": -2.2630653381347656,
        "kl_divergence": 53.64624786376953,
        "mean_generated_length": 266.1875
    },
    {
        "step": 68,
        "reward": 0.5910310745239258,
        "gold_reward": -1.3160438537597656,
        "kl_divergence": 56.853694915771484,
        "mean_generated_length": 226.796875
    },
    {
        "step": 69,
        "reward": -0.5929069519042969,
        "gold_reward": -1.734588623046875,
        "kl_divergence": 56.27447509765625,
        "mean_generated_length": 240.375
    },
    {
        "step": 70,
        "reward": -0.6573314666748047,
        "gold_reward": -1.9082794189453125,
        "kl_divergence": 51.33463668823242,
        "mean_generated_length": 222.171875
    },
    {
        "step": 71,
        "reward": -1.1744155883789062,
        "gold_reward": -2.258026123046875,
        "kl_divergence": 49.34406280517578,
        "mean_generated_length": 222.78125
    },
    {
        "step": 72,
        "reward": -0.8609867095947266,
        "gold_reward": -1.8120040893554688,
        "kl_divergence": 48.14566421508789,
        "mean_generated_length": 209.0625
    },
    {
        "step": 73,
        "reward": -1.3509635925292969,
        "gold_reward": -2.219104766845703,
        "kl_divergence": 47.29391860961914,
        "mean_generated_length": 212.921875
    },
    {
        "step": 74,
        "reward": -0.7303810119628906,
        "gold_reward": -2.2153472900390625,
        "kl_divergence": 43.51045608520508,
        "mean_generated_length": 222.34375
    },
    {
        "step": 75,
        "reward": -0.33852386474609375,
        "gold_reward": -2.1453208923339844,
        "kl_divergence": 44.41984939575195,
        "mean_generated_length": 209.40625
    },
    {
        "step": 76,
        "reward": -0.9642448425292969,
        "gold_reward": -1.9627685546875,
        "kl_divergence": 53.894718170166016,
        "mean_generated_length": 220.25
    },
    {
        "step": 77,
        "reward": -0.6829376220703125,
        "gold_reward": -1.8627243041992188,
        "kl_divergence": 57.266845703125,
        "mean_generated_length": 241.359375
    },
    {
        "step": 78,
        "reward": -0.49432373046875,
        "gold_reward": -1.2142333984375,
        "kl_divergence": 58.86562728881836,
        "mean_generated_length": 240.75
    },
    {
        "step": 79,
        "reward": -1.2923221588134766,
        "gold_reward": -2.106842041015625,
        "kl_divergence": 53.95771026611328,
        "mean_generated_length": 214.53125
    },
    {
        "step": 80,
        "reward": -0.3607769012451172,
        "gold_reward": -1.9100563526153564,
        "kl_divergence": 49.741573333740234,
        "mean_generated_length": 216.59375
    },
    {
        "step": 81,
        "reward": -0.177801251411438,
        "gold_reward": -1.6815319061279297,
        "kl_divergence": 52.52530288696289,
        "mean_generated_length": 192.59375
    },
    {
        "step": 82,
        "reward": -0.05966949462890625,
        "gold_reward": -1.9797554016113281,
        "kl_divergence": 59.144779205322266,
        "mean_generated_length": 227.625
    },
    {
        "step": 83,
        "reward": 0.0455169677734375,
        "gold_reward": -1.75360107421875,
        "kl_divergence": 52.98264694213867,
        "mean_generated_length": 214.328125
    },
    {
        "step": 84,
        "reward": -1.3150959014892578,
        "gold_reward": -2.228139877319336,
        "kl_divergence": 59.16572570800781,
        "mean_generated_length": 211.34375
    },
    {
        "step": 85,
        "reward": -0.49889373779296875,
        "gold_reward": -2.4089279174804688,
        "kl_divergence": 56.91168975830078,
        "mean_generated_length": 209.015625
    },
    {
        "step": 86,
        "reward": -0.4863917827606201,
        "gold_reward": -1.8860244750976562,
        "kl_divergence": 57.23118591308594,
        "mean_generated_length": 210.765625
    },
    {
        "step": 87,
        "reward": 0.5944671630859375,
        "gold_reward": -2.0830917358398438,
        "kl_divergence": 65.31181335449219,
        "mean_generated_length": 230.1875
    },
    {
        "step": 88,
        "reward": 0.1715545654296875,
        "gold_reward": -1.8844146728515625,
        "kl_divergence": 67.48519134521484,
        "mean_generated_length": 231.921875
    },
    {
        "step": 89,
        "reward": 0.6228790283203125,
        "gold_reward": -1.1064071655273438,
        "kl_divergence": 69.50137329101562,
        "mean_generated_length": 225.640625
    },
    {
        "step": 90,
        "reward": 0.23760223388671875,
        "gold_reward": -2.2942886352539062,
        "kl_divergence": 75.83070373535156,
        "mean_generated_length": 247.25
    },
    {
        "step": 91,
        "reward": 0.2314453125,
        "gold_reward": -1.7276268005371094,
        "kl_divergence": 78.15007019042969,
        "mean_generated_length": 239.28125
    },
    {
        "step": 92,
        "reward": 0.3300209045410156,
        "gold_reward": -1.0996971130371094,
        "kl_divergence": 73.17512512207031,
        "mean_generated_length": 216.625
    },
    {
        "step": 93,
        "reward": 0.500152587890625,
        "gold_reward": -1.5999984741210938,
        "kl_divergence": 82.42589569091797,
        "mean_generated_length": 245.859375
    },
    {
        "step": 94,
        "reward": 0.44272613525390625,
        "gold_reward": -1.9414739608764648,
        "kl_divergence": 80.58706665039062,
        "mean_generated_length": 236.859375
    },
    {
        "step": 95,
        "reward": -0.39849853515625,
        "gold_reward": -2.0745601654052734,
        "kl_divergence": 81.35077667236328,
        "mean_generated_length": 250.953125
    },
    {
        "step": 96,
        "reward": 0.8027763366699219,
        "gold_reward": -1.4431304931640625,
        "kl_divergence": 79.14788055419922,
        "mean_generated_length": 245.015625
    },
    {
        "step": 97,
        "reward": -0.4664115905761719,
        "gold_reward": -2.70928955078125,
        "kl_divergence": 83.17878723144531,
        "mean_generated_length": 259.359375
    },
    {
        "step": 98,
        "reward": 0.37484073638916016,
        "gold_reward": -2.126922607421875,
        "kl_divergence": 81.67818450927734,
        "mean_generated_length": 245.96875
    },
    {
        "step": 99,
        "reward": -0.5690460205078125,
        "gold_reward": -2.1941070556640625,
        "kl_divergence": 79.01313781738281,
        "mean_generated_length": 246.78125
    },
    {
        "step": 100,
        "reward": 0.4439697265625,
        "gold_reward": -2.047168731689453,
        "kl_divergence": 64.0073471069336,
        "mean_generated_length": 209.9375
    },
    {
        "step": 101,
        "reward": -0.206390380859375,
        "gold_reward": -1.5047607421875,
        "kl_divergence": 75.95704650878906,
        "mean_generated_length": 224.640625
    },
    {
        "step": 102,
        "reward": 0.8015708923339844,
        "gold_reward": -1.2621498107910156,
        "kl_divergence": 72.98121643066406,
        "mean_generated_length": 216.71875
    },
    {
        "step": 103,
        "reward": 0.05689239501953125,
        "gold_reward": -1.7183990478515625,
        "kl_divergence": 71.12899017333984,
        "mean_generated_length": 218.0
    },
    {
        "step": 104,
        "reward": -0.06348991394042969,
        "gold_reward": -1.7174072265625,
        "kl_divergence": 81.47413635253906,
        "mean_generated_length": 247.6875
    },
    {
        "step": 105,
        "reward": -0.7294950485229492,
        "gold_reward": -1.9482955932617188,
        "kl_divergence": 70.68550109863281,
        "mean_generated_length": 221.40625
    },
    {
        "step": 106,
        "reward": 0.018815994262695312,
        "gold_reward": -2.08514404296875,
        "kl_divergence": 80.27776336669922,
        "mean_generated_length": 244.546875
    },
    {
        "step": 107,
        "reward": 0.0831594467163086,
        "gold_reward": -1.6536750793457031,
        "kl_divergence": 75.52534484863281,
        "mean_generated_length": 215.328125
    },
    {
        "step": 108,
        "reward": 0.8948812484741211,
        "gold_reward": -1.9107131958007812,
        "kl_divergence": 87.95426940917969,
        "mean_generated_length": 242.734375
    },
    {
        "step": 109,
        "reward": 0.345672607421875,
        "gold_reward": -1.5390571355819702,
        "kl_divergence": 93.27840423583984,
        "mean_generated_length": 246.984375
    },
    {
        "step": 110,
        "reward": 0.14786148071289062,
        "gold_reward": -2.168914794921875,
        "kl_divergence": 92.64701080322266,
        "mean_generated_length": 265.96875
    },
    {
        "step": 111,
        "reward": 0.04520416259765625,
        "gold_reward": -1.8924407958984375,
        "kl_divergence": 89.54986572265625,
        "mean_generated_length": 238.078125
    },
    {
        "step": 112,
        "reward": 0.965972900390625,
        "gold_reward": -1.5621891021728516,
        "kl_divergence": 84.483154296875,
        "mean_generated_length": 245.046875
    },
    {
        "step": 113,
        "reward": 0.41619396209716797,
        "gold_reward": -1.5874481201171875,
        "kl_divergence": 108.7546615600586,
        "mean_generated_length": 305.390625
    },
    {
        "step": 114,
        "reward": 0.3132052421569824,
        "gold_reward": -1.7862415313720703,
        "kl_divergence": 103.71839141845703,
        "mean_generated_length": 280.546875
    },
    {
        "step": 115,
        "reward": 0.6547775268554688,
        "gold_reward": -2.3487701416015625,
        "kl_divergence": 96.92921447753906,
        "mean_generated_length": 294.546875
    },
    {
        "step": 116,
        "reward": 0.01267242431640625,
        "gold_reward": -1.838958740234375,
        "kl_divergence": 88.82121276855469,
        "mean_generated_length": 252.78125
    },
    {
        "step": 117,
        "reward": 0.8001527786254883,
        "gold_reward": -2.3082656860351562,
        "kl_divergence": 100.03507232666016,
        "mean_generated_length": 290.6875
    },
    {
        "step": 118,
        "reward": 1.1949996948242188,
        "gold_reward": -1.820068359375,
        "kl_divergence": 89.25767517089844,
        "mean_generated_length": 267.625
    },
    {
        "step": 119,
        "reward": 0.2023143768310547,
        "gold_reward": -1.9662094116210938,
        "kl_divergence": 96.02033233642578,
        "mean_generated_length": 268.078125
    },
    {
        "step": 120,
        "reward": 0.3688111901283264,
        "gold_reward": -1.6458740234375,
        "kl_divergence": 87.8742904663086,
        "mean_generated_length": 263.546875
    },
    {
        "step": 121,
        "reward": 0.985558032989502,
        "gold_reward": -1.601531982421875,
        "kl_divergence": 76.32264709472656,
        "mean_generated_length": 248.515625
    },
    {
        "step": 122,
        "reward": 0.9632110595703125,
        "gold_reward": -1.7379226684570312,
        "kl_divergence": 86.09618377685547,
        "mean_generated_length": 262.921875
    },
    {
        "step": 123,
        "reward": 0.3179168701171875,
        "gold_reward": -1.937347412109375,
        "kl_divergence": 77.01151275634766,
        "mean_generated_length": 230.40625
    },
    {
        "step": 124,
        "reward": 0.83837890625,
        "gold_reward": -2.261566162109375,
        "kl_divergence": 69.3768081665039,
        "mean_generated_length": 240.40625
    },
    {
        "step": 125,
        "reward": 0.2132568359375,
        "gold_reward": -1.73736572265625,
        "kl_divergence": 73.34149932861328,
        "mean_generated_length": 238.515625
    },
    {
        "step": 126,
        "reward": 0.7627487182617188,
        "gold_reward": -1.7929229736328125,
        "kl_divergence": 67.84385681152344,
        "mean_generated_length": 225.375
    },
    {
        "step": 127,
        "reward": 0.4984588623046875,
        "gold_reward": -1.3422431945800781,
        "kl_divergence": 76.09915924072266,
        "mean_generated_length": 222.9375
    },
    {
        "step": 128,
        "reward": 0.5462455749511719,
        "gold_reward": -1.4994053840637207,
        "kl_divergence": 82.48013305664062,
        "mean_generated_length": 254.3125
    },
    {
        "step": 129,
        "reward": 0.41747283935546875,
        "gold_reward": -1.0599250793457031,
        "kl_divergence": 68.85845947265625,
        "mean_generated_length": 203.71875
    },
    {
        "step": 130,
        "reward": 0.7504730224609375,
        "gold_reward": -1.7700614929199219,
        "kl_divergence": 77.12123107910156,
        "mean_generated_length": 261.34375
    },
    {
        "step": 131,
        "reward": 0.20055770874023438,
        "gold_reward": -1.9038143157958984,
        "kl_divergence": 69.65945434570312,
        "mean_generated_length": 231.5
    },
    {
        "step": 132,
        "reward": 0.5040359497070312,
        "gold_reward": -1.4080848693847656,
        "kl_divergence": 73.20155334472656,
        "mean_generated_length": 241.5
    },
    {
        "step": 133,
        "reward": 0.49964046478271484,
        "gold_reward": -1.777862548828125,
        "kl_divergence": 69.7149658203125,
        "mean_generated_length": 242.984375
    },
    {
        "step": 134,
        "reward": 1.2129669189453125,
        "gold_reward": -1.2384729385375977,
        "kl_divergence": 77.63421630859375,
        "mean_generated_length": 238.359375
    },
    {
        "step": 135,
        "reward": 0.3806610107421875,
        "gold_reward": -1.9248199462890625,
        "kl_divergence": 66.61708068847656,
        "mean_generated_length": 228.96875
    },
    {
        "step": 136,
        "reward": 1.0097062587738037,
        "gold_reward": -1.1736564636230469,
        "kl_divergence": 70.82125854492188,
        "mean_generated_length": 234.65625
    },
    {
        "step": 137,
        "reward": 0.41053009033203125,
        "gold_reward": -1.5432491302490234,
        "kl_divergence": 78.27996826171875,
        "mean_generated_length": 235.671875
    },
    {
        "step": 138,
        "reward": 0.9487781524658203,
        "gold_reward": -1.3581104278564453,
        "kl_divergence": 77.21638488769531,
        "mean_generated_length": 233.015625
    },
    {
        "step": 139,
        "reward": 0.8619174957275391,
        "gold_reward": -1.272329330444336,
        "kl_divergence": 74.0716323852539,
        "mean_generated_length": 223.8125
    },
    {
        "step": 140,
        "reward": 0.7731952667236328,
        "gold_reward": -1.644181728363037,
        "kl_divergence": 75.00399780273438,
        "mean_generated_length": 242.09375
    },
    {
        "step": 141,
        "reward": 0.6283721923828125,
        "gold_reward": -1.4914112091064453,
        "kl_divergence": 82.76220703125,
        "mean_generated_length": 252.125
    },
    {
        "step": 142,
        "reward": 0.1680450439453125,
        "gold_reward": -1.7601318359375,
        "kl_divergence": 75.66212463378906,
        "mean_generated_length": 229.1875
    },
    {
        "step": 143,
        "reward": 0.10343742370605469,
        "gold_reward": -1.3427200317382812,
        "kl_divergence": 89.14350128173828,
        "mean_generated_length": 244.109375
    },
    {
        "step": 144,
        "reward": 0.33119964599609375,
        "gold_reward": -1.9187278747558594,
        "kl_divergence": 84.7533187866211,
        "mean_generated_length": 252.40625
    },
    {
        "step": 145,
        "reward": 0.8745403289794922,
        "gold_reward": -1.4334564208984375,
        "kl_divergence": 81.6351318359375,
        "mean_generated_length": 245.15625
    },
    {
        "step": 146,
        "reward": 0.9859180450439453,
        "gold_reward": -2.2190818786621094,
        "kl_divergence": 93.792236328125,
        "mean_generated_length": 272.265625
    },
    {
        "step": 147,
        "reward": 1.5035781860351562,
        "gold_reward": -1.1275711059570312,
        "kl_divergence": 89.79974365234375,
        "mean_generated_length": 240.546875
    },
    {
        "step": 148,
        "reward": 1.0483884811401367,
        "gold_reward": -1.4325714111328125,
        "kl_divergence": 86.37019348144531,
        "mean_generated_length": 244.171875
    },
    {
        "step": 149,
        "reward": 0.8842476606369019,
        "gold_reward": -1.6769676208496094,
        "kl_divergence": 89.39523315429688,
        "mean_generated_length": 254.0625
    },
    {
        "step": 150,
        "reward": -0.09759140014648438,
        "gold_reward": -1.6840591430664062,
        "kl_divergence": 91.50444030761719,
        "mean_generated_length": 257.296875
    },
    {
        "step": 151,
        "reward": 0.7019948959350586,
        "gold_reward": -1.4227867126464844,
        "kl_divergence": 89.90353393554688,
        "mean_generated_length": 247.796875
    },
    {
        "step": 152,
        "reward": 0.4937248229980469,
        "gold_reward": -1.379502296447754,
        "kl_divergence": 86.6568832397461,
        "mean_generated_length": 245.40625
    },
    {
        "step": 153,
        "reward": 0.37612152099609375,
        "gold_reward": -1.9485702514648438,
        "kl_divergence": 77.00022888183594,
        "mean_generated_length": 256.015625
    },
    {
        "step": 154,
        "reward": 1.2552261352539062,
        "gold_reward": -1.4977302551269531,
        "kl_divergence": 89.58929443359375,
        "mean_generated_length": 267.140625
    },
    {
        "step": 155,
        "reward": 0.3366050720214844,
        "gold_reward": -2.0783843994140625,
        "kl_divergence": 89.50191497802734,
        "mean_generated_length": 258.3125
    },
    {
        "step": 156,
        "reward": 1.3506355285644531,
        "gold_reward": -1.013031005859375,
        "kl_divergence": 92.07482147216797,
        "mean_generated_length": 281.71875
    },
    {
        "step": 157,
        "reward": 0.14990234375,
        "gold_reward": -1.9468994140625,
        "kl_divergence": 114.0145492553711,
        "mean_generated_length": 299.5
    },
    {
        "step": 158,
        "reward": 0.7864799499511719,
        "gold_reward": -1.89373779296875,
        "kl_divergence": 88.25703430175781,
        "mean_generated_length": 254.15625
    },
    {
        "step": 159,
        "reward": 0.67010498046875,
        "gold_reward": -1.4762763977050781,
        "kl_divergence": 88.38094329833984,
        "mean_generated_length": 246.140625
    },
    {
        "step": 160,
        "reward": 1.0691986083984375,
        "gold_reward": -1.2264251708984375,
        "kl_divergence": 86.635498046875,
        "mean_generated_length": 224.96875
    },
    {
        "step": 161,
        "reward": 0.9178009033203125,
        "gold_reward": -1.5932693481445312,
        "kl_divergence": 91.21177673339844,
        "mean_generated_length": 256.8125
    },
    {
        "step": 162,
        "reward": 1.147918701171875,
        "gold_reward": -1.6865081787109375,
        "kl_divergence": 88.13514709472656,
        "mean_generated_length": 243.359375
    },
    {
        "step": 163,
        "reward": 0.1463031768798828,
        "gold_reward": -1.8939590454101562,
        "kl_divergence": 87.7491226196289,
        "mean_generated_length": 247.59375
    },
    {
        "step": 164,
        "reward": 1.0387496948242188,
        "gold_reward": -2.1333179473876953,
        "kl_divergence": 89.83875274658203,
        "mean_generated_length": 250.984375
    },
    {
        "step": 165,
        "reward": 0.5553436279296875,
        "gold_reward": -1.6848869323730469,
        "kl_divergence": 82.58822631835938,
        "mean_generated_length": 230.078125
    },
    {
        "step": 166,
        "reward": 1.5533905029296875,
        "gold_reward": -1.7464885711669922,
        "kl_divergence": 87.84327697753906,
        "mean_generated_length": 248.5
    },
    {
        "step": 167,
        "reward": 1.5148391723632812,
        "gold_reward": -1.35479736328125,
        "kl_divergence": 87.25798034667969,
        "mean_generated_length": 244.296875
    },
    {
        "step": 168,
        "reward": 1.4572067260742188,
        "gold_reward": -1.065093994140625,
        "kl_divergence": 79.95545196533203,
        "mean_generated_length": 229.46875
    },
    {
        "step": 169,
        "reward": 0.9278278946876526,
        "gold_reward": -1.8155745267868042,
        "kl_divergence": 82.36563873291016,
        "mean_generated_length": 226.453125
    },
    {
        "step": 170,
        "reward": 1.4534835815429688,
        "gold_reward": -1.3981361389160156,
        "kl_divergence": 83.04154968261719,
        "mean_generated_length": 232.921875
    },
    {
        "step": 171,
        "reward": 1.5021514892578125,
        "gold_reward": -0.7815322875976562,
        "kl_divergence": 78.81016540527344,
        "mean_generated_length": 209.3125
    },
    {
        "step": 172,
        "reward": 1.0927200317382812,
        "gold_reward": -1.5421676635742188,
        "kl_divergence": 87.71591186523438,
        "mean_generated_length": 243.09375
    },
    {
        "step": 173,
        "reward": 1.208303451538086,
        "gold_reward": -1.3413543701171875,
        "kl_divergence": 85.70228576660156,
        "mean_generated_length": 211.875
    },
    {
        "step": 174,
        "reward": 1.2094306945800781,
        "gold_reward": -1.4029502868652344,
        "kl_divergence": 86.22404479980469,
        "mean_generated_length": 250.953125
    },
    {
        "step": 175,
        "reward": 1.2852783203125,
        "gold_reward": -1.24859619140625,
        "kl_divergence": 83.25933837890625,
        "mean_generated_length": 226.484375
    },
    {
        "step": 176,
        "reward": 0.8492183685302734,
        "gold_reward": -1.852325439453125,
        "kl_divergence": 83.94239044189453,
        "mean_generated_length": 240.515625
    },
    {
        "step": 177,
        "reward": 1.2984066009521484,
        "gold_reward": -1.464874267578125,
        "kl_divergence": 83.88246154785156,
        "mean_generated_length": 230.984375
    },
    {
        "step": 178,
        "reward": 0.8890914916992188,
        "gold_reward": -1.4912338256835938,
        "kl_divergence": 91.05473327636719,
        "mean_generated_length": 249.15625
    },
    {
        "step": 179,
        "reward": 2.1556718349456787,
        "gold_reward": -1.5156173706054688,
        "kl_divergence": 81.86419677734375,
        "mean_generated_length": 231.5
    },
    {
        "step": 180,
        "reward": 1.3931655883789062,
        "gold_reward": -1.0395355224609375,
        "kl_divergence": 89.47077941894531,
        "mean_generated_length": 240.78125
    },
    {
        "step": 181,
        "reward": 1.8176727294921875,
        "gold_reward": -0.6972465515136719,
        "kl_divergence": 81.62516784667969,
        "mean_generated_length": 233.453125
    },
    {
        "step": 182,
        "reward": 1.4512252807617188,
        "gold_reward": -1.3184967041015625,
        "kl_divergence": 75.4453353881836,
        "mean_generated_length": 216.625
    },
    {
        "step": 183,
        "reward": 1.1562538146972656,
        "gold_reward": -1.3014373779296875,
        "kl_divergence": 83.24788665771484,
        "mean_generated_length": 232.234375
    },
    {
        "step": 184,
        "reward": 1.1896915435791016,
        "gold_reward": -1.6820831298828125,
        "kl_divergence": 80.40326690673828,
        "mean_generated_length": 230.765625
    },
    {
        "step": 185,
        "reward": 1.3964262008666992,
        "gold_reward": -1.9444313049316406,
        "kl_divergence": 88.85729217529297,
        "mean_generated_length": 241.09375
    },
    {
        "step": 186,
        "reward": 1.0103988647460938,
        "gold_reward": -1.1383662223815918,
        "kl_divergence": 80.0308609008789,
        "mean_generated_length": 210.828125
    },
    {
        "step": 187,
        "reward": 1.127584457397461,
        "gold_reward": -1.315669059753418,
        "kl_divergence": 80.78163146972656,
        "mean_generated_length": 231.09375
    },
    {
        "step": 188,
        "reward": 1.0169525146484375,
        "gold_reward": -1.2220687866210938,
        "kl_divergence": 89.74480438232422,
        "mean_generated_length": 233.0625
    },
    {
        "step": 189,
        "reward": 1.2330741882324219,
        "gold_reward": -1.5516242980957031,
        "kl_divergence": 81.28995513916016,
        "mean_generated_length": 245.015625
    },
    {
        "step": 190,
        "reward": 1.02996826171875,
        "gold_reward": -1.3589019775390625,
        "kl_divergence": 86.95341491699219,
        "mean_generated_length": 231.125
    },
    {
        "step": 191,
        "reward": 1.6498336791992188,
        "gold_reward": -1.6222152709960938,
        "kl_divergence": 83.82524871826172,
        "mean_generated_length": 230.40625
    },
    {
        "step": 192,
        "reward": 1.3219184875488281,
        "gold_reward": -1.022674560546875,
        "kl_divergence": 96.81354522705078,
        "mean_generated_length": 267.265625
    },
    {
        "step": 193,
        "reward": 1.4099922180175781,
        "gold_reward": -0.8806610107421875,
        "kl_divergence": 87.62539672851562,
        "mean_generated_length": 231.140625
    },
    {
        "step": 194,
        "reward": 1.8540763854980469,
        "gold_reward": -1.8639373779296875,
        "kl_divergence": 83.39738464355469,
        "mean_generated_length": 248.390625
    },
    {
        "step": 195,
        "reward": 0.728363037109375,
        "gold_reward": -1.4818115234375,
        "kl_divergence": 78.28439331054688,
        "mean_generated_length": 228.140625
    },
    {
        "step": 196,
        "reward": 1.4081878662109375,
        "gold_reward": -1.6927518844604492,
        "kl_divergence": 86.2192611694336,
        "mean_generated_length": 261.953125
    },
    {
        "step": 197,
        "reward": 1.6334266662597656,
        "gold_reward": -1.11767578125,
        "kl_divergence": 84.53206634521484,
        "mean_generated_length": 248.375
    },
    {
        "step": 198,
        "reward": 0.8059654235839844,
        "gold_reward": -1.6465988159179688,
        "kl_divergence": 91.57242584228516,
        "mean_generated_length": 254.671875
    },
    {
        "step": 199,
        "reward": 0.847259521484375,
        "gold_reward": -1.3163909912109375,
        "kl_divergence": 82.5947265625,
        "mean_generated_length": 256.765625
    },
    {
        "step": 200,
        "reward": 1.5803251266479492,
        "gold_reward": -1.595184326171875,
        "kl_divergence": 91.50531005859375,
        "mean_generated_length": 245.953125
    },
    {
        "step": 201,
        "reward": 1.131988525390625,
        "gold_reward": -1.4363479614257812,
        "kl_divergence": 85.52043914794922,
        "mean_generated_length": 251.75
    },
    {
        "step": 202,
        "reward": 1.2637176513671875,
        "gold_reward": -1.7105865478515625,
        "kl_divergence": 78.72899627685547,
        "mean_generated_length": 223.296875
    },
    {
        "step": 203,
        "reward": 1.1210861206054688,
        "gold_reward": -1.804347038269043,
        "kl_divergence": 80.08946228027344,
        "mean_generated_length": 244.421875
    },
    {
        "step": 204,
        "reward": 1.3839912414550781,
        "gold_reward": -1.3689937591552734,
        "kl_divergence": 90.60112762451172,
        "mean_generated_length": 253.59375
    },
    {
        "step": 205,
        "reward": 1.3550362586975098,
        "gold_reward": -1.5152587890625,
        "kl_divergence": 84.92213439941406,
        "mean_generated_length": 241.9375
    },
    {
        "step": 206,
        "reward": 1.4841890335083008,
        "gold_reward": -1.157888412475586,
        "kl_divergence": 93.2884521484375,
        "mean_generated_length": 240.875
    },
    {
        "step": 207,
        "reward": 1.286773681640625,
        "gold_reward": -1.5077152252197266,
        "kl_divergence": 88.92876434326172,
        "mean_generated_length": 259.859375
    },
    {
        "step": 208,
        "reward": 1.2921714782714844,
        "gold_reward": -0.9081611633300781,
        "kl_divergence": 87.06413269042969,
        "mean_generated_length": 224.859375
    },
    {
        "step": 209,
        "reward": 0.9462337493896484,
        "gold_reward": -1.7690887451171875,
        "kl_divergence": 96.41356658935547,
        "mean_generated_length": 268.875
    },
    {
        "step": 210,
        "reward": 1.1151885986328125,
        "gold_reward": -1.612091064453125,
        "kl_divergence": 83.90530395507812,
        "mean_generated_length": 236.15625
    },
    {
        "step": 211,
        "reward": 1.2218894958496094,
        "gold_reward": -1.3338508605957031,
        "kl_divergence": 90.58366394042969,
        "mean_generated_length": 241.03125
    },
    {
        "step": 212,
        "reward": 1.3963699340820312,
        "gold_reward": -2.0355758666992188,
        "kl_divergence": 86.39715576171875,
        "mean_generated_length": 244.921875
    },
    {
        "step": 213,
        "reward": 1.8061141967773438,
        "gold_reward": -1.0681381225585938,
        "kl_divergence": 89.70318603515625,
        "mean_generated_length": 246.9375
    },
    {
        "step": 214,
        "reward": 1.2915115356445312,
        "gold_reward": -1.6500701904296875,
        "kl_divergence": 90.57850646972656,
        "mean_generated_length": 247.21875
    },
    {
        "step": 215,
        "reward": 1.7708358764648438,
        "gold_reward": -0.9111061096191406,
        "kl_divergence": 90.7149429321289,
        "mean_generated_length": 243.625
    },
    {
        "step": 216,
        "reward": 1.7116317749023438,
        "gold_reward": -0.9894638061523438,
        "kl_divergence": 82.79962158203125,
        "mean_generated_length": 237.71875
    },
    {
        "step": 217,
        "reward": 1.1178436279296875,
        "gold_reward": -1.2205657958984375,
        "kl_divergence": 93.95427703857422,
        "mean_generated_length": 243.03125
    },
    {
        "step": 218,
        "reward": 1.349008560180664,
        "gold_reward": -1.1098289489746094,
        "kl_divergence": 86.7726058959961,
        "mean_generated_length": 245.921875
    },
    {
        "step": 219,
        "reward": 1.6543121337890625,
        "gold_reward": -1.1005706787109375,
        "kl_divergence": 91.35720825195312,
        "mean_generated_length": 249.890625
    },
    {
        "step": 220,
        "reward": 1.7469806671142578,
        "gold_reward": -1.3006057739257812,
        "kl_divergence": 94.62846374511719,
        "mean_generated_length": 256.71875
    },
    {
        "step": 221,
        "reward": 0.9279708862304688,
        "gold_reward": -1.36895751953125,
        "kl_divergence": 79.3584976196289,
        "mean_generated_length": 225.984375
    },
    {
        "step": 222,
        "reward": 1.7852096557617188,
        "gold_reward": -1.0473289489746094,
        "kl_divergence": 91.6034927368164,
        "mean_generated_length": 238.453125
    },
    {
        "step": 223,
        "reward": 1.5005083084106445,
        "gold_reward": -1.59307861328125,
        "kl_divergence": 89.81562805175781,
        "mean_generated_length": 247.4375
    },
    {
        "step": 224,
        "reward": 1.1556243896484375,
        "gold_reward": -1.1689414978027344,
        "kl_divergence": 87.8797378540039,
        "mean_generated_length": 245.328125
    },
    {
        "step": 225,
        "reward": 1.8890533447265625,
        "gold_reward": -1.7421875,
        "kl_divergence": 88.5945816040039,
        "mean_generated_length": 262.75
    },
    {
        "step": 226,
        "reward": 2.4997100830078125,
        "gold_reward": -0.68833327293396,
        "kl_divergence": 91.30997467041016,
        "mean_generated_length": 245.046875
    },
    {
        "step": 227,
        "reward": 1.6760482788085938,
        "gold_reward": -1.401458740234375,
        "kl_divergence": 83.16472625732422,
        "mean_generated_length": 242.5625
    },
    {
        "step": 228,
        "reward": 1.8687362670898438,
        "gold_reward": -1.5971145629882812,
        "kl_divergence": 88.31813049316406,
        "mean_generated_length": 241.71875
    },
    {
        "step": 229,
        "reward": 0.9558944702148438,
        "gold_reward": -1.5893793106079102,
        "kl_divergence": 91.2275161743164,
        "mean_generated_length": 240.515625
    },
    {
        "step": 230,
        "reward": 1.8959388732910156,
        "gold_reward": -1.1941108703613281,
        "kl_divergence": 95.15911865234375,
        "mean_generated_length": 244.90625
    },
    {
        "step": 231,
        "reward": 1.5112190246582031,
        "gold_reward": -1.3464469909667969,
        "kl_divergence": 85.36453247070312,
        "mean_generated_length": 235.1875
    },
    {
        "step": 232,
        "reward": 1.200319766998291,
        "gold_reward": -1.6285247802734375,
        "kl_divergence": 84.37787628173828,
        "mean_generated_length": 242.28125
    },
    {
        "step": 233,
        "reward": 1.772705078125,
        "gold_reward": -1.4673700332641602,
        "kl_divergence": 88.59090423583984,
        "mean_generated_length": 246.875
    },
    {
        "step": 234,
        "reward": 1.2523040771484375,
        "gold_reward": -1.6174335479736328,
        "kl_divergence": 85.39295959472656,
        "mean_generated_length": 250.609375
    },
    {
        "step": 235,
        "reward": 1.4699821472167969,
        "gold_reward": -1.109283447265625,
        "kl_divergence": 94.15312194824219,
        "mean_generated_length": 273.734375
    },
    {
        "step": 236,
        "reward": 2.277587890625,
        "gold_reward": -0.58984375,
        "kl_divergence": 98.0001220703125,
        "mean_generated_length": 288.375
    }
]