[
    {
        "step": 0,
        "reward": -3.7819061279296875,
        "gold_reward": -3.13934326171875,
        "kl_divergence": 0.0,
        "mean_generated_length": 133.28125
    },
    {
        "step": 1,
        "reward": -3.4259490966796875,
        "gold_reward": -2.917926788330078,
        "kl_divergence": 0.0,
        "mean_generated_length": 127.515625
    },
    {
        "step": 2,
        "reward": -3.19598388671875,
        "gold_reward": -2.934751510620117,
        "kl_divergence": -0.05931110307574272,
        "mean_generated_length": 94.53125
    },
    {
        "step": 3,
        "reward": -3.7830429077148438,
        "gold_reward": -3.364288330078125,
        "kl_divergence": 0.042078569531440735,
        "mean_generated_length": 132.5625
    },
    {
        "step": 4,
        "reward": -2.9556407928466797,
        "gold_reward": -3.03314208984375,
        "kl_divergence": -0.04045282304286957,
        "mean_generated_length": 135.9375
    },
    {
        "step": 5,
        "reward": -4.584259033203125,
        "gold_reward": -3.2369537353515625,
        "kl_divergence": 0.07112958282232285,
        "mean_generated_length": 112.5
    },
    {
        "step": 6,
        "reward": -3.5378189086914062,
        "gold_reward": -2.9532413482666016,
        "kl_divergence": 0.22646461427211761,
        "mean_generated_length": 134.765625
    },
    {
        "step": 7,
        "reward": -3.4828643798828125,
        "gold_reward": -2.9080047607421875,
        "kl_divergence": 0.23782943189144135,
        "mean_generated_length": 135.9375
    },
    {
        "step": 8,
        "reward": -2.78167724609375,
        "gold_reward": -3.0773544311523438,
        "kl_divergence": 0.8294774889945984,
        "mean_generated_length": 157.859375
    },
    {
        "step": 9,
        "reward": -3.1565093994140625,
        "gold_reward": -2.5189208984375,
        "kl_divergence": 1.250887393951416,
        "mean_generated_length": 141.015625
    },
    {
        "step": 10,
        "reward": -2.1036834716796875,
        "gold_reward": -2.019338607788086,
        "kl_divergence": 1.592876672744751,
        "mean_generated_length": 140.15625
    },
    {
        "step": 11,
        "reward": -2.3509902954101562,
        "gold_reward": -2.9454116821289062,
        "kl_divergence": 1.9397594928741455,
        "mean_generated_length": 152.640625
    },
    {
        "step": 12,
        "reward": -3.007852554321289,
        "gold_reward": -2.5361099243164062,
        "kl_divergence": 2.2717981338500977,
        "mean_generated_length": 153.921875
    },
    {
        "step": 13,
        "reward": -2.2669830322265625,
        "gold_reward": -2.0183258056640625,
        "kl_divergence": 2.5057992935180664,
        "mean_generated_length": 118.125
    },
    {
        "step": 14,
        "reward": -2.276937484741211,
        "gold_reward": -2.632230758666992,
        "kl_divergence": 4.652376174926758,
        "mean_generated_length": 165.515625
    },
    {
        "step": 15,
        "reward": -2.385486602783203,
        "gold_reward": -2.366058349609375,
        "kl_divergence": 4.009842395782471,
        "mean_generated_length": 149.453125
    },
    {
        "step": 16,
        "reward": -2.6519126892089844,
        "gold_reward": -2.3414459228515625,
        "kl_divergence": 5.137790203094482,
        "mean_generated_length": 157.140625
    },
    {
        "step": 17,
        "reward": -2.348118782043457,
        "gold_reward": -2.60748291015625,
        "kl_divergence": 5.1524271965026855,
        "mean_generated_length": 151.875
    },
    {
        "step": 18,
        "reward": -2.1541748046875,
        "gold_reward": -2.7116775512695312,
        "kl_divergence": 6.7983927726745605,
        "mean_generated_length": 172.71875
    },
    {
        "step": 19,
        "reward": -2.1436538696289062,
        "gold_reward": -2.67926025390625,
        "kl_divergence": 8.628008842468262,
        "mean_generated_length": 173.453125
    },
    {
        "step": 20,
        "reward": -2.9109954833984375,
        "gold_reward": -2.663726806640625,
        "kl_divergence": 9.912321090698242,
        "mean_generated_length": 181.625
    },
    {
        "step": 21,
        "reward": -1.6086349487304688,
        "gold_reward": -2.3311100006103516,
        "kl_divergence": 8.851539611816406,
        "mean_generated_length": 155.03125
    },
    {
        "step": 22,
        "reward": -2.3501129150390625,
        "gold_reward": -2.2672271728515625,
        "kl_divergence": 11.973402976989746,
        "mean_generated_length": 165.296875
    },
    {
        "step": 23,
        "reward": -2.259805679321289,
        "gold_reward": -2.5153045654296875,
        "kl_divergence": 12.772184371948242,
        "mean_generated_length": 179.125
    },
    {
        "step": 24,
        "reward": -2.3509278297424316,
        "gold_reward": -2.534820556640625,
        "kl_divergence": 23.011892318725586,
        "mean_generated_length": 223.140625
    },
    {
        "step": 25,
        "reward": -2.7238616943359375,
        "gold_reward": -2.947601318359375,
        "kl_divergence": 28.581356048583984,
        "mean_generated_length": 282.1875
    },
    {
        "step": 26,
        "reward": -2.4227752685546875,
        "gold_reward": -3.2996673583984375,
        "kl_divergence": 29.831382751464844,
        "mean_generated_length": 278.96875
    },
    {
        "step": 27,
        "reward": -2.675058364868164,
        "gold_reward": -3.479215621948242,
        "kl_divergence": 33.25455856323242,
        "mean_generated_length": 329.609375
    },
    {
        "step": 28,
        "reward": -2.483957529067993,
        "gold_reward": -2.8857269287109375,
        "kl_divergence": 38.54697799682617,
        "mean_generated_length": 286.671875
    },
    {
        "step": 29,
        "reward": -3.240631103515625,
        "gold_reward": -3.27020263671875,
        "kl_divergence": 37.828529357910156,
        "mean_generated_length": 324.375
    },
    {
        "step": 30,
        "reward": -2.2793426513671875,
        "gold_reward": -2.8211212158203125,
        "kl_divergence": 46.387203216552734,
        "mean_generated_length": 330.1875
    },
    {
        "step": 31,
        "reward": -1.74615478515625,
        "gold_reward": -3.07489013671875,
        "kl_divergence": 35.92398452758789,
        "mean_generated_length": 311.34375
    },
    {
        "step": 32,
        "reward": -2.7774124145507812,
        "gold_reward": -3.056427001953125,
        "kl_divergence": 48.1186408996582,
        "mean_generated_length": 299.875
    },
    {
        "step": 33,
        "reward": -2.4261012077331543,
        "gold_reward": -2.88665771484375,
        "kl_divergence": 46.980857849121094,
        "mean_generated_length": 312.40625
    },
    {
        "step": 34,
        "reward": -2.721527099609375,
        "gold_reward": -2.7276439666748047,
        "kl_divergence": 47.923622131347656,
        "mean_generated_length": 336.5
    },
    {
        "step": 35,
        "reward": -2.7887802124023438,
        "gold_reward": -2.579498291015625,
        "kl_divergence": 54.62929153442383,
        "mean_generated_length": 357.75
    },
    {
        "step": 36,
        "reward": -2.410797119140625,
        "gold_reward": -3.282501220703125,
        "kl_divergence": 42.99239730834961,
        "mean_generated_length": 320.5
    },
    {
        "step": 37,
        "reward": -2.4032440185546875,
        "gold_reward": -3.0736846923828125,
        "kl_divergence": 52.169551849365234,
        "mean_generated_length": 300.25
    },
    {
        "step": 38,
        "reward": -2.7891693115234375,
        "gold_reward": -3.29058837890625,
        "kl_divergence": 51.8333740234375,
        "mean_generated_length": 328.375
    },
    {
        "step": 39,
        "reward": -1.7901287078857422,
        "gold_reward": -3.0163726806640625,
        "kl_divergence": 51.19121170043945,
        "mean_generated_length": 326.0
    },
    {
        "step": 40,
        "reward": -2.3128890991210938,
        "gold_reward": -3.1671104431152344,
        "kl_divergence": 60.14240646362305,
        "mean_generated_length": 331.375
    },
    {
        "step": 41,
        "reward": -2.5802383422851562,
        "gold_reward": -2.5333824157714844,
        "kl_divergence": 59.670982360839844,
        "mean_generated_length": 351.75
    },
    {
        "step": 42,
        "reward": -2.382080078125,
        "gold_reward": -3.13201904296875,
        "kl_divergence": 51.50190353393555,
        "mean_generated_length": 303.5
    },
    {
        "step": 43,
        "reward": -2.342315196990967,
        "gold_reward": -2.79559326171875,
        "kl_divergence": 60.4570426940918,
        "mean_generated_length": 346.125
    },
    {
        "step": 44,
        "reward": -1.5992431640625,
        "gold_reward": -3.0608673095703125,
        "kl_divergence": 63.19343566894531,
        "mean_generated_length": 307.75
    },
    {
        "step": 45,
        "reward": -1.3801040649414062,
        "gold_reward": -2.7391281127929688,
        "kl_divergence": 52.77412033081055,
        "mean_generated_length": 290.375
    },
    {
        "step": 46,
        "reward": -1.6552925109863281,
        "gold_reward": -2.774890899658203,
        "kl_divergence": 62.328880310058594,
        "mean_generated_length": 323.625
    },
    {
        "step": 47,
        "reward": -1.5489085912704468,
        "gold_reward": -2.680755615234375,
        "kl_divergence": 58.402565002441406,
        "mean_generated_length": 300.0
    },
    {
        "step": 48,
        "reward": -1.1815299987792969,
        "gold_reward": -2.23944091796875,
        "kl_divergence": 73.03548431396484,
        "mean_generated_length": 334.25
    },
    {
        "step": 49,
        "reward": -1.5905914306640625,
        "gold_reward": -2.69488525390625,
        "kl_divergence": 68.6948013305664,
        "mean_generated_length": 338.5
    },
    {
        "step": 50,
        "reward": -1.3874969482421875,
        "gold_reward": -1.9963226318359375,
        "kl_divergence": 70.3322525024414,
        "mean_generated_length": 311.125
    },
    {
        "step": 51,
        "reward": -1.7880687713623047,
        "gold_reward": -2.795318603515625,
        "kl_divergence": 67.71788024902344,
        "mean_generated_length": 321.75
    },
    {
        "step": 52,
        "reward": -0.8941268920898438,
        "gold_reward": -2.899200439453125,
        "kl_divergence": 66.79627227783203,
        "mean_generated_length": 298.875
    },
    {
        "step": 53,
        "reward": -1.733184814453125,
        "gold_reward": -2.5218353271484375,
        "kl_divergence": 76.15052795410156,
        "mean_generated_length": 347.0
    },
    {
        "step": 54,
        "reward": -1.3857803344726562,
        "gold_reward": -2.8300552368164062,
        "kl_divergence": 72.5887680053711,
        "mean_generated_length": 310.5
    },
    {
        "step": 55,
        "reward": -0.5286335945129395,
        "gold_reward": -1.91583251953125,
        "kl_divergence": 74.503173828125,
        "mean_generated_length": 315.375
    },
    {
        "step": 56,
        "reward": -1.4006519317626953,
        "gold_reward": -2.6769485473632812,
        "kl_divergence": 77.23267364501953,
        "mean_generated_length": 309.125
    },
    {
        "step": 57,
        "reward": -1.096360206604004,
        "gold_reward": -2.4208240509033203,
        "kl_divergence": 85.23222351074219,
        "mean_generated_length": 322.0
    },
    {
        "step": 58,
        "reward": -1.626007080078125,
        "gold_reward": -2.3438987731933594,
        "kl_divergence": 78.45991516113281,
        "mean_generated_length": 312.25
    },
    {
        "step": 59,
        "reward": -1.1185111999511719,
        "gold_reward": -2.523426055908203,
        "kl_divergence": 85.78163146972656,
        "mean_generated_length": 317.625
    },
    {
        "step": 60,
        "reward": -1.0968170166015625,
        "gold_reward": -2.635162353515625,
        "kl_divergence": 83.91201782226562,
        "mean_generated_length": 310.5
    },
    {
        "step": 61,
        "reward": -0.3182220458984375,
        "gold_reward": -2.488677978515625,
        "kl_divergence": 89.08973693847656,
        "mean_generated_length": 324.75
    },
    {
        "step": 62,
        "reward": -0.7261028289794922,
        "gold_reward": -2.8646697998046875,
        "kl_divergence": 87.91753387451172,
        "mean_generated_length": 326.25
    },
    {
        "step": 63,
        "reward": -1.1169586181640625,
        "gold_reward": -2.6102161407470703,
        "kl_divergence": 86.5389404296875,
        "mean_generated_length": 273.5625
    },
    {
        "step": 64,
        "reward": -0.28953033685684204,
        "gold_reward": -2.368072509765625,
        "kl_divergence": 103.07295227050781,
        "mean_generated_length": 315.125
    },
    {
        "step": 65,
        "reward": -0.5652236938476562,
        "gold_reward": -2.9412841796875,
        "kl_divergence": 93.53881072998047,
        "mean_generated_length": 305.375
    },
    {
        "step": 66,
        "reward": -0.6860809326171875,
        "gold_reward": -2.5515270233154297,
        "kl_divergence": 93.22266387939453,
        "mean_generated_length": 307.625
    },
    {
        "step": 67,
        "reward": -0.2990570068359375,
        "gold_reward": -2.900482177734375,
        "kl_divergence": 100.48648071289062,
        "mean_generated_length": 328.0
    },
    {
        "step": 68,
        "reward": 0.1791989803314209,
        "gold_reward": -2.4163284301757812,
        "kl_divergence": 106.42084503173828,
        "mean_generated_length": 326.875
    },
    {
        "step": 69,
        "reward": -0.9392242431640625,
        "gold_reward": -2.7442474365234375,
        "kl_divergence": 109.50605010986328,
        "mean_generated_length": 337.375
    },
    {
        "step": 70,
        "reward": -0.5881118774414062,
        "gold_reward": -2.99462890625,
        "kl_divergence": 106.69635772705078,
        "mean_generated_length": 312.75
    },
    {
        "step": 71,
        "reward": -0.3204403519630432,
        "gold_reward": -3.14764404296875,
        "kl_divergence": 121.28173828125,
        "mean_generated_length": 328.5
    },
    {
        "step": 72,
        "reward": -0.7046051025390625,
        "gold_reward": -2.548809051513672,
        "kl_divergence": 116.29354858398438,
        "mean_generated_length": 314.625
    },
    {
        "step": 73,
        "reward": 0.027263641357421875,
        "gold_reward": -2.957244873046875,
        "kl_divergence": 116.50416564941406,
        "mean_generated_length": 284.25
    },
    {
        "step": 74,
        "reward": -0.5889227390289307,
        "gold_reward": -3.221099853515625,
        "kl_divergence": 102.66929626464844,
        "mean_generated_length": 291.0
    },
    {
        "step": 75,
        "reward": -0.44687652587890625,
        "gold_reward": -3.3361339569091797,
        "kl_divergence": 120.90538024902344,
        "mean_generated_length": 321.25
    },
    {
        "step": 76,
        "reward": -0.1672840118408203,
        "gold_reward": -3.1516647338867188,
        "kl_divergence": 119.91657257080078,
        "mean_generated_length": 314.5
    },
    {
        "step": 77,
        "reward": -0.08829498291015625,
        "gold_reward": -2.684803009033203,
        "kl_divergence": 123.32717895507812,
        "mean_generated_length": 349.125
    },
    {
        "step": 78,
        "reward": 1.6282958984375,
        "gold_reward": -2.467529296875,
        "kl_divergence": 145.28683471679688,
        "mean_generated_length": 398.375
    },
    {
        "step": 79,
        "reward": -0.7950286865234375,
        "gold_reward": -3.052337646484375,
        "kl_divergence": 127.19888305664062,
        "mean_generated_length": 311.25
    },
    {
        "step": 80,
        "reward": 0.258320152759552,
        "gold_reward": -2.868743896484375,
        "kl_divergence": 133.2958526611328,
        "mean_generated_length": 314.25
    },
    {
        "step": 81,
        "reward": 0.45911574363708496,
        "gold_reward": -2.4809112548828125,
        "kl_divergence": 131.34512329101562,
        "mean_generated_length": 313.625
    },
    {
        "step": 82,
        "reward": 0.00337982177734375,
        "gold_reward": -3.525146484375,
        "kl_divergence": 139.04971313476562,
        "mean_generated_length": 321.125
    },
    {
        "step": 83,
        "reward": -0.2509307861328125,
        "gold_reward": -3.3788681030273438,
        "kl_divergence": 144.54290771484375,
        "mean_generated_length": 303.625
    },
    {
        "step": 84,
        "reward": -0.31012725830078125,
        "gold_reward": -3.5215301513671875,
        "kl_divergence": 141.24655151367188,
        "mean_generated_length": 306.375
    },
    {
        "step": 85,
        "reward": 0.10635757446289062,
        "gold_reward": -3.100189208984375,
        "kl_divergence": 145.3575439453125,
        "mean_generated_length": 316.375
    },
    {
        "step": 86,
        "reward": 0.2815055847167969,
        "gold_reward": -3.3277587890625,
        "kl_divergence": 148.43536376953125,
        "mean_generated_length": 310.875
    },
    {
        "step": 87,
        "reward": 0.4367218017578125,
        "gold_reward": -3.603118896484375,
        "kl_divergence": 145.94512939453125,
        "mean_generated_length": 309.5
    },
    {
        "step": 88,
        "reward": 0.0830082893371582,
        "gold_reward": -3.332589626312256,
        "kl_divergence": 170.71080017089844,
        "mean_generated_length": 343.75
    },
    {
        "step": 89,
        "reward": 0.9922876358032227,
        "gold_reward": -2.98504638671875,
        "kl_divergence": 159.74720764160156,
        "mean_generated_length": 317.0
    },
    {
        "step": 90,
        "reward": 0.4112062454223633,
        "gold_reward": -3.4638214111328125,
        "kl_divergence": 175.19284057617188,
        "mean_generated_length": 306.75
    },
    {
        "step": 91,
        "reward": 0.47505760192871094,
        "gold_reward": -3.610504150390625,
        "kl_divergence": 174.64833068847656,
        "mean_generated_length": 302.375
    },
    {
        "step": 92,
        "reward": 1.3375358581542969,
        "gold_reward": -2.93865966796875,
        "kl_divergence": 179.0123291015625,
        "mean_generated_length": 296.25
    },
    {
        "step": 93,
        "reward": 1.2731409072875977,
        "gold_reward": -3.5357666015625,
        "kl_divergence": 204.12637329101562,
        "mean_generated_length": 335.25
    },
    {
        "step": 94,
        "reward": 1.5443859100341797,
        "gold_reward": -3.48614501953125,
        "kl_divergence": 215.8223419189453,
        "mean_generated_length": 331.75
    },
    {
        "step": 95,
        "reward": 1.430572509765625,
        "gold_reward": -3.6289825439453125,
        "kl_divergence": 216.630126953125,
        "mean_generated_length": 323.625
    },
    {
        "step": 96,
        "reward": 1.8955955505371094,
        "gold_reward": -3.6548004150390625,
        "kl_divergence": 248.33929443359375,
        "mean_generated_length": 344.375
    },
    {
        "step": 97,
        "reward": 1.4979071617126465,
        "gold_reward": -4.259185791015625,
        "kl_divergence": 236.88369750976562,
        "mean_generated_length": 300.25
    },
    {
        "step": 98,
        "reward": 2.252899169921875,
        "gold_reward": -3.8826904296875,
        "kl_divergence": 271.2012634277344,
        "mean_generated_length": 328.875
    },
    {
        "step": 99,
        "reward": 1.8777923583984375,
        "gold_reward": -4.0640869140625,
        "kl_divergence": 281.5639343261719,
        "mean_generated_length": 316.625
    },
    {
        "step": 100,
        "reward": 2.0182533264160156,
        "gold_reward": -4.112823486328125,
        "kl_divergence": 296.7479248046875,
        "mean_generated_length": 323.375
    },
    {
        "step": 101,
        "reward": 2.0855162143707275,
        "gold_reward": -4.01739501953125,
        "kl_divergence": 317.5837707519531,
        "mean_generated_length": 323.875
    },
    {
        "step": 102,
        "reward": 2.428703784942627,
        "gold_reward": -3.97991943359375,
        "kl_divergence": 324.9958801269531,
        "mean_generated_length": 317.375
    },
    {
        "step": 103,
        "reward": 2.4008026123046875,
        "gold_reward": -4.158592224121094,
        "kl_divergence": 323.25018310546875,
        "mean_generated_length": 300.25
    },
    {
        "step": 104,
        "reward": 2.2838363647460938,
        "gold_reward": -4.1540985107421875,
        "kl_divergence": 347.5010070800781,
        "mean_generated_length": 323.125
    },
    {
        "step": 105,
        "reward": 2.3617095947265625,
        "gold_reward": -4.454132080078125,
        "kl_divergence": 332.48681640625,
        "mean_generated_length": 303.25
    },
    {
        "step": 106,
        "reward": 1.8791170120239258,
        "gold_reward": -4.8980712890625,
        "kl_divergence": 338.750732421875,
        "mean_generated_length": 331.125
    },
    {
        "step": 107,
        "reward": 2.0021591186523438,
        "gold_reward": -4.46240234375,
        "kl_divergence": 303.5674133300781,
        "mean_generated_length": 287.5
    },
    {
        "step": 108,
        "reward": 2.205413818359375,
        "gold_reward": -4.6690521240234375,
        "kl_divergence": 322.9281005859375,
        "mean_generated_length": 324.375
    },
    {
        "step": 109,
        "reward": 2.0134048461914062,
        "gold_reward": -4.6041259765625,
        "kl_divergence": 332.48193359375,
        "mean_generated_length": 331.0
    },
    {
        "step": 110,
        "reward": 1.9698486328125,
        "gold_reward": -4.658203125,
        "kl_divergence": 316.8987121582031,
        "mean_generated_length": 311.5
    },
    {
        "step": 111,
        "reward": 1.3118665218353271,
        "gold_reward": -4.727294921875,
        "kl_divergence": 319.9197082519531,
        "mean_generated_length": 299.875
    },
    {
        "step": 112,
        "reward": 1.466552734375,
        "gold_reward": -4.6778564453125,
        "kl_divergence": 329.20819091796875,
        "mean_generated_length": 314.25
    },
    {
        "step": 113,
        "reward": 1.6901397705078125,
        "gold_reward": -4.4779052734375,
        "kl_divergence": 350.00244140625,
        "mean_generated_length": 336.5
    },
    {
        "step": 114,
        "reward": 1.771946907043457,
        "gold_reward": -4.722900390625,
        "kl_divergence": 367.87005615234375,
        "mean_generated_length": 357.75
    },
    {
        "step": 115,
        "reward": 1.9437499046325684,
        "gold_reward": -4.947998046875,
        "kl_divergence": 345.486083984375,
        "mean_generated_length": 320.5
    },
    {
        "step": 116,
        "reward": 2.2354140281677246,
        "gold_reward": -4.7666015625,
        "kl_divergence": 342.1230163574219,
        "mean_generated_length": 300.25
    },
    {
        "step": 117,
        "reward": 2.581624984741211,
        "gold_reward": -5.01416015625,
        "kl_divergence": 353.5990295410156,
        "mean_generated_length": 328.375
    },
    {
        "step": 118,
        "reward": 2.3943939208984375,
        "gold_reward": -4.6773681640625,
        "kl_divergence": 356.67340087890625,
        "mean_generated_length": 326.0
    },
    {
        "step": 119,
        "reward": 2.4579010009765625,
        "gold_reward": -4.8594970703125,
        "kl_divergence": 363.67437744140625,
        "mean_generated_length": 331.375
    },
    {
        "step": 120,
        "reward": 2.7343034744262695,
        "gold_reward": -4.8265380859375,
        "kl_divergence": 373.1150207519531,
        "mean_generated_length": 351.75
    },
    {
        "step": 121,
        "reward": 2.003215789794922,
        "gold_reward": -4.837158203125,
        "kl_divergence": 337.8644714355469,
        "mean_generated_length": 303.5
    },
    {
        "step": 122,
        "reward": 2.2778396606445312,
        "gold_reward": -4.83624267578125,
        "kl_divergence": 366.5111999511719,
        "mean_generated_length": 346.125
    },
    {
        "step": 123,
        "reward": 2.4384613037109375,
        "gold_reward": -4.9129638671875,
        "kl_divergence": 344.12432861328125,
        "mean_generated_length": 307.75
    },
    {
        "step": 124,
        "reward": 2.1089534759521484,
        "gold_reward": -4.6805419921875,
        "kl_divergence": 330.9815673828125,
        "mean_generated_length": 290.375
    },
    {
        "step": 125,
        "reward": 2.8722381591796875,
        "gold_reward": -4.6901092529296875,
        "kl_divergence": 356.4433898925781,
        "mean_generated_length": 323.625
    },
    {
        "step": 126,
        "reward": 2.9452247619628906,
        "gold_reward": -4.713157653808594,
        "kl_divergence": 341.4584045410156,
        "mean_generated_length": 300.0
    },
    {
        "step": 127,
        "reward": 3.39324951171875,
        "gold_reward": -4.644287109375,
        "kl_divergence": 376.24053955078125,
        "mean_generated_length": 334.25
    },
    {
        "step": 128,
        "reward": 3.2717323303222656,
        "gold_reward": -4.812255859375,
        "kl_divergence": 372.22967529296875,
        "mean_generated_length": 338.5
    },
    {
        "step": 129,
        "reward": 3.0917816162109375,
        "gold_reward": -4.258399963378906,
        "kl_divergence": 357.1434326171875,
        "mean_generated_length": 311.125
    },
    {
        "step": 130,
        "reward": 3.2116470336914062,
        "gold_reward": -4.9495849609375,
        "kl_divergence": 363.9066467285156,
        "mean_generated_length": 321.75
    },
    {
        "step": 131,
        "reward": 3.017486572265625,
        "gold_reward": -4.96649169921875,
        "kl_divergence": 347.24468994140625,
        "mean_generated_length": 298.875
    },
    {
        "step": 132,
        "reward": 3.0068511962890625,
        "gold_reward": -5.051025390625,
        "kl_divergence": 385.7170715332031,
        "mean_generated_length": 347.0
    },
    {
        "step": 133,
        "reward": 3.1649398803710938,
        "gold_reward": -5.1123046875,
        "kl_divergence": 355.6759948730469,
        "mean_generated_length": 310.5
    },
    {
        "step": 134,
        "reward": 2.7161569595336914,
        "gold_reward": -4.953857421875,
        "kl_divergence": 359.2756652832031,
        "mean_generated_length": 315.375
    },
    {
        "step": 135,
        "reward": 2.335381031036377,
        "gold_reward": -5.078857421875,
        "kl_divergence": 346.90789794921875,
        "mean_generated_length": 309.125
    },
    {
        "step": 136,
        "reward": 2.8005752563476562,
        "gold_reward": -4.9317626953125,
        "kl_divergence": 353.1748046875,
        "mean_generated_length": 322.0
    },
    {
        "step": 137,
        "reward": 2.78057861328125,
        "gold_reward": -4.93634033203125,
        "kl_divergence": 349.4495849609375,
        "mean_generated_length": 312.25
    },
    {
        "step": 138,
        "reward": 1.9292449951171875,
        "gold_reward": -5.06640625,
        "kl_divergence": 342.0342712402344,
        "mean_generated_length": 317.625
    },
    {
        "step": 139,
        "reward": 1.8443292379379272,
        "gold_reward": -5.000732421875,
        "kl_divergence": 338.2149963378906,
        "mean_generated_length": 310.5
    },
    {
        "step": 140,
        "reward": 0.8400259017944336,
        "gold_reward": -5.257568359375,
        "kl_divergence": 344.02398681640625,
        "mean_generated_length": 324.75
    },
    {
        "step": 141,
        "reward": 0.21728515625,
        "gold_reward": -5.33251953125,
        "kl_divergence": 333.48712158203125,
        "mean_generated_length": 326.25
    },
    {
        "step": 142,
        "reward": 0.9554042816162109,
        "gold_reward": -5.24786376953125,
        "kl_divergence": 311.328369140625,
        "mean_generated_length": 276.125
    },
    {
        "step": 143,
        "reward": 0.42386817932128906,
        "gold_reward": -5.098388671875,
        "kl_divergence": 338.0521240234375,
        "mean_generated_length": 315.125
    },
    {
        "step": 144,
        "reward": 0.940007209777832,
        "gold_reward": -5.40673828125,
        "kl_divergence": 334.3233947753906,
        "mean_generated_length": 305.375
    },
    {
        "step": 145,
        "reward": 1.6701583862304688,
        "gold_reward": -5.29296875,
        "kl_divergence": 342.15985107421875,
        "mean_generated_length": 307.625
    },
    {
        "step": 146,
        "reward": 1.410888671875,
        "gold_reward": -5.36572265625,
        "kl_divergence": 362.87744140625,
        "mean_generated_length": 328.0
    },
    {
        "step": 147,
        "reward": 2.1583099365234375,
        "gold_reward": -5.0157470703125,
        "kl_divergence": 364.64886474609375,
        "mean_generated_length": 326.875
    },
    {
        "step": 148,
        "reward": 1.8853683471679688,
        "gold_reward": -5.20965576171875,
        "kl_divergence": 370.8186340332031,
        "mean_generated_length": 337.375
    },
    {
        "step": 149,
        "reward": 1.8152313232421875,
        "gold_reward": -5.27294921875,
        "kl_divergence": 352.0951232910156,
        "mean_generated_length": 312.75
    },
    {
        "step": 150,
        "reward": 2.3507118225097656,
        "gold_reward": -5.043212890625,
        "kl_divergence": 360.41015625,
        "mean_generated_length": 328.5
    },
    {
        "step": 151,
        "reward": 2.2153472900390625,
        "gold_reward": -5.26513671875,
        "kl_divergence": 354.36029052734375,
        "mean_generated_length": 314.625
    },
    {
        "step": 152,
        "reward": 2.641021728515625,
        "gold_reward": -5.005126953125,
        "kl_divergence": 323.20916748046875,
        "mean_generated_length": 284.25
    },
    {
        "step": 153,
        "reward": 2.762744903564453,
        "gold_reward": -5.102294921875,
        "kl_divergence": 316.7572021484375,
        "mean_generated_length": 291.0
    },
    {
        "step": 154,
        "reward": 2.5680227279663086,
        "gold_reward": -5.2025146484375,
        "kl_divergence": 347.9959716796875,
        "mean_generated_length": 321.25
    },
    {
        "step": 155,
        "reward": 2.6793861389160156,
        "gold_reward": -5.238037109375,
        "kl_divergence": 341.39208984375,
        "mean_generated_length": 314.5
    },
    {
        "step": 156,
        "reward": 2.7515716552734375,
        "gold_reward": -5.050811767578125,
        "kl_divergence": 366.0728759765625,
        "mean_generated_length": 349.125
    },
    {
        "step": 157,
        "reward": 2.9423828125,
        "gold_reward": -4.857421875,
        "kl_divergence": 393.2933044433594,
        "mean_generated_length": 398.375
    },
    {
        "step": 158,
        "reward": 3.3401031494140625,
        "gold_reward": -5.0225677490234375,
        "kl_divergence": 337.93377685546875,
        "mean_generated_length": 311.25
    },
    {
        "step": 159,
        "reward": 3.55377197265625,
        "gold_reward": -4.9293212890625,
        "kl_divergence": 338.03277587890625,
        "mean_generated_length": 314.25
    },
    {
        "step": 160,
        "reward": 3.028827667236328,
        "gold_reward": -4.7635498046875,
        "kl_divergence": 343.0395202636719,
        "mean_generated_length": 313.625
    },
    {
        "step": 161,
        "reward": 3.39715576171875,
        "gold_reward": -5.232666015625,
        "kl_divergence": 341.59295654296875,
        "mean_generated_length": 321.125
    },
    {
        "step": 162,
        "reward": 3.759368896484375,
        "gold_reward": -5.018798828125,
        "kl_divergence": 326.99932861328125,
        "mean_generated_length": 303.625
    },
    {
        "step": 163,
        "reward": 2.3542251586914062,
        "gold_reward": -5.3258056640625,
        "kl_divergence": 332.26080322265625,
        "mean_generated_length": 306.375
    },
    {
        "step": 164,
        "reward": 3.258270263671875,
        "gold_reward": -5.2142333984375,
        "kl_divergence": 331.78680419921875,
        "mean_generated_length": 316.375
    },
    {
        "step": 165,
        "reward": 3.0975494384765625,
        "gold_reward": -5.0379791259765625,
        "kl_divergence": 328.36138916015625,
        "mean_generated_length": 310.875
    },
    {
        "step": 166,
        "reward": 3.0511856079101562,
        "gold_reward": -5.08642578125,
        "kl_divergence": 320.7423095703125,
        "mean_generated_length": 309.5
    },
    {
        "step": 167,
        "reward": 2.7690277099609375,
        "gold_reward": -4.9879150390625,
        "kl_divergence": 354.39178466796875,
        "mean_generated_length": 343.75
    },
    {
        "step": 168,
        "reward": 3.2655677795410156,
        "gold_reward": -4.609661102294922,
        "kl_divergence": 326.0188903808594,
        "mean_generated_length": 317.0
    },
    {
        "step": 169,
        "reward": 3.067140579223633,
        "gold_reward": -5.138671875,
        "kl_divergence": 328.3529357910156,
        "mean_generated_length": 306.75
    },
    {
        "step": 170,
        "reward": 3.6633834838867188,
        "gold_reward": -4.841092109680176,
        "kl_divergence": 313.6504211425781,
        "mean_generated_length": 302.375
    },
    {
        "step": 171,
        "reward": 3.506195068359375,
        "gold_reward": -4.3739013671875,
        "kl_divergence": 317.42034912109375,
        "mean_generated_length": 296.25
    },
    {
        "step": 172,
        "reward": 3.80999755859375,
        "gold_reward": -4.6767578125,
        "kl_divergence": 340.56982421875,
        "mean_generated_length": 335.25
    },
    {
        "step": 173,
        "reward": 3.704345703125,
        "gold_reward": -4.635986328125,
        "kl_divergence": 343.5648193359375,
        "mean_generated_length": 335.875
    },
    {
        "step": 174,
        "reward": 3.59716796875,
        "gold_reward": -4.671630859375,
        "kl_divergence": 335.69769287109375,
        "mean_generated_length": 323.625
    },
    {
        "step": 175,
        "reward": 3.2861480712890625,
        "gold_reward": -4.9356689453125,
        "kl_divergence": 361.7386779785156,
        "mean_generated_length": 344.375
    },
    {
        "step": 176,
        "reward": 3.0130157470703125,
        "gold_reward": -4.98486328125,
        "kl_divergence": 319.700439453125,
        "mean_generated_length": 300.25
    },
    {
        "step": 177,
        "reward": 2.717498779296875,
        "gold_reward": -4.824859619140625,
        "kl_divergence": 347.59124755859375,
        "mean_generated_length": 328.875
    },
    {
        "step": 178,
        "reward": 2.603607177734375,
        "gold_reward": -5.008056640625,
        "kl_divergence": 336.8607177734375,
        "mean_generated_length": 316.625
    },
    {
        "step": 179,
        "reward": 2.5898170471191406,
        "gold_reward": -4.951171875,
        "kl_divergence": 336.8225402832031,
        "mean_generated_length": 327.5
    },
    {
        "step": 180,
        "reward": 2.7566604614257812,
        "gold_reward": -4.9207763671875,
        "kl_divergence": 338.47149658203125,
        "mean_generated_length": 323.875
    },
    {
        "step": 181,
        "reward": 2.6958847045898438,
        "gold_reward": -4.9224853515625,
        "kl_divergence": 330.6259765625,
        "mean_generated_length": 317.375
    },
    {
        "step": 182,
        "reward": 2.3089661598205566,
        "gold_reward": -5.0345458984375,
        "kl_divergence": 327.99365234375,
        "mean_generated_length": 300.25
    },
    {
        "step": 183,
        "reward": 2.2794723510742188,
        "gold_reward": -5.0655364990234375,
        "kl_divergence": 335.27301025390625,
        "mean_generated_length": 323.125
    },
    {
        "step": 184,
        "reward": 2.37554931640625,
        "gold_reward": -5.1611328125,
        "kl_divergence": 326.6404724121094,
        "mean_generated_length": 303.25
    },
    {
        "step": 185,
        "reward": 2.196108341217041,
        "gold_reward": -5.510009765625,
        "kl_divergence": 337.119384765625,
        "mean_generated_length": 331.125
    },
    {
        "step": 186,
        "reward": 2.1359176635742188,
        "gold_reward": -5.075927734375,
        "kl_divergence": 315.2025451660156,
        "mean_generated_length": 287.5
    },
    {
        "step": 187,
        "reward": 2.1000900268554688,
        "gold_reward": -5.2525634765625,
        "kl_divergence": 330.3809509277344,
        "mean_generated_length": 324.375
    },
    {
        "step": 188,
        "reward": 2.218780517578125,
        "gold_reward": -5.1748046875,
        "kl_divergence": 341.3247985839844,
        "mean_generated_length": 331.0
    },
    {
        "step": 189,
        "reward": 2.5758209228515625,
        "gold_reward": -5.23406982421875,
        "kl_divergence": 329.09326171875,
        "mean_generated_length": 311.5
    },
    {
        "step": 190,
        "reward": 2.65087890625,
        "gold_reward": -5.15869140625,
        "kl_divergence": 324.2544250488281,
        "mean_generated_length": 299.875
    },
    {
        "step": 191,
        "reward": 2.2248170375823975,
        "gold_reward": -5.187255859375,
        "kl_divergence": 328.9145202636719,
        "mean_generated_length": 314.25
    },
    {
        "step": 192,
        "reward": 2.5211029052734375,
        "gold_reward": -4.98291015625,
        "kl_divergence": 348.9886474609375,
        "mean_generated_length": 336.5
    },
    {
        "step": 193,
        "reward": 1.9401473999023438,
        "gold_reward": -5.073486328125,
        "kl_divergence": 360.6932678222656,
        "mean_generated_length": 357.75
    },
    {
        "step": 194,
        "reward": 2.37396240234375,
        "gold_reward": -5.40380859375,
        "kl_divergence": 344.0591125488281,
        "mean_generated_length": 320.5
    },
    {
        "step": 195,
        "reward": 2.3534622192382812,
        "gold_reward": -5.193115234375,
        "kl_divergence": 337.17498779296875,
        "mean_generated_length": 300.25
    },
    {
        "step": 196,
        "reward": 2.3986778259277344,
        "gold_reward": -5.3251953125,
        "kl_divergence": 353.2698059082031,
        "mean_generated_length": 328.375
    },
    {
        "step": 197,
        "reward": 2.4515457153320312,
        "gold_reward": -4.9111328125,
        "kl_divergence": 352.6933898925781,
        "mean_generated_length": 326.0
    },
    {
        "step": 198,
        "reward": 1.638765573501587,
        "gold_reward": -5.1666412353515625,
        "kl_divergence": 363.493896484375,
        "mean_generated_length": 331.375
    },
    {
        "step": 199,
        "reward": 2.1668472290039062,
        "gold_reward": -5.04541015625,
        "kl_divergence": 377.484375,
        "mean_generated_length": 351.75
    },
    {
        "step": 200,
        "reward": 2.830169677734375,
        "gold_reward": -5.059326171875,
        "kl_divergence": 344.8824462890625,
        "mean_generated_length": 303.5
    },
    {
        "step": 201,
        "reward": 2.0723190307617188,
        "gold_reward": -5.011474609375,
        "kl_divergence": 374.8004455566406,
        "mean_generated_length": 346.125
    },
    {
        "step": 202,
        "reward": 2.6361083984375,
        "gold_reward": -5.1435546875,
        "kl_divergence": 351.2232360839844,
        "mean_generated_length": 307.75
    },
    {
        "step": 203,
        "reward": 2.475605010986328,
        "gold_reward": -5.2333984375,
        "kl_divergence": 339.2806396484375,
        "mean_generated_length": 290.375
    },
    {
        "step": 204,
        "reward": 2.255443572998047,
        "gold_reward": -4.899925231933594,
        "kl_divergence": 365.1799011230469,
        "mean_generated_length": 323.625
    },
    {
        "step": 205,
        "reward": 2.7246780395507812,
        "gold_reward": -5.1556396484375,
        "kl_divergence": 360.306884765625,
        "mean_generated_length": 300.0
    },
    {
        "step": 206,
        "reward": 2.323359489440918,
        "gold_reward": -5.0771484375,
        "kl_divergence": 381.2403564453125,
        "mean_generated_length": 334.25
    },
    {
        "step": 207,
        "reward": 2.320270538330078,
        "gold_reward": -5.17462158203125,
        "kl_divergence": 385.0135498046875,
        "mean_generated_length": 338.5
    },
    {
        "step": 208,
        "reward": 2.268824577331543,
        "gold_reward": -4.9029541015625,
        "kl_divergence": 368.8460693359375,
        "mean_generated_length": 311.125
    },
    {
        "step": 209,
        "reward": 2.1982269287109375,
        "gold_reward": -5.2431640625,
        "kl_divergence": 377.64495849609375,
        "mean_generated_length": 321.75
    },
    {
        "step": 210,
        "reward": 2.424509048461914,
        "gold_reward": -5.320068359375,
        "kl_divergence": 356.6574401855469,
        "mean_generated_length": 298.875
    },
    {
        "step": 211,
        "reward": 1.7306451797485352,
        "gold_reward": -5.38037109375,
        "kl_divergence": 392.0487976074219,
        "mean_generated_length": 347.0
    },
    {
        "step": 212,
        "reward": 2.0528335571289062,
        "gold_reward": -5.272216796875,
        "kl_divergence": 363.7330017089844,
        "mean_generated_length": 310.5
    },
    {
        "step": 213,
        "reward": 2.803274154663086,
        "gold_reward": -5.022735595703125,
        "kl_divergence": 369.61773681640625,
        "mean_generated_length": 315.375
    },
    {
        "step": 214,
        "reward": 2.5705490112304688,
        "gold_reward": -5.25390625,
        "kl_divergence": 360.1537170410156,
        "mean_generated_length": 309.125
    },
    {
        "step": 215,
        "reward": 2.690526008605957,
        "gold_reward": -5.01593017578125,
        "kl_divergence": 374.71356201171875,
        "mean_generated_length": 322.0
    },
    {
        "step": 216,
        "reward": 2.747467041015625,
        "gold_reward": -5.04345703125,
        "kl_divergence": 366.1783447265625,
        "mean_generated_length": 312.25
    },
    {
        "step": 217,
        "reward": 2.7236480712890625,
        "gold_reward": -4.9322509765625,
        "kl_divergence": 374.9069519042969,
        "mean_generated_length": 317.625
    },
    {
        "step": 218,
        "reward": 2.656254291534424,
        "gold_reward": -4.89892578125,
        "kl_divergence": 364.4350891113281,
        "mean_generated_length": 310.5
    },
    {
        "step": 219,
        "reward": 2.6626338958740234,
        "gold_reward": -5.07635498046875,
        "kl_divergence": 363.8337097167969,
        "mean_generated_length": 324.75
    },
    {
        "step": 220,
        "reward": 2.3700180053710938,
        "gold_reward": -5.222900390625,
        "kl_divergence": 372.0167236328125,
        "mean_generated_length": 326.25
    },
    {
        "step": 221,
        "reward": 2.9475154876708984,
        "gold_reward": -5.106201171875,
        "kl_divergence": 328.77447509765625,
        "mean_generated_length": 276.125
    },
    {
        "step": 222,
        "reward": 2.84283447265625,
        "gold_reward": -4.8359375,
        "kl_divergence": 369.0907287597656,
        "mean_generated_length": 315.125
    },
    {
        "step": 223,
        "reward": 2.6730384826660156,
        "gold_reward": -5.21533203125,
        "kl_divergence": 360.6263427734375,
        "mean_generated_length": 305.375
    },
    {
        "step": 224,
        "reward": 2.9520416259765625,
        "gold_reward": -5.015869140625,
        "kl_divergence": 360.73870849609375,
        "mean_generated_length": 307.625
    },
    {
        "step": 225,
        "reward": 2.4822120666503906,
        "gold_reward": -5.272216796875,
        "kl_divergence": 374.05499267578125,
        "mean_generated_length": 328.0
    },
    {
        "step": 226,
        "reward": 2.8971405029296875,
        "gold_reward": -4.8516845703125,
        "kl_divergence": 378.95050048828125,
        "mean_generated_length": 326.875
    },
    {
        "step": 227,
        "reward": 2.0703125,
        "gold_reward": -5.17529296875,
        "kl_divergence": 370.6949157714844,
        "mean_generated_length": 337.375
    },
    {
        "step": 228,
        "reward": 2.75616455078125,
        "gold_reward": -5.221435546875,
        "kl_divergence": 360.75799560546875,
        "mean_generated_length": 312.75
    },
    {
        "step": 229,
        "reward": 2.6782803535461426,
        "gold_reward": -5.174560546875,
        "kl_divergence": 373.26171875,
        "mean_generated_length": 328.5
    },
    {
        "step": 230,
        "reward": 2.7158355712890625,
        "gold_reward": -5.2100830078125,
        "kl_divergence": 370.7182922363281,
        "mean_generated_length": 314.625
    },
    {
        "step": 231,
        "reward": 2.8988895416259766,
        "gold_reward": -5.076171875,
        "kl_divergence": 340.6924743652344,
        "mean_generated_length": 284.25
    },
    {
        "step": 232,
        "reward": 2.6645431518554688,
        "gold_reward": -5.39111328125,
        "kl_divergence": 348.14337158203125,
        "mean_generated_length": 291.0
    },
    {
        "step": 233,
        "reward": 2.5977630615234375,
        "gold_reward": -5.245361328125,
        "kl_divergence": 373.89141845703125,
        "mean_generated_length": 321.25
    },
    {
        "step": 234,
        "reward": 2.2370834350585938,
        "gold_reward": -5.2010498046875,
        "kl_divergence": 363.3055419921875,
        "mean_generated_length": 314.5
    },
    {
        "step": 235,
        "reward": 2.2335472106933594,
        "gold_reward": -5.15869140625,
        "kl_divergence": 388.7057800292969,
        "mean_generated_length": 349.125
    },
    {
        "step": 236,
        "reward": 2.5517578125,
        "gold_reward": -4.751953125,
        "kl_divergence": 419.9791259765625,
        "mean_generated_length": 398.375
    }
]