[
    {
        "step": 0,
        "reward": -3.7819061279296875,
        "gold_reward": -3.13934326171875,
        "kl_divergence": 0.0,
        "mean_generated_length": 133.28125
    },
    {
        "step": 1,
        "reward": -3.4259490966796875,
        "gold_reward": -2.917926788330078,
        "kl_divergence": 0.0,
        "mean_generated_length": 127.515625
    },
    {
        "step": 2,
        "reward": -3.513702392578125,
        "gold_reward": -3.0561695098876953,
        "kl_divergence": -0.06574708223342896,
        "mean_generated_length": 90.859375
    },
    {
        "step": 3,
        "reward": -4.092342376708984,
        "gold_reward": -3.2760772705078125,
        "kl_divergence": 0.03126636892557144,
        "mean_generated_length": 125.140625
    },
    {
        "step": 4,
        "reward": -3.2615737915039062,
        "gold_reward": -3.114715576171875,
        "kl_divergence": -0.03418457508087158,
        "mean_generated_length": 120.421875
    },
    {
        "step": 5,
        "reward": -4.354850769042969,
        "gold_reward": -3.0415878295898438,
        "kl_divergence": -0.020692460238933563,
        "mean_generated_length": 106.21875
    },
    {
        "step": 6,
        "reward": -3.3458404541015625,
        "gold_reward": -3.10858154296875,
        "kl_divergence": -0.0014219526201486588,
        "mean_generated_length": 134.625
    },
    {
        "step": 7,
        "reward": -3.5504837036132812,
        "gold_reward": -2.6954116821289062,
        "kl_divergence": 0.32839304208755493,
        "mean_generated_length": 127.515625
    },
    {
        "step": 8,
        "reward": -3.1563491821289062,
        "gold_reward": -3.13165283203125,
        "kl_divergence": 0.3269256055355072,
        "mean_generated_length": 130.421875
    },
    {
        "step": 9,
        "reward": -2.62518310546875,
        "gold_reward": -2.3325347900390625,
        "kl_divergence": 0.7904761433601379,
        "mean_generated_length": 129.546875
    },
    {
        "step": 10,
        "reward": -2.1050338745117188,
        "gold_reward": -1.9935340881347656,
        "kl_divergence": 1.6097651720046997,
        "mean_generated_length": 135.078125
    },
    {
        "step": 11,
        "reward": -2.250274658203125,
        "gold_reward": -2.8095245361328125,
        "kl_divergence": 1.8743641376495361,
        "mean_generated_length": 159.3125
    },
    {
        "step": 12,
        "reward": -3.1480712890625,
        "gold_reward": -2.6086578369140625,
        "kl_divergence": 2.526494026184082,
        "mean_generated_length": 150.859375
    },
    {
        "step": 13,
        "reward": -2.4671249389648438,
        "gold_reward": -1.9573097229003906,
        "kl_divergence": 2.4101123809814453,
        "mean_generated_length": 122.703125
    },
    {
        "step": 14,
        "reward": -2.0871124267578125,
        "gold_reward": -2.4429931640625,
        "kl_divergence": 3.5585224628448486,
        "mean_generated_length": 155.703125
    },
    {
        "step": 15,
        "reward": -2.0732955932617188,
        "gold_reward": -2.3351058959960938,
        "kl_divergence": 4.2739481925964355,
        "mean_generated_length": 138.796875
    },
    {
        "step": 16,
        "reward": -2.7676544189453125,
        "gold_reward": -2.488372802734375,
        "kl_divergence": 5.468371391296387,
        "mean_generated_length": 165.484375
    },
    {
        "step": 17,
        "reward": -2.0591354370117188,
        "gold_reward": -2.302154541015625,
        "kl_divergence": 6.631288528442383,
        "mean_generated_length": 186.03125
    },
    {
        "step": 18,
        "reward": -2.2794036865234375,
        "gold_reward": -2.765106201171875,
        "kl_divergence": 7.781429767608643,
        "mean_generated_length": 193.59375
    },
    {
        "step": 19,
        "reward": -2.0827407836914062,
        "gold_reward": -2.7301788330078125,
        "kl_divergence": 9.126982688903809,
        "mean_generated_length": 183.140625
    },
    {
        "step": 20,
        "reward": -2.3363418579101562,
        "gold_reward": -2.6291770935058594,
        "kl_divergence": 9.585342407226562,
        "mean_generated_length": 215.640625
    },
    {
        "step": 21,
        "reward": -1.1642646789550781,
        "gold_reward": -2.1923370361328125,
        "kl_divergence": 12.098411560058594,
        "mean_generated_length": 216.21875
    },
    {
        "step": 22,
        "reward": -2.0763931274414062,
        "gold_reward": -2.4326353073120117,
        "kl_divergence": 14.59981918334961,
        "mean_generated_length": 221.078125
    },
    {
        "step": 23,
        "reward": -1.2132530212402344,
        "gold_reward": -2.1574020385742188,
        "kl_divergence": 15.507266998291016,
        "mean_generated_length": 239.203125
    },
    {
        "step": 24,
        "reward": -2.3427066802978516,
        "gold_reward": -2.919677734375,
        "kl_divergence": 21.150901794433594,
        "mean_generated_length": 248.890625
    },
    {
        "step": 25,
        "reward": -2.0116500854492188,
        "gold_reward": -2.9439592361450195,
        "kl_divergence": 25.744882583618164,
        "mean_generated_length": 302.921875
    },
    {
        "step": 26,
        "reward": -2.547469139099121,
        "gold_reward": -3.16693115234375,
        "kl_divergence": 26.977869033813477,
        "mean_generated_length": 302.484375
    },
    {
        "step": 27,
        "reward": -1.7067241668701172,
        "gold_reward": -2.971343994140625,
        "kl_divergence": 30.01278305053711,
        "mean_generated_length": 329.984375
    },
    {
        "step": 28,
        "reward": -2.1334686279296875,
        "gold_reward": -2.7766952514648438,
        "kl_divergence": 31.213794708251953,
        "mean_generated_length": 287.5
    },
    {
        "step": 29,
        "reward": -2.0851402282714844,
        "gold_reward": -2.847285509109497,
        "kl_divergence": 33.11299133300781,
        "mean_generated_length": 324.375
    },
    {
        "step": 30,
        "reward": -2.3170013427734375,
        "gold_reward": -3.1126708984375,
        "kl_divergence": 34.165008544921875,
        "mean_generated_length": 331.0
    },
    {
        "step": 31,
        "reward": -2.072723388671875,
        "gold_reward": -3.0223388671875,
        "kl_divergence": 29.533536911010742,
        "mean_generated_length": 311.5
    },
    {
        "step": 32,
        "reward": -1.6219654083251953,
        "gold_reward": -2.9187774658203125,
        "kl_divergence": 35.5937385559082,
        "mean_generated_length": 299.875
    },
    {
        "step": 33,
        "reward": -2.040630340576172,
        "gold_reward": -3.1440048217773438,
        "kl_divergence": 37.15123748779297,
        "mean_generated_length": 314.25
    },
    {
        "step": 34,
        "reward": -2.1103744506835938,
        "gold_reward": -2.90814208984375,
        "kl_divergence": 38.738677978515625,
        "mean_generated_length": 336.5
    },
    {
        "step": 35,
        "reward": -2.5220673084259033,
        "gold_reward": -2.819255828857422,
        "kl_divergence": 43.215721130371094,
        "mean_generated_length": 357.75
    },
    {
        "step": 36,
        "reward": -1.8814659118652344,
        "gold_reward": -3.2107162475585938,
        "kl_divergence": 37.526763916015625,
        "mean_generated_length": 320.5
    },
    {
        "step": 37,
        "reward": -1.272613525390625,
        "gold_reward": -2.850982666015625,
        "kl_divergence": 41.28461456298828,
        "mean_generated_length": 300.25
    },
    {
        "step": 38,
        "reward": -2.0567398071289062,
        "gold_reward": -3.2938270568847656,
        "kl_divergence": 42.96792984008789,
        "mean_generated_length": 328.375
    },
    {
        "step": 39,
        "reward": -1.9741973876953125,
        "gold_reward": -3.0832738876342773,
        "kl_divergence": 42.788387298583984,
        "mean_generated_length": 326.0
    },
    {
        "step": 40,
        "reward": -2.5066986083984375,
        "gold_reward": -3.0472564697265625,
        "kl_divergence": 50.45222473144531,
        "mean_generated_length": 331.375
    },
    {
        "step": 41,
        "reward": -2.0732040405273438,
        "gold_reward": -2.845489501953125,
        "kl_divergence": 51.49625778198242,
        "mean_generated_length": 350.859375
    },
    {
        "step": 42,
        "reward": -2.2271671295166016,
        "gold_reward": -2.97088623046875,
        "kl_divergence": 45.804100036621094,
        "mean_generated_length": 303.5
    },
    {
        "step": 43,
        "reward": -2.1200103759765625,
        "gold_reward": -3.0122528076171875,
        "kl_divergence": 50.167911529541016,
        "mean_generated_length": 346.125
    },
    {
        "step": 44,
        "reward": -1.5593719482421875,
        "gold_reward": -3.1394004821777344,
        "kl_divergence": 48.85436248779297,
        "mean_generated_length": 307.75
    },
    {
        "step": 45,
        "reward": -1.849853515625,
        "gold_reward": -2.9785614013671875,
        "kl_divergence": 50.3258171081543,
        "mean_generated_length": 290.375
    },
    {
        "step": 46,
        "reward": -1.4573135375976562,
        "gold_reward": -2.8257904052734375,
        "kl_divergence": 57.518463134765625,
        "mean_generated_length": 323.625
    },
    {
        "step": 47,
        "reward": -1.8106918334960938,
        "gold_reward": -2.8936309814453125,
        "kl_divergence": 53.22980499267578,
        "mean_generated_length": 300.0
    },
    {
        "step": 48,
        "reward": -1.0555305480957031,
        "gold_reward": -2.45184326171875,
        "kl_divergence": 62.87526321411133,
        "mean_generated_length": 334.25
    },
    {
        "step": 49,
        "reward": -1.5781288146972656,
        "gold_reward": -2.6233367919921875,
        "kl_divergence": 59.227447509765625,
        "mean_generated_length": 338.5
    },
    {
        "step": 50,
        "reward": -1.60296630859375,
        "gold_reward": -2.1937103271484375,
        "kl_divergence": 59.35350799560547,
        "mean_generated_length": 311.125
    },
    {
        "step": 51,
        "reward": -0.9210052490234375,
        "gold_reward": -2.7776145935058594,
        "kl_divergence": 59.28642272949219,
        "mean_generated_length": 321.75
    },
    {
        "step": 52,
        "reward": -2.4327239990234375,
        "gold_reward": -3.2397003173828125,
        "kl_divergence": 59.472557067871094,
        "mean_generated_length": 298.875
    },
    {
        "step": 53,
        "reward": -1.566192626953125,
        "gold_reward": -2.5842437744140625,
        "kl_divergence": 62.547061920166016,
        "mean_generated_length": 347.0
    },
    {
        "step": 54,
        "reward": -1.2175521850585938,
        "gold_reward": -2.925506591796875,
        "kl_divergence": 59.660377502441406,
        "mean_generated_length": 310.5
    },
    {
        "step": 55,
        "reward": -0.9484367370605469,
        "gold_reward": -2.6105422973632812,
        "kl_divergence": 62.57239532470703,
        "mean_generated_length": 315.375
    },
    {
        "step": 56,
        "reward": -1.5375442504882812,
        "gold_reward": -2.685394287109375,
        "kl_divergence": 69.40283966064453,
        "mean_generated_length": 309.125
    },
    {
        "step": 57,
        "reward": -0.94580078125,
        "gold_reward": -2.508087158203125,
        "kl_divergence": 70.74317169189453,
        "mean_generated_length": 322.0
    },
    {
        "step": 58,
        "reward": -0.43926239013671875,
        "gold_reward": -2.404052734375,
        "kl_divergence": 64.24354553222656,
        "mean_generated_length": 312.25
    },
    {
        "step": 59,
        "reward": -0.8094635009765625,
        "gold_reward": -2.454193115234375,
        "kl_divergence": 67.78392028808594,
        "mean_generated_length": 317.625
    },
    {
        "step": 60,
        "reward": -1.2752361297607422,
        "gold_reward": -2.705596923828125,
        "kl_divergence": 74.19155883789062,
        "mean_generated_length": 310.5
    },
    {
        "step": 61,
        "reward": -0.6111106872558594,
        "gold_reward": -2.5192642211914062,
        "kl_divergence": 71.34595489501953,
        "mean_generated_length": 324.75
    },
    {
        "step": 62,
        "reward": -0.7860641479492188,
        "gold_reward": -2.792877197265625,
        "kl_divergence": 77.30517578125,
        "mean_generated_length": 326.25
    },
    {
        "step": 63,
        "reward": -1.3222579956054688,
        "gold_reward": -2.7019290924072266,
        "kl_divergence": 69.26483154296875,
        "mean_generated_length": 276.125
    },
    {
        "step": 64,
        "reward": -0.5928688049316406,
        "gold_reward": -2.3477020263671875,
        "kl_divergence": 72.30479431152344,
        "mean_generated_length": 315.125
    },
    {
        "step": 65,
        "reward": -0.7110741138458252,
        "gold_reward": -3.022735595703125,
        "kl_divergence": 74.0479736328125,
        "mean_generated_length": 305.375
    },
    {
        "step": 66,
        "reward": -1.0270423889160156,
        "gold_reward": -2.58880615234375,
        "kl_divergence": 72.15008544921875,
        "mean_generated_length": 307.625
    },
    {
        "step": 67,
        "reward": -1.1127510070800781,
        "gold_reward": -3.0275421142578125,
        "kl_divergence": 77.9384536743164,
        "mean_generated_length": 328.0
    },
    {
        "step": 68,
        "reward": -0.3427734375,
        "gold_reward": -2.277581214904785,
        "kl_divergence": 80.84729766845703,
        "mean_generated_length": 326.875
    },
    {
        "step": 69,
        "reward": -1.3336758613586426,
        "gold_reward": -2.781940460205078,
        "kl_divergence": 79.46217346191406,
        "mean_generated_length": 337.375
    },
    {
        "step": 70,
        "reward": -1.057342529296875,
        "gold_reward": -3.2210745811462402,
        "kl_divergence": 75.14295959472656,
        "mean_generated_length": 312.75
    },
    {
        "step": 71,
        "reward": -0.6152496337890625,
        "gold_reward": -2.97454833984375,
        "kl_divergence": 81.88915252685547,
        "mean_generated_length": 326.859375
    },
    {
        "step": 72,
        "reward": -0.2796134948730469,
        "gold_reward": -2.926942825317383,
        "kl_divergence": 80.98907470703125,
        "mean_generated_length": 314.625
    },
    {
        "step": 73,
        "reward": -1.2977752685546875,
        "gold_reward": -2.737215995788574,
        "kl_divergence": 76.52064514160156,
        "mean_generated_length": 284.25
    },
    {
        "step": 74,
        "reward": -1.4839096069335938,
        "gold_reward": -3.1575393676757812,
        "kl_divergence": 69.75015258789062,
        "mean_generated_length": 291.0
    },
    {
        "step": 75,
        "reward": -0.49750518798828125,
        "gold_reward": -2.843994140625,
        "kl_divergence": 84.36320495605469,
        "mean_generated_length": 321.25
    },
    {
        "step": 76,
        "reward": -0.8251113891601562,
        "gold_reward": -2.9545440673828125,
        "kl_divergence": 75.86161804199219,
        "mean_generated_length": 314.5
    },
    {
        "step": 77,
        "reward": -0.73248291015625,
        "gold_reward": -2.7118682861328125,
        "kl_divergence": 84.00872802734375,
        "mean_generated_length": 349.125
    },
    {
        "step": 78,
        "reward": -0.755615234375,
        "gold_reward": -1.26513671875,
        "kl_divergence": 95.68502807617188,
        "mean_generated_length": 398.375
    },
    {
        "step": 79,
        "reward": -1.3177070617675781,
        "gold_reward": -3.1566162109375,
        "kl_divergence": 81.13113403320312,
        "mean_generated_length": 311.25
    },
    {
        "step": 80,
        "reward": -0.5402458310127258,
        "gold_reward": -2.6951804161071777,
        "kl_divergence": 84.8049545288086,
        "mean_generated_length": 314.25
    },
    {
        "step": 81,
        "reward": -0.18748092651367188,
        "gold_reward": -2.3675689697265625,
        "kl_divergence": 87.30410766601562,
        "mean_generated_length": 313.625
    },
    {
        "step": 82,
        "reward": -1.1123237609863281,
        "gold_reward": -3.08642578125,
        "kl_divergence": 88.4014892578125,
        "mean_generated_length": 321.125
    },
    {
        "step": 83,
        "reward": -0.9756660461425781,
        "gold_reward": -3.2275543212890625,
        "kl_divergence": 92.1693344116211,
        "mean_generated_length": 303.625
    },
    {
        "step": 84,
        "reward": -1.6420669555664062,
        "gold_reward": -3.2106056213378906,
        "kl_divergence": 89.12419128417969,
        "mean_generated_length": 306.375
    },
    {
        "step": 85,
        "reward": -0.6364879608154297,
        "gold_reward": -3.22967529296875,
        "kl_divergence": 88.44365692138672,
        "mean_generated_length": 316.375
    },
    {
        "step": 86,
        "reward": -0.7939834594726562,
        "gold_reward": -3.2413330078125,
        "kl_divergence": 93.0672836303711,
        "mean_generated_length": 310.875
    },
    {
        "step": 87,
        "reward": -0.3438262939453125,
        "gold_reward": -3.2654075622558594,
        "kl_divergence": 91.60282897949219,
        "mean_generated_length": 309.5
    },
    {
        "step": 88,
        "reward": -0.5550460815429688,
        "gold_reward": -2.80072021484375,
        "kl_divergence": 105.07473754882812,
        "mean_generated_length": 343.75
    },
    {
        "step": 89,
        "reward": -0.3325767517089844,
        "gold_reward": -2.4632301330566406,
        "kl_divergence": 95.05772399902344,
        "mean_generated_length": 317.0
    },
    {
        "step": 90,
        "reward": -0.29824066162109375,
        "gold_reward": -3.1620712280273438,
        "kl_divergence": 103.03790283203125,
        "mean_generated_length": 306.75
    },
    {
        "step": 91,
        "reward": -0.6925697326660156,
        "gold_reward": -3.013397216796875,
        "kl_divergence": 94.22360229492188,
        "mean_generated_length": 302.375
    },
    {
        "step": 92,
        "reward": -0.2848052978515625,
        "gold_reward": -2.149261474609375,
        "kl_divergence": 101.95038604736328,
        "mean_generated_length": 296.25
    },
    {
        "step": 93,
        "reward": -0.46456146240234375,
        "gold_reward": -2.9550323486328125,
        "kl_divergence": 116.03105926513672,
        "mean_generated_length": 335.25
    },
    {
        "step": 94,
        "reward": -0.7337150573730469,
        "gold_reward": -3.2588043212890625,
        "kl_divergence": 117.51570129394531,
        "mean_generated_length": 335.875
    },
    {
        "step": 95,
        "reward": -0.11110305786132812,
        "gold_reward": -2.9491424560546875,
        "kl_divergence": 110.94712829589844,
        "mean_generated_length": 323.625
    },
    {
        "step": 96,
        "reward": -0.22314453125,
        "gold_reward": -3.0429000854492188,
        "kl_divergence": 124.3670425415039,
        "mean_generated_length": 344.375
    },
    {
        "step": 97,
        "reward": -0.4252338409423828,
        "gold_reward": -3.672760009765625,
        "kl_divergence": 109.84358215332031,
        "mean_generated_length": 300.25
    },
    {
        "step": 98,
        "reward": -0.59283447265625,
        "gold_reward": -3.5985260009765625,
        "kl_divergence": 123.39794921875,
        "mean_generated_length": 328.875
    },
    {
        "step": 99,
        "reward": -0.77679443359375,
        "gold_reward": -3.4651031494140625,
        "kl_divergence": 127.50980377197266,
        "mean_generated_length": 316.625
    },
    {
        "step": 100,
        "reward": 0.3566093444824219,
        "gold_reward": -3.1171875,
        "kl_divergence": 127.82723236083984,
        "mean_generated_length": 327.5
    },
    {
        "step": 101,
        "reward": -0.6972122192382812,
        "gold_reward": -3.2853527069091797,
        "kl_divergence": 139.499755859375,
        "mean_generated_length": 323.875
    },
    {
        "step": 102,
        "reward": 0.3032386600971222,
        "gold_reward": -2.9802093505859375,
        "kl_divergence": 130.49562072753906,
        "mean_generated_length": 317.375
    },
    {
        "step": 103,
        "reward": -0.10321044921875,
        "gold_reward": -3.5365700721740723,
        "kl_divergence": 135.11746215820312,
        "mean_generated_length": 300.25
    },
    {
        "step": 104,
        "reward": 0.425750732421875,
        "gold_reward": -3.593109130859375,
        "kl_divergence": 151.88865661621094,
        "mean_generated_length": 323.125
    },
    {
        "step": 105,
        "reward": 0.08437538146972656,
        "gold_reward": -3.650787353515625,
        "kl_divergence": 140.2801055908203,
        "mean_generated_length": 303.25
    },
    {
        "step": 106,
        "reward": -0.038570404052734375,
        "gold_reward": -3.886554718017578,
        "kl_divergence": 161.41941833496094,
        "mean_generated_length": 331.125
    },
    {
        "step": 107,
        "reward": -0.1141357421875,
        "gold_reward": -3.5301666259765625,
        "kl_divergence": 141.85760498046875,
        "mean_generated_length": 287.5
    },
    {
        "step": 108,
        "reward": -0.0352020263671875,
        "gold_reward": -3.73828125,
        "kl_divergence": 169.77210998535156,
        "mean_generated_length": 324.375
    },
    {
        "step": 109,
        "reward": 0.3617269992828369,
        "gold_reward": -3.599853515625,
        "kl_divergence": 181.82666015625,
        "mean_generated_length": 331.0
    },
    {
        "step": 110,
        "reward": 0.8049983978271484,
        "gold_reward": -3.6313323974609375,
        "kl_divergence": 164.704345703125,
        "mean_generated_length": 311.5
    },
    {
        "step": 111,
        "reward": 0.3961677551269531,
        "gold_reward": -3.629150390625,
        "kl_divergence": 178.64512634277344,
        "mean_generated_length": 299.875
    },
    {
        "step": 112,
        "reward": 0.8579168319702148,
        "gold_reward": -3.784332275390625,
        "kl_divergence": 193.64788818359375,
        "mean_generated_length": 314.25
    },
    {
        "step": 113,
        "reward": 1.2176895141601562,
        "gold_reward": -3.5613250732421875,
        "kl_divergence": 197.07089233398438,
        "mean_generated_length": 336.5
    },
    {
        "step": 114,
        "reward": 0.7190093994140625,
        "gold_reward": -3.8080332279205322,
        "kl_divergence": 224.2685089111328,
        "mean_generated_length": 357.75
    },
    {
        "step": 115,
        "reward": 0.6636209487915039,
        "gold_reward": -4.152996063232422,
        "kl_divergence": 197.23312377929688,
        "mean_generated_length": 320.5
    },
    {
        "step": 116,
        "reward": 0.5241546630859375,
        "gold_reward": -4.214599609375,
        "kl_divergence": 200.12588500976562,
        "mean_generated_length": 300.25
    },
    {
        "step": 117,
        "reward": 0.6400222778320312,
        "gold_reward": -4.296875,
        "kl_divergence": 219.61793518066406,
        "mean_generated_length": 328.375
    },
    {
        "step": 118,
        "reward": 1.2062759399414062,
        "gold_reward": -4.169677734375,
        "kl_divergence": 221.87139892578125,
        "mean_generated_length": 326.0
    },
    {
        "step": 119,
        "reward": 1.1050262451171875,
        "gold_reward": -4.26129150390625,
        "kl_divergence": 232.84364318847656,
        "mean_generated_length": 331.375
    },
    {
        "step": 120,
        "reward": 0.677978515625,
        "gold_reward": -4.0735321044921875,
        "kl_divergence": 244.2340850830078,
        "mean_generated_length": 351.75
    },
    {
        "step": 121,
        "reward": 1.0027542114257812,
        "gold_reward": -4.396125793457031,
        "kl_divergence": 233.21546936035156,
        "mean_generated_length": 303.5
    },
    {
        "step": 122,
        "reward": 0.914703369140625,
        "gold_reward": -4.24200439453125,
        "kl_divergence": 246.34385681152344,
        "mean_generated_length": 346.125
    },
    {
        "step": 123,
        "reward": 0.6543426513671875,
        "gold_reward": -4.2613372802734375,
        "kl_divergence": 240.06674194335938,
        "mean_generated_length": 307.03125
    },
    {
        "step": 124,
        "reward": 1.644866943359375,
        "gold_reward": -4.1187744140625,
        "kl_divergence": 243.4801788330078,
        "mean_generated_length": 290.375
    },
    {
        "step": 125,
        "reward": 1.902449369430542,
        "gold_reward": -4.21673583984375,
        "kl_divergence": 275.4496765136719,
        "mean_generated_length": 323.625
    },
    {
        "step": 126,
        "reward": 1.7610015869140625,
        "gold_reward": -3.99102783203125,
        "kl_divergence": 257.0357666015625,
        "mean_generated_length": 300.0
    },
    {
        "step": 127,
        "reward": 2.0329437255859375,
        "gold_reward": -4.0466156005859375,
        "kl_divergence": 319.75164794921875,
        "mean_generated_length": 334.25
    },
    {
        "step": 128,
        "reward": 1.63555908203125,
        "gold_reward": -4.216705322265625,
        "kl_divergence": 330.8160400390625,
        "mean_generated_length": 338.5
    },
    {
        "step": 129,
        "reward": 2.6813125610351562,
        "gold_reward": -3.6004180908203125,
        "kl_divergence": 320.3967590332031,
        "mean_generated_length": 311.125
    },
    {
        "step": 130,
        "reward": 2.449127197265625,
        "gold_reward": -4.2777557373046875,
        "kl_divergence": 337.0917663574219,
        "mean_generated_length": 321.75
    },
    {
        "step": 131,
        "reward": 2.757598876953125,
        "gold_reward": -4.363037109375,
        "kl_divergence": 331.00921630859375,
        "mean_generated_length": 298.875
    },
    {
        "step": 132,
        "reward": 3.079132080078125,
        "gold_reward": -4.37298583984375,
        "kl_divergence": 404.7656555175781,
        "mean_generated_length": 347.0
    },
    {
        "step": 133,
        "reward": 2.4431304931640625,
        "gold_reward": -4.420955657958984,
        "kl_divergence": 372.1763610839844,
        "mean_generated_length": 310.5
    },
    {
        "step": 134,
        "reward": 3.123809814453125,
        "gold_reward": -4.053098678588867,
        "kl_divergence": 389.1864318847656,
        "mean_generated_length": 315.375
    },
    {
        "step": 135,
        "reward": 2.6300926208496094,
        "gold_reward": -4.319587707519531,
        "kl_divergence": 387.4532775878906,
        "mean_generated_length": 309.125
    },
    {
        "step": 136,
        "reward": 3.39483642578125,
        "gold_reward": -4.09454345703125,
        "kl_divergence": 399.96783447265625,
        "mean_generated_length": 322.0
    },
    {
        "step": 137,
        "reward": 3.2964248657226562,
        "gold_reward": -4.38616943359375,
        "kl_divergence": 404.7757568359375,
        "mean_generated_length": 312.25
    },
    {
        "step": 138,
        "reward": 3.013671875,
        "gold_reward": -4.288669586181641,
        "kl_divergence": 415.1665954589844,
        "mean_generated_length": 317.625
    },
    {
        "step": 139,
        "reward": 2.7659459114074707,
        "gold_reward": -4.242919921875,
        "kl_divergence": 416.1379699707031,
        "mean_generated_length": 310.5
    },
    {
        "step": 140,
        "reward": 2.90838623046875,
        "gold_reward": -4.7412109375,
        "kl_divergence": 418.4808044433594,
        "mean_generated_length": 324.75
    },
    {
        "step": 141,
        "reward": 2.6402664184570312,
        "gold_reward": -4.67572021484375,
        "kl_divergence": 427.25335693359375,
        "mean_generated_length": 326.25
    },
    {
        "step": 142,
        "reward": 2.5832061767578125,
        "gold_reward": -4.392730712890625,
        "kl_divergence": 366.96197509765625,
        "mean_generated_length": 276.125
    },
    {
        "step": 143,
        "reward": 2.7317962646484375,
        "gold_reward": -4.3397216796875,
        "kl_divergence": 411.5089416503906,
        "mean_generated_length": 315.125
    },
    {
        "step": 144,
        "reward": 2.2086029052734375,
        "gold_reward": -4.8421630859375,
        "kl_divergence": 388.84686279296875,
        "mean_generated_length": 305.375
    },
    {
        "step": 145,
        "reward": 2.6564254760742188,
        "gold_reward": -4.5548095703125,
        "kl_divergence": 405.852783203125,
        "mean_generated_length": 307.625
    },
    {
        "step": 146,
        "reward": 2.4709701538085938,
        "gold_reward": -4.66949462890625,
        "kl_divergence": 412.84405517578125,
        "mean_generated_length": 328.0
    },
    {
        "step": 147,
        "reward": 2.70001220703125,
        "gold_reward": -4.333984375,
        "kl_divergence": 406.9353942871094,
        "mean_generated_length": 326.875
    },
    {
        "step": 148,
        "reward": 2.5425796508789062,
        "gold_reward": -4.5887451171875,
        "kl_divergence": 410.5934143066406,
        "mean_generated_length": 337.375
    },
    {
        "step": 149,
        "reward": 2.786731719970703,
        "gold_reward": -4.6629638671875,
        "kl_divergence": 384.9271240234375,
        "mean_generated_length": 312.75
    },
    {
        "step": 150,
        "reward": 2.730419158935547,
        "gold_reward": -4.5584716796875,
        "kl_divergence": 381.2274169921875,
        "mean_generated_length": 328.5
    },
    {
        "step": 151,
        "reward": 2.7021331787109375,
        "gold_reward": -4.56982421875,
        "kl_divergence": 372.88897705078125,
        "mean_generated_length": 314.625
    },
    {
        "step": 152,
        "reward": 2.4999008178710938,
        "gold_reward": -4.5552215576171875,
        "kl_divergence": 338.0427551269531,
        "mean_generated_length": 284.25
    },
    {
        "step": 153,
        "reward": 2.939361572265625,
        "gold_reward": -4.726531982421875,
        "kl_divergence": 336.97723388671875,
        "mean_generated_length": 291.0
    },
    {
        "step": 154,
        "reward": 3.346923828125,
        "gold_reward": -4.611328125,
        "kl_divergence": 369.823974609375,
        "mean_generated_length": 321.25
    },
    {
        "step": 155,
        "reward": 3.078125,
        "gold_reward": -4.744873046875,
        "kl_divergence": 363.96453857421875,
        "mean_generated_length": 314.5
    },
    {
        "step": 156,
        "reward": 3.4393720626831055,
        "gold_reward": -4.40155029296875,
        "kl_divergence": 393.5535888671875,
        "mean_generated_length": 349.125
    },
    {
        "step": 157,
        "reward": 3.724609375,
        "gold_reward": -3.4921875,
        "kl_divergence": 414.80401611328125,
        "mean_generated_length": 398.375
    },
    {
        "step": 158,
        "reward": 2.8852920532226562,
        "gold_reward": -4.45452880859375,
        "kl_divergence": 368.3681945800781,
        "mean_generated_length": 311.25
    },
    {
        "step": 159,
        "reward": 2.98699951171875,
        "gold_reward": -4.405029296875,
        "kl_divergence": 368.7586975097656,
        "mean_generated_length": 314.25
    },
    {
        "step": 160,
        "reward": 3.499614715576172,
        "gold_reward": -4.28369140625,
        "kl_divergence": 369.3223876953125,
        "mean_generated_length": 313.625
    },
    {
        "step": 161,
        "reward": 3.215301513671875,
        "gold_reward": -4.6453094482421875,
        "kl_divergence": 372.9475402832031,
        "mean_generated_length": 321.125
    },
    {
        "step": 162,
        "reward": 2.4022903442382812,
        "gold_reward": -4.5191650390625,
        "kl_divergence": 350.0946044921875,
        "mean_generated_length": 303.625
    },
    {
        "step": 163,
        "reward": 2.8659210205078125,
        "gold_reward": -4.694091796875,
        "kl_divergence": 355.1318664550781,
        "mean_generated_length": 306.375
    },
    {
        "step": 164,
        "reward": 2.915557861328125,
        "gold_reward": -4.6187744140625,
        "kl_divergence": 366.8314208984375,
        "mean_generated_length": 316.375
    },
    {
        "step": 165,
        "reward": 2.96795654296875,
        "gold_reward": -4.333953857421875,
        "kl_divergence": 360.095947265625,
        "mean_generated_length": 310.875
    },
    {
        "step": 166,
        "reward": 3.0487899780273438,
        "gold_reward": -4.465576171875,
        "kl_divergence": 358.54296875,
        "mean_generated_length": 309.5
    },
    {
        "step": 167,
        "reward": 3.223705291748047,
        "gold_reward": -4.3482666015625,
        "kl_divergence": 386.6618957519531,
        "mean_generated_length": 343.75
    },
    {
        "step": 168,
        "reward": 3.5029678344726562,
        "gold_reward": -4.1407470703125,
        "kl_divergence": 373.90545654296875,
        "mean_generated_length": 317.0
    },
    {
        "step": 169,
        "reward": 3.153167724609375,
        "gold_reward": -4.653564453125,
        "kl_divergence": 373.697021484375,
        "mean_generated_length": 306.75
    },
    {
        "step": 170,
        "reward": 3.4093017578125,
        "gold_reward": -4.479220390319824,
        "kl_divergence": 381.0126953125,
        "mean_generated_length": 302.375
    },
    {
        "step": 171,
        "reward": 3.341400146484375,
        "gold_reward": -4.05804443359375,
        "kl_divergence": 376.91876220703125,
        "mean_generated_length": 296.25
    },
    {
        "step": 172,
        "reward": 3.1295166015625,
        "gold_reward": -4.277130126953125,
        "kl_divergence": 418.80853271484375,
        "mean_generated_length": 335.25
    },
    {
        "step": 173,
        "reward": 3.496490478515625,
        "gold_reward": -4.39178466796875,
        "kl_divergence": 417.12286376953125,
        "mean_generated_length": 335.875
    },
    {
        "step": 174,
        "reward": 3.676422119140625,
        "gold_reward": -4.1758270263671875,
        "kl_divergence": 415.8942565917969,
        "mean_generated_length": 323.625
    },
    {
        "step": 175,
        "reward": 3.655303955078125,
        "gold_reward": -4.36383056640625,
        "kl_divergence": 432.2750244140625,
        "mean_generated_length": 344.375
    },
    {
        "step": 176,
        "reward": 3.20281982421875,
        "gold_reward": -4.703948974609375,
        "kl_divergence": 390.82373046875,
        "mean_generated_length": 300.25
    },
    {
        "step": 177,
        "reward": 3.3389453887939453,
        "gold_reward": -4.500946044921875,
        "kl_divergence": 423.6775817871094,
        "mean_generated_length": 328.875
    },
    {
        "step": 178,
        "reward": 3.765777587890625,
        "gold_reward": -4.52593994140625,
        "kl_divergence": 408.679443359375,
        "mean_generated_length": 316.625
    },
    {
        "step": 179,
        "reward": 3.334228515625,
        "gold_reward": -4.4034423828125,
        "kl_divergence": 426.445068359375,
        "mean_generated_length": 327.5
    },
    {
        "step": 180,
        "reward": 3.4605560302734375,
        "gold_reward": -4.1201934814453125,
        "kl_divergence": 429.8206481933594,
        "mean_generated_length": 323.875
    },
    {
        "step": 181,
        "reward": 4.201549530029297,
        "gold_reward": -4.06005859375,
        "kl_divergence": 419.72454833984375,
        "mean_generated_length": 317.375
    },
    {
        "step": 182,
        "reward": 3.9432296752929688,
        "gold_reward": -4.409423828125,
        "kl_divergence": 403.30023193359375,
        "mean_generated_length": 300.25
    },
    {
        "step": 183,
        "reward": 4.0543365478515625,
        "gold_reward": -4.35400390625,
        "kl_divergence": 432.4171142578125,
        "mean_generated_length": 323.125
    },
    {
        "step": 184,
        "reward": 3.6418991088867188,
        "gold_reward": -4.4400634765625,
        "kl_divergence": 411.84661865234375,
        "mean_generated_length": 303.25
    },
    {
        "step": 185,
        "reward": 3.648223876953125,
        "gold_reward": -4.67138671875,
        "kl_divergence": 439.5282897949219,
        "mean_generated_length": 331.125
    },
    {
        "step": 186,
        "reward": 3.6965713500976562,
        "gold_reward": -4.5565185546875,
        "kl_divergence": 400.632080078125,
        "mean_generated_length": 287.5
    },
    {
        "step": 187,
        "reward": 3.7454833984375,
        "gold_reward": -4.504852294921875,
        "kl_divergence": 430.02044677734375,
        "mean_generated_length": 324.375
    },
    {
        "step": 188,
        "reward": 4.0631866455078125,
        "gold_reward": -4.30517578125,
        "kl_divergence": 461.145751953125,
        "mean_generated_length": 331.0
    },
    {
        "step": 189,
        "reward": 3.7291488647460938,
        "gold_reward": -4.40911865234375,
        "kl_divergence": 430.3556823730469,
        "mean_generated_length": 311.5
    },
    {
        "step": 190,
        "reward": 4.35076904296875,
        "gold_reward": -4.291893005371094,
        "kl_divergence": 415.07183837890625,
        "mean_generated_length": 299.875
    },
    {
        "step": 191,
        "reward": 4.104461669921875,
        "gold_reward": -4.23565673828125,
        "kl_divergence": 434.03399658203125,
        "mean_generated_length": 314.25
    },
    {
        "step": 192,
        "reward": 3.77850341796875,
        "gold_reward": -4.1895904541015625,
        "kl_divergence": 451.0206298828125,
        "mean_generated_length": 336.5
    },
    {
        "step": 193,
        "reward": 3.6138076782226562,
        "gold_reward": -4.3038330078125,
        "kl_divergence": 482.15814208984375,
        "mean_generated_length": 357.75
    },
    {
        "step": 194,
        "reward": 3.4476318359375,
        "gold_reward": -4.5009765625,
        "kl_divergence": 438.80987548828125,
        "mean_generated_length": 320.5
    },
    {
        "step": 195,
        "reward": 4.07574462890625,
        "gold_reward": -4.5423583984375,
        "kl_divergence": 425.9739685058594,
        "mean_generated_length": 300.25
    },
    {
        "step": 196,
        "reward": 3.914031982421875,
        "gold_reward": -4.681640625,
        "kl_divergence": 441.1669921875,
        "mean_generated_length": 328.375
    },
    {
        "step": 197,
        "reward": 3.723052978515625,
        "gold_reward": -4.3116455078125,
        "kl_divergence": 431.76953125,
        "mean_generated_length": 326.0
    },
    {
        "step": 198,
        "reward": 3.5338134765625,
        "gold_reward": -4.5093994140625,
        "kl_divergence": 448.29425048828125,
        "mean_generated_length": 331.375
    },
    {
        "step": 199,
        "reward": 3.7634429931640625,
        "gold_reward": -4.2066650390625,
        "kl_divergence": 472.1225891113281,
        "mean_generated_length": 351.75
    },
    {
        "step": 200,
        "reward": 4.3496246337890625,
        "gold_reward": -4.29681396484375,
        "kl_divergence": 424.911865234375,
        "mean_generated_length": 303.5
    },
    {
        "step": 201,
        "reward": 3.698028564453125,
        "gold_reward": -4.2444610595703125,
        "kl_divergence": 477.8759460449219,
        "mean_generated_length": 346.125
    },
    {
        "step": 202,
        "reward": 3.96746826171875,
        "gold_reward": -4.402740478515625,
        "kl_divergence": 424.3114318847656,
        "mean_generated_length": 307.75
    },
    {
        "step": 203,
        "reward": 3.576446533203125,
        "gold_reward": -4.55255126953125,
        "kl_divergence": 411.6820068359375,
        "mean_generated_length": 290.375
    },
    {
        "step": 204,
        "reward": 4.4696044921875,
        "gold_reward": -4.3489990234375,
        "kl_divergence": 449.7628479003906,
        "mean_generated_length": 323.625
    },
    {
        "step": 205,
        "reward": 4.003173828125,
        "gold_reward": -4.377998352050781,
        "kl_divergence": 426.9632263183594,
        "mean_generated_length": 300.0
    },
    {
        "step": 206,
        "reward": 4.0959320068359375,
        "gold_reward": -4.3236083984375,
        "kl_divergence": 460.3015441894531,
        "mean_generated_length": 334.25
    },
    {
        "step": 207,
        "reward": 3.816986083984375,
        "gold_reward": -4.405555725097656,
        "kl_divergence": 471.07196044921875,
        "mean_generated_length": 338.5
    },
    {
        "step": 208,
        "reward": 4.602081298828125,
        "gold_reward": -4.095977783203125,
        "kl_divergence": 445.56732177734375,
        "mean_generated_length": 311.125
    },
    {
        "step": 209,
        "reward": 3.8791236877441406,
        "gold_reward": -4.466670989990234,
        "kl_divergence": 444.76043701171875,
        "mean_generated_length": 321.75
    },
    {
        "step": 210,
        "reward": 4.30572509765625,
        "gold_reward": -4.640472412109375,
        "kl_divergence": 419.84661865234375,
        "mean_generated_length": 298.875
    },
    {
        "step": 211,
        "reward": 4.029899597167969,
        "gold_reward": -4.579833984375,
        "kl_divergence": 466.463134765625,
        "mean_generated_length": 347.0
    },
    {
        "step": 212,
        "reward": 4.2491455078125,
        "gold_reward": -4.543357849121094,
        "kl_divergence": 435.0924072265625,
        "mean_generated_length": 310.5
    },
    {
        "step": 213,
        "reward": 4.55657958984375,
        "gold_reward": -4.34228515625,
        "kl_divergence": 437.6314392089844,
        "mean_generated_length": 315.375
    },
    {
        "step": 214,
        "reward": 4.26910400390625,
        "gold_reward": -4.45703125,
        "kl_divergence": 439.9007873535156,
        "mean_generated_length": 309.125
    },
    {
        "step": 215,
        "reward": 4.20294189453125,
        "gold_reward": -4.263824462890625,
        "kl_divergence": 450.94207763671875,
        "mean_generated_length": 322.0
    },
    {
        "step": 216,
        "reward": 4.701812744140625,
        "gold_reward": -4.2859649658203125,
        "kl_divergence": 436.13214111328125,
        "mean_generated_length": 312.25
    },
    {
        "step": 217,
        "reward": 4.36676025390625,
        "gold_reward": -4.115966796875,
        "kl_divergence": 446.9356689453125,
        "mean_generated_length": 317.625
    },
    {
        "step": 218,
        "reward": 4.34417724609375,
        "gold_reward": -4.3060302734375,
        "kl_divergence": 435.48187255859375,
        "mean_generated_length": 310.5
    },
    {
        "step": 219,
        "reward": 4.7816009521484375,
        "gold_reward": -4.4764404296875,
        "kl_divergence": 443.4503173828125,
        "mean_generated_length": 324.75
    },
    {
        "step": 220,
        "reward": 4.4515380859375,
        "gold_reward": -4.4239501953125,
        "kl_divergence": 451.35009765625,
        "mean_generated_length": 326.25
    },
    {
        "step": 221,
        "reward": 4.399664878845215,
        "gold_reward": -4.31866455078125,
        "kl_divergence": 390.5686950683594,
        "mean_generated_length": 276.125
    },
    {
        "step": 222,
        "reward": 4.80609130859375,
        "gold_reward": -4.108245849609375,
        "kl_divergence": 435.9721374511719,
        "mean_generated_length": 315.125
    },
    {
        "step": 223,
        "reward": 4.503631591796875,
        "gold_reward": -4.542938232421875,
        "kl_divergence": 429.51141357421875,
        "mean_generated_length": 305.375
    },
    {
        "step": 224,
        "reward": 4.187774658203125,
        "gold_reward": -4.2862548828125,
        "kl_divergence": 429.477294921875,
        "mean_generated_length": 307.625
    },
    {
        "step": 225,
        "reward": 4.4229583740234375,
        "gold_reward": -4.630859375,
        "kl_divergence": 459.20770263671875,
        "mean_generated_length": 328.0
    },
    {
        "step": 226,
        "reward": 4.7064056396484375,
        "gold_reward": -4.034675598144531,
        "kl_divergence": 446.2923583984375,
        "mean_generated_length": 326.875
    },
    {
        "step": 227,
        "reward": 4.210689544677734,
        "gold_reward": -4.2806396484375,
        "kl_divergence": 451.52313232421875,
        "mean_generated_length": 337.375
    },
    {
        "step": 228,
        "reward": 4.74444580078125,
        "gold_reward": -4.4183349609375,
        "kl_divergence": 434.1015625,
        "mean_generated_length": 312.75
    },
    {
        "step": 229,
        "reward": 4.64013671875,
        "gold_reward": -4.3499755859375,
        "kl_divergence": 451.1446533203125,
        "mean_generated_length": 328.5
    },
    {
        "step": 230,
        "reward": 4.5001220703125,
        "gold_reward": -4.3485107421875,
        "kl_divergence": 435.6737365722656,
        "mean_generated_length": 314.625
    },
    {
        "step": 231,
        "reward": 4.978349685668945,
        "gold_reward": -4.5985107421875,
        "kl_divergence": 408.346923828125,
        "mean_generated_length": 284.25
    },
    {
        "step": 232,
        "reward": 4.3121490478515625,
        "gold_reward": -4.5927734375,
        "kl_divergence": 407.4488830566406,
        "mean_generated_length": 291.0
    },
    {
        "step": 233,
        "reward": 4.521781921386719,
        "gold_reward": -4.4715576171875,
        "kl_divergence": 444.37420654296875,
        "mean_generated_length": 321.25
    },
    {
        "step": 234,
        "reward": 5.02093505859375,
        "gold_reward": -4.30224609375,
        "kl_divergence": 431.75921630859375,
        "mean_generated_length": 314.5
    },
    {
        "step": 235,
        "reward": 4.5863037109375,
        "gold_reward": -4.260986328125,
        "kl_divergence": 477.1019287109375,
        "mean_generated_length": 349.125
    },
    {
        "step": 236,
        "reward": 4.705078125,
        "gold_reward": -3.6064453125,
        "kl_divergence": 507.7486877441406,
        "mean_generated_length": 398.375
    }
]