[
    {
        "step": 0,
        "reward": -3.7819061279296875,
        "gold_reward": -3.13934326171875,
        "kl_divergence": 0.0,
        "mean_generated_length": 133.28125
    },
    {
        "step": 1,
        "reward": -3.4259490966796875,
        "gold_reward": -2.917926788330078,
        "kl_divergence": 0.0,
        "mean_generated_length": 127.515625
    },
    {
        "step": 2,
        "reward": -3.4557647705078125,
        "gold_reward": -3.137094497680664,
        "kl_divergence": -0.06192222237586975,
        "mean_generated_length": 91.34375
    },
    {
        "step": 3,
        "reward": -4.172115325927734,
        "gold_reward": -3.4256744384765625,
        "kl_divergence": -0.012747849337756634,
        "mean_generated_length": 125.515625
    },
    {
        "step": 4,
        "reward": -3.134307861328125,
        "gold_reward": -3.080108642578125,
        "kl_divergence": -0.018795805051922798,
        "mean_generated_length": 126.25
    },
    {
        "step": 5,
        "reward": -4.5013427734375,
        "gold_reward": -3.3180694580078125,
        "kl_divergence": -0.07664982974529266,
        "mean_generated_length": 115.453125
    },
    {
        "step": 6,
        "reward": -3.739227294921875,
        "gold_reward": -3.44891357421875,
        "kl_divergence": 0.11282814294099808,
        "mean_generated_length": 138.40625
    },
    {
        "step": 7,
        "reward": -3.5193519592285156,
        "gold_reward": -2.6213455200195312,
        "kl_divergence": 0.16902847588062286,
        "mean_generated_length": 129.375
    },
    {
        "step": 8,
        "reward": -2.6635284423828125,
        "gold_reward": -3.116943359375,
        "kl_divergence": 0.6301347613334656,
        "mean_generated_length": 146.625
    },
    {
        "step": 9,
        "reward": -2.497344970703125,
        "gold_reward": -2.227703094482422,
        "kl_divergence": 1.0311249494552612,
        "mean_generated_length": 135.390625
    },
    {
        "step": 10,
        "reward": -2.17327880859375,
        "gold_reward": -2.0726165771484375,
        "kl_divergence": 1.7827181816101074,
        "mean_generated_length": 136.5
    },
    {
        "step": 11,
        "reward": -2.74847412109375,
        "gold_reward": -2.936473846435547,
        "kl_divergence": 1.9652719497680664,
        "mean_generated_length": 148.65625
    },
    {
        "step": 12,
        "reward": -2.8463134765625,
        "gold_reward": -2.6583251953125,
        "kl_divergence": 1.7317962646484375,
        "mean_generated_length": 142.3125
    },
    {
        "step": 13,
        "reward": -2.430744171142578,
        "gold_reward": -1.825408935546875,
        "kl_divergence": 3.2793493270874023,
        "mean_generated_length": 128.109375
    },
    {
        "step": 14,
        "reward": -2.402587890625,
        "gold_reward": -2.540283203125,
        "kl_divergence": 3.2417454719543457,
        "mean_generated_length": 145.5625
    },
    {
        "step": 15,
        "reward": -2.112091064453125,
        "gold_reward": -2.329204559326172,
        "kl_divergence": 4.512903213500977,
        "mean_generated_length": 144.390625
    },
    {
        "step": 16,
        "reward": -2.8175888061523438,
        "gold_reward": -2.416778564453125,
        "kl_divergence": 4.427425384521484,
        "mean_generated_length": 147.59375
    },
    {
        "step": 17,
        "reward": -2.3721237182617188,
        "gold_reward": -2.5572509765625,
        "kl_divergence": 4.500034809112549,
        "mean_generated_length": 146.609375
    },
    {
        "step": 18,
        "reward": -2.6672210693359375,
        "gold_reward": -2.7639808654785156,
        "kl_divergence": 6.052210330963135,
        "mean_generated_length": 169.59375
    },
    {
        "step": 19,
        "reward": -2.3202781677246094,
        "gold_reward": -2.6734771728515625,
        "kl_divergence": 6.83812141418457,
        "mean_generated_length": 152.421875
    },
    {
        "step": 20,
        "reward": -2.6334304809570312,
        "gold_reward": -2.6780548095703125,
        "kl_divergence": 9.294983863830566,
        "mean_generated_length": 168.515625
    },
    {
        "step": 21,
        "reward": -1.8364219665527344,
        "gold_reward": -2.7130985260009766,
        "kl_divergence": 6.988122940063477,
        "mean_generated_length": 156.78125
    },
    {
        "step": 22,
        "reward": -2.0258331298828125,
        "gold_reward": -2.224956512451172,
        "kl_divergence": 9.244950294494629,
        "mean_generated_length": 166.046875
    },
    {
        "step": 23,
        "reward": -1.4397201538085938,
        "gold_reward": -2.239818572998047,
        "kl_divergence": 9.794267654418945,
        "mean_generated_length": 156.765625
    },
    {
        "step": 24,
        "reward": -2.330150604248047,
        "gold_reward": -2.3578033447265625,
        "kl_divergence": 12.967016220092773,
        "mean_generated_length": 177.21875
    },
    {
        "step": 25,
        "reward": -1.8361473083496094,
        "gold_reward": -1.9292726516723633,
        "kl_divergence": 12.700794219970703,
        "mean_generated_length": 200.25
    },
    {
        "step": 26,
        "reward": -2.5854949951171875,
        "gold_reward": -2.665064811706543,
        "kl_divergence": 19.108694076538086,
        "mean_generated_length": 208.03125
    },
    {
        "step": 27,
        "reward": -2.217041015625,
        "gold_reward": -2.6472396850585938,
        "kl_divergence": 16.368091583251953,
        "mean_generated_length": 229.78125
    },
    {
        "step": 28,
        "reward": -1.8408317565917969,
        "gold_reward": -2.4618453979492188,
        "kl_divergence": 20.397035598754883,
        "mean_generated_length": 201.609375
    },
    {
        "step": 29,
        "reward": -1.7100677490234375,
        "gold_reward": -2.4744632244110107,
        "kl_divergence": 19.594091415405273,
        "mean_generated_length": 218.453125
    },
    {
        "step": 30,
        "reward": -1.9094619750976562,
        "gold_reward": -2.34246826171875,
        "kl_divergence": 21.7932186126709,
        "mean_generated_length": 219.9375
    },
    {
        "step": 31,
        "reward": -1.9214248657226562,
        "gold_reward": -2.7897138595581055,
        "kl_divergence": 17.937467575073242,
        "mean_generated_length": 230.265625
    },
    {
        "step": 32,
        "reward": -2.4449844360351562,
        "gold_reward": -2.3519439697265625,
        "kl_divergence": 19.378755569458008,
        "mean_generated_length": 183.671875
    },
    {
        "step": 33,
        "reward": -1.6714630126953125,
        "gold_reward": -2.24755859375,
        "kl_divergence": 16.099750518798828,
        "mean_generated_length": 184.71875
    },
    {
        "step": 34,
        "reward": -2.5757675170898438,
        "gold_reward": -2.3348388671875,
        "kl_divergence": 14.26315689086914,
        "mean_generated_length": 162.34375
    },
    {
        "step": 35,
        "reward": -2.7738876342773438,
        "gold_reward": -2.571239471435547,
        "kl_divergence": 11.838269233703613,
        "mean_generated_length": 156.6875
    },
    {
        "step": 36,
        "reward": -1.64691162109375,
        "gold_reward": -2.912464141845703,
        "kl_divergence": 11.837576866149902,
        "mean_generated_length": 172.1875
    },
    {
        "step": 37,
        "reward": -2.1360034942626953,
        "gold_reward": -2.352752685546875,
        "kl_divergence": 10.815688133239746,
        "mean_generated_length": 130.546875
    },
    {
        "step": 38,
        "reward": -2.5881805419921875,
        "gold_reward": -3.0418038368225098,
        "kl_divergence": 12.178363800048828,
        "mean_generated_length": 142.78125
    },
    {
        "step": 39,
        "reward": -3.0094985961914062,
        "gold_reward": -2.9492340087890625,
        "kl_divergence": 9.994701385498047,
        "mean_generated_length": 128.40625
    },
    {
        "step": 40,
        "reward": -3.038055419921875,
        "gold_reward": -2.8866500854492188,
        "kl_divergence": 10.820741653442383,
        "mean_generated_length": 122.90625
    },
    {
        "step": 41,
        "reward": -3.0072193145751953,
        "gold_reward": -2.8528900146484375,
        "kl_divergence": 11.750295639038086,
        "mean_generated_length": 137.25
    },
    {
        "step": 42,
        "reward": -2.472412109375,
        "gold_reward": -2.6167755126953125,
        "kl_divergence": 12.098933219909668,
        "mean_generated_length": 127.71875
    },
    {
        "step": 43,
        "reward": -3.10003662109375,
        "gold_reward": -2.8574371337890625,
        "kl_divergence": 10.425408363342285,
        "mean_generated_length": 120.546875
    },
    {
        "step": 44,
        "reward": -2.286224365234375,
        "gold_reward": -2.521209716796875,
        "kl_divergence": 12.347084999084473,
        "mean_generated_length": 123.390625
    },
    {
        "step": 45,
        "reward": -2.7389068603515625,
        "gold_reward": -3.019744873046875,
        "kl_divergence": 13.170923233032227,
        "mean_generated_length": 124.109375
    },
    {
        "step": 46,
        "reward": -2.4725112915039062,
        "gold_reward": -2.9786376953125,
        "kl_divergence": 14.419679641723633,
        "mean_generated_length": 131.84375
    },
    {
        "step": 47,
        "reward": -2.4735946655273438,
        "gold_reward": -2.6213912963867188,
        "kl_divergence": 12.060996055603027,
        "mean_generated_length": 137.3125
    },
    {
        "step": 48,
        "reward": -2.362154483795166,
        "gold_reward": -2.5670166015625,
        "kl_divergence": 12.431221008300781,
        "mean_generated_length": 116.140625
    },
    {
        "step": 49,
        "reward": -3.09356689453125,
        "gold_reward": -2.61627197265625,
        "kl_divergence": 12.92031192779541,
        "mean_generated_length": 139.734375
    },
    {
        "step": 50,
        "reward": -1.7708816528320312,
        "gold_reward": -2.361419677734375,
        "kl_divergence": 12.709328651428223,
        "mean_generated_length": 116.3125
    },
    {
        "step": 51,
        "reward": -2.1961936950683594,
        "gold_reward": -2.59930419921875,
        "kl_divergence": 15.62201976776123,
        "mean_generated_length": 157.890625
    },
    {
        "step": 52,
        "reward": -2.35128116607666,
        "gold_reward": -2.4735107421875,
        "kl_divergence": 11.507019996643066,
        "mean_generated_length": 137.484375
    },
    {
        "step": 53,
        "reward": -2.3929214477539062,
        "gold_reward": -2.5784912109375,
        "kl_divergence": 14.473258972167969,
        "mean_generated_length": 146.671875
    },
    {
        "step": 54,
        "reward": -2.8546218872070312,
        "gold_reward": -3.160980224609375,
        "kl_divergence": 15.174439430236816,
        "mean_generated_length": 150.515625
    },
    {
        "step": 55,
        "reward": -1.6731529235839844,
        "gold_reward": -2.042247772216797,
        "kl_divergence": 14.545564651489258,
        "mean_generated_length": 142.265625
    },
    {
        "step": 56,
        "reward": -2.8743362426757812,
        "gold_reward": -2.6347341537475586,
        "kl_divergence": 15.613090515136719,
        "mean_generated_length": 154.5625
    },
    {
        "step": 57,
        "reward": -2.3312225341796875,
        "gold_reward": -2.5666275024414062,
        "kl_divergence": 13.137279510498047,
        "mean_generated_length": 134.453125
    },
    {
        "step": 58,
        "reward": -2.3820419311523438,
        "gold_reward": -2.490447998046875,
        "kl_divergence": 15.824666976928711,
        "mean_generated_length": 139.8125
    },
    {
        "step": 59,
        "reward": -2.596405029296875,
        "gold_reward": -2.5332024097442627,
        "kl_divergence": 14.299356460571289,
        "mean_generated_length": 133.65625
    },
    {
        "step": 60,
        "reward": -1.529754638671875,
        "gold_reward": -1.9988336563110352,
        "kl_divergence": 14.197118759155273,
        "mean_generated_length": 127.84375
    },
    {
        "step": 61,
        "reward": -2.3995132446289062,
        "gold_reward": -2.6514549255371094,
        "kl_divergence": 17.257122039794922,
        "mean_generated_length": 159.328125
    },
    {
        "step": 62,
        "reward": -2.19242000579834,
        "gold_reward": -2.507375717163086,
        "kl_divergence": 15.967135429382324,
        "mean_generated_length": 147.59375
    },
    {
        "step": 63,
        "reward": -2.409130334854126,
        "gold_reward": -2.683013916015625,
        "kl_divergence": 13.941971778869629,
        "mean_generated_length": 140.296875
    },
    {
        "step": 64,
        "reward": -2.715648651123047,
        "gold_reward": -2.36474609375,
        "kl_divergence": 15.249353408813477,
        "mean_generated_length": 151.3125
    },
    {
        "step": 65,
        "reward": -2.7482833862304688,
        "gold_reward": -2.990386962890625,
        "kl_divergence": 16.276824951171875,
        "mean_generated_length": 162.9375
    },
    {
        "step": 66,
        "reward": -2.616596221923828,
        "gold_reward": -2.487359046936035,
        "kl_divergence": 14.6727294921875,
        "mean_generated_length": 146.625
    },
    {
        "step": 67,
        "reward": -2.107349395751953,
        "gold_reward": -3.1778907775878906,
        "kl_divergence": 21.304841995239258,
        "mean_generated_length": 200.78125
    },
    {
        "step": 68,
        "reward": -1.1567230224609375,
        "gold_reward": -2.042461395263672,
        "kl_divergence": 15.148998260498047,
        "mean_generated_length": 146.734375
    },
    {
        "step": 69,
        "reward": -2.3631591796875,
        "gold_reward": -2.8462982177734375,
        "kl_divergence": 15.277461051940918,
        "mean_generated_length": 139.0
    },
    {
        "step": 70,
        "reward": -1.8223419189453125,
        "gold_reward": -2.9577829837799072,
        "kl_divergence": 15.048384666442871,
        "mean_generated_length": 154.0625
    },
    {
        "step": 71,
        "reward": -2.7319793701171875,
        "gold_reward": -2.8594818115234375,
        "kl_divergence": 17.10859489440918,
        "mean_generated_length": 167.0625
    },
    {
        "step": 72,
        "reward": -1.9005889892578125,
        "gold_reward": -2.5055532455444336,
        "kl_divergence": 17.01399803161621,
        "mean_generated_length": 155.828125
    },
    {
        "step": 73,
        "reward": -2.541748046875,
        "gold_reward": -2.694009780883789,
        "kl_divergence": 15.354207992553711,
        "mean_generated_length": 139.015625
    },
    {
        "step": 74,
        "reward": -2.236736297607422,
        "gold_reward": -2.7914505004882812,
        "kl_divergence": 16.31041145324707,
        "mean_generated_length": 160.078125
    },
    {
        "step": 75,
        "reward": -1.7880783081054688,
        "gold_reward": -3.036712646484375,
        "kl_divergence": 16.41405487060547,
        "mean_generated_length": 142.953125
    },
    {
        "step": 76,
        "reward": -2.8268485069274902,
        "gold_reward": -2.4060134887695312,
        "kl_divergence": 18.1934757232666,
        "mean_generated_length": 170.578125
    },
    {
        "step": 77,
        "reward": -1.9605560302734375,
        "gold_reward": -2.3233642578125,
        "kl_divergence": 17.73800277709961,
        "mean_generated_length": 175.53125
    },
    {
        "step": 78,
        "reward": -0.994140625,
        "gold_reward": -1.974853515625,
        "kl_divergence": 21.335933685302734,
        "mean_generated_length": 169.0
    },
    {
        "step": 79,
        "reward": -2.2771530151367188,
        "gold_reward": -2.8069305419921875,
        "kl_divergence": 20.055543899536133,
        "mean_generated_length": 169.421875
    },
    {
        "step": 80,
        "reward": -1.9737300872802734,
        "gold_reward": -2.5808334350585938,
        "kl_divergence": 21.899898529052734,
        "mean_generated_length": 169.484375
    },
    {
        "step": 81,
        "reward": -1.6870193481445312,
        "gold_reward": -2.327953338623047,
        "kl_divergence": 16.09941291809082,
        "mean_generated_length": 129.84375
    },
    {
        "step": 82,
        "reward": -2.402566909790039,
        "gold_reward": -2.79080867767334,
        "kl_divergence": 22.815181732177734,
        "mean_generated_length": 175.296875
    },
    {
        "step": 83,
        "reward": -1.9832000732421875,
        "gold_reward": -2.6975152492523193,
        "kl_divergence": 23.462966918945312,
        "mean_generated_length": 155.984375
    },
    {
        "step": 84,
        "reward": -2.7272186279296875,
        "gold_reward": -2.8794937133789062,
        "kl_divergence": 22.882627487182617,
        "mean_generated_length": 169.21875
    },
    {
        "step": 85,
        "reward": -2.794830322265625,
        "gold_reward": -3.240814208984375,
        "kl_divergence": 22.56417465209961,
        "mean_generated_length": 162.546875
    },
    {
        "step": 86,
        "reward": -2.078510284423828,
        "gold_reward": -2.3920135498046875,
        "kl_divergence": 17.457324981689453,
        "mean_generated_length": 139.28125
    },
    {
        "step": 87,
        "reward": -1.7645950317382812,
        "gold_reward": -2.7940101623535156,
        "kl_divergence": 19.213186264038086,
        "mean_generated_length": 149.5
    },
    {
        "step": 88,
        "reward": -1.992041826248169,
        "gold_reward": -2.41473388671875,
        "kl_divergence": 23.114484786987305,
        "mean_generated_length": 157.71875
    },
    {
        "step": 89,
        "reward": -1.2415523529052734,
        "gold_reward": -1.86279296875,
        "kl_divergence": 20.496822357177734,
        "mean_generated_length": 146.5625
    },
    {
        "step": 90,
        "reward": -1.9642319679260254,
        "gold_reward": -2.8855743408203125,
        "kl_divergence": 20.952831268310547,
        "mean_generated_length": 165.796875
    },
    {
        "step": 91,
        "reward": -2.28826904296875,
        "gold_reward": -2.80133056640625,
        "kl_divergence": 20.402063369750977,
        "mean_generated_length": 145.3125
    },
    {
        "step": 92,
        "reward": -1.58502197265625,
        "gold_reward": -1.835723876953125,
        "kl_divergence": 15.46832275390625,
        "mean_generated_length": 117.703125
    },
    {
        "step": 93,
        "reward": -2.2752532958984375,
        "gold_reward": -2.5872955322265625,
        "kl_divergence": 19.019914627075195,
        "mean_generated_length": 151.0
    },
    {
        "step": 94,
        "reward": -2.1809310913085938,
        "gold_reward": -2.42193603515625,
        "kl_divergence": 20.00354766845703,
        "mean_generated_length": 134.265625
    },
    {
        "step": 95,
        "reward": -2.2184600830078125,
        "gold_reward": -2.261516571044922,
        "kl_divergence": 16.36690902709961,
        "mean_generated_length": 154.390625
    },
    {
        "step": 96,
        "reward": -2.0280380249023438,
        "gold_reward": -2.4621124267578125,
        "kl_divergence": 17.615095138549805,
        "mean_generated_length": 146.5625
    },
    {
        "step": 97,
        "reward": -1.9800148010253906,
        "gold_reward": -2.7673873901367188,
        "kl_divergence": 16.00107765197754,
        "mean_generated_length": 162.203125
    },
    {
        "step": 98,
        "reward": -1.4296798706054688,
        "gold_reward": -2.6554489135742188,
        "kl_divergence": 18.87293815612793,
        "mean_generated_length": 156.46875
    },
    {
        "step": 99,
        "reward": -2.840496063232422,
        "gold_reward": -3.0231246948242188,
        "kl_divergence": 20.418048858642578,
        "mean_generated_length": 159.59375
    },
    {
        "step": 100,
        "reward": -1.0341415405273438,
        "gold_reward": -2.310089111328125,
        "kl_divergence": 16.623577117919922,
        "mean_generated_length": 144.40625
    },
    {
        "step": 101,
        "reward": -1.903411865234375,
        "gold_reward": -2.3592166900634766,
        "kl_divergence": 20.2552433013916,
        "mean_generated_length": 138.78125
    },
    {
        "step": 102,
        "reward": -1.7660179138183594,
        "gold_reward": -2.323394775390625,
        "kl_divergence": 14.726091384887695,
        "mean_generated_length": 126.546875
    },
    {
        "step": 103,
        "reward": -2.4309158325195312,
        "gold_reward": -2.6847381591796875,
        "kl_divergence": 18.105182647705078,
        "mean_generated_length": 136.21875
    },
    {
        "step": 104,
        "reward": -2.3301239013671875,
        "gold_reward": -2.8464088439941406,
        "kl_divergence": 19.15607261657715,
        "mean_generated_length": 134.3125
    },
    {
        "step": 105,
        "reward": -2.4840087890625,
        "gold_reward": -2.822601318359375,
        "kl_divergence": 15.167254447937012,
        "mean_generated_length": 131.40625
    },
    {
        "step": 106,
        "reward": -2.128814697265625,
        "gold_reward": -2.882935047149658,
        "kl_divergence": 17.868499755859375,
        "mean_generated_length": 155.21875
    },
    {
        "step": 107,
        "reward": -1.7478256225585938,
        "gold_reward": -2.3446578979492188,
        "kl_divergence": 17.9224910736084,
        "mean_generated_length": 134.15625
    },
    {
        "step": 108,
        "reward": -1.7037620544433594,
        "gold_reward": -2.5429372787475586,
        "kl_divergence": 15.874563217163086,
        "mean_generated_length": 146.25
    },
    {
        "step": 109,
        "reward": -2.22515869140625,
        "gold_reward": -2.5626220703125,
        "kl_divergence": 16.405380249023438,
        "mean_generated_length": 137.671875
    },
    {
        "step": 110,
        "reward": -1.9567413330078125,
        "gold_reward": -2.632568359375,
        "kl_divergence": 18.032947540283203,
        "mean_generated_length": 155.453125
    },
    {
        "step": 111,
        "reward": -2.162109375,
        "gold_reward": -2.401153564453125,
        "kl_divergence": 17.843961715698242,
        "mean_generated_length": 138.8125
    },
    {
        "step": 112,
        "reward": -1.5248003005981445,
        "gold_reward": -2.103710174560547,
        "kl_divergence": 16.7564640045166,
        "mean_generated_length": 141.546875
    },
    {
        "step": 113,
        "reward": -1.8081893920898438,
        "gold_reward": -2.1608171463012695,
        "kl_divergence": 17.84823989868164,
        "mean_generated_length": 144.296875
    },
    {
        "step": 114,
        "reward": -1.9800262451171875,
        "gold_reward": -2.4413318634033203,
        "kl_divergence": 16.001819610595703,
        "mean_generated_length": 128.515625
    },
    {
        "step": 115,
        "reward": -1.6835967302322388,
        "gold_reward": -2.8310089111328125,
        "kl_divergence": 21.50342559814453,
        "mean_generated_length": 174.25
    },
    {
        "step": 116,
        "reward": -1.4602394104003906,
        "gold_reward": -2.6290283203125,
        "kl_divergence": 17.861177444458008,
        "mean_generated_length": 136.4375
    },
    {
        "step": 117,
        "reward": -1.6930465698242188,
        "gold_reward": -2.862823486328125,
        "kl_divergence": 20.642147064208984,
        "mean_generated_length": 173.140625
    },
    {
        "step": 118,
        "reward": -1.781092643737793,
        "gold_reward": -2.2650070190429688,
        "kl_divergence": 19.315631866455078,
        "mean_generated_length": 169.390625
    },
    {
        "step": 119,
        "reward": -2.20355224609375,
        "gold_reward": -2.4360923767089844,
        "kl_divergence": 18.811588287353516,
        "mean_generated_length": 156.984375
    },
    {
        "step": 120,
        "reward": -2.4365386962890625,
        "gold_reward": -2.5204811096191406,
        "kl_divergence": 18.858882904052734,
        "mean_generated_length": 165.609375
    },
    {
        "step": 121,
        "reward": -1.6362953186035156,
        "gold_reward": -2.4320526123046875,
        "kl_divergence": 16.975317001342773,
        "mean_generated_length": 160.484375
    },
    {
        "step": 122,
        "reward": -2.0037689208984375,
        "gold_reward": -2.5073699951171875,
        "kl_divergence": 17.450969696044922,
        "mean_generated_length": 152.09375
    },
    {
        "step": 123,
        "reward": -1.5628662109375,
        "gold_reward": -2.4045867919921875,
        "kl_divergence": 15.903162956237793,
        "mean_generated_length": 154.4375
    },
    {
        "step": 124,
        "reward": -2.07452392578125,
        "gold_reward": -2.772369384765625,
        "kl_divergence": 15.719780921936035,
        "mean_generated_length": 162.109375
    },
    {
        "step": 125,
        "reward": -1.931159496307373,
        "gold_reward": -2.5152320861816406,
        "kl_divergence": 16.149898529052734,
        "mean_generated_length": 147.421875
    },
    {
        "step": 126,
        "reward": -1.9759635925292969,
        "gold_reward": -2.461334228515625,
        "kl_divergence": 16.161592483520508,
        "mean_generated_length": 132.703125
    },
    {
        "step": 127,
        "reward": -1.5628938674926758,
        "gold_reward": -2.363862991333008,
        "kl_divergence": 18.65277671813965,
        "mean_generated_length": 140.328125
    },
    {
        "step": 128,
        "reward": -2.043720245361328,
        "gold_reward": -2.5420684814453125,
        "kl_divergence": 17.497108459472656,
        "mean_generated_length": 148.328125
    },
    {
        "step": 129,
        "reward": -1.9996700286865234,
        "gold_reward": -2.2177810668945312,
        "kl_divergence": 14.359062194824219,
        "mean_generated_length": 110.421875
    },
    {
        "step": 130,
        "reward": -1.8825225830078125,
        "gold_reward": -2.639984130859375,
        "kl_divergence": 17.37171173095703,
        "mean_generated_length": 149.5
    },
    {
        "step": 131,
        "reward": -1.236968994140625,
        "gold_reward": -2.515411376953125,
        "kl_divergence": 16.201921463012695,
        "mean_generated_length": 141.984375
    },
    {
        "step": 132,
        "reward": -2.1213932037353516,
        "gold_reward": -2.539703369140625,
        "kl_divergence": 14.578705787658691,
        "mean_generated_length": 137.640625
    },
    {
        "step": 133,
        "reward": -2.2777862548828125,
        "gold_reward": -2.946887969970703,
        "kl_divergence": 15.911895751953125,
        "mean_generated_length": 145.328125
    },
    {
        "step": 134,
        "reward": -1.597625732421875,
        "gold_reward": -2.212738037109375,
        "kl_divergence": 13.96709156036377,
        "mean_generated_length": 133.140625
    },
    {
        "step": 135,
        "reward": -1.995452880859375,
        "gold_reward": -2.6003494262695312,
        "kl_divergence": 16.498626708984375,
        "mean_generated_length": 146.65625
    },
    {
        "step": 136,
        "reward": -1.3443870544433594,
        "gold_reward": -2.0632781982421875,
        "kl_divergence": 14.681485176086426,
        "mean_generated_length": 117.375
    },
    {
        "step": 137,
        "reward": -1.9638023376464844,
        "gold_reward": -2.239593505859375,
        "kl_divergence": 14.971813201904297,
        "mean_generated_length": 129.078125
    },
    {
        "step": 138,
        "reward": -1.8418731689453125,
        "gold_reward": -2.23831844329834,
        "kl_divergence": 14.598995208740234,
        "mean_generated_length": 123.15625
    },
    {
        "step": 139,
        "reward": -1.9535255432128906,
        "gold_reward": -2.30047607421875,
        "kl_divergence": 15.283469200134277,
        "mean_generated_length": 127.5
    },
    {
        "step": 140,
        "reward": -1.693756103515625,
        "gold_reward": -2.398622512817383,
        "kl_divergence": 16.561702728271484,
        "mean_generated_length": 133.15625
    },
    {
        "step": 141,
        "reward": -1.6011276245117188,
        "gold_reward": -2.46136474609375,
        "kl_divergence": 14.500480651855469,
        "mean_generated_length": 123.90625
    },
    {
        "step": 142,
        "reward": -1.68243408203125,
        "gold_reward": -2.5411949157714844,
        "kl_divergence": 14.604990005493164,
        "mean_generated_length": 126.125
    },
    {
        "step": 143,
        "reward": -2.1450767517089844,
        "gold_reward": -2.3179664611816406,
        "kl_divergence": 16.833768844604492,
        "mean_generated_length": 122.75
    },
    {
        "step": 144,
        "reward": -2.233325481414795,
        "gold_reward": -2.88946533203125,
        "kl_divergence": 14.273448944091797,
        "mean_generated_length": 139.359375
    },
    {
        "step": 145,
        "reward": -2.0244369506835938,
        "gold_reward": -2.361426591873169,
        "kl_divergence": 15.906825065612793,
        "mean_generated_length": 140.4375
    },
    {
        "step": 146,
        "reward": -1.4473628997802734,
        "gold_reward": -2.932668685913086,
        "kl_divergence": 18.280818939208984,
        "mean_generated_length": 153.578125
    },
    {
        "step": 147,
        "reward": -0.9125471115112305,
        "gold_reward": -1.9588623046875,
        "kl_divergence": 15.777445793151855,
        "mean_generated_length": 127.125
    },
    {
        "step": 148,
        "reward": -1.6403579711914062,
        "gold_reward": -2.6902008056640625,
        "kl_divergence": 18.50897216796875,
        "mean_generated_length": 129.03125
    },
    {
        "step": 149,
        "reward": -1.7710418701171875,
        "gold_reward": -2.962554931640625,
        "kl_divergence": 14.698058128356934,
        "mean_generated_length": 122.59375
    },
    {
        "step": 150,
        "reward": -2.2883753776550293,
        "gold_reward": -2.4111328125,
        "kl_divergence": 16.771886825561523,
        "mean_generated_length": 144.65625
    },
    {
        "step": 151,
        "reward": -1.8022353649139404,
        "gold_reward": -2.360107421875,
        "kl_divergence": 16.27882957458496,
        "mean_generated_length": 139.4375
    },
    {
        "step": 152,
        "reward": -2.297870635986328,
        "gold_reward": -2.5606842041015625,
        "kl_divergence": 14.976633071899414,
        "mean_generated_length": 120.40625
    },
    {
        "step": 153,
        "reward": -2.0555801391601562,
        "gold_reward": -2.9009780883789062,
        "kl_divergence": 17.155780792236328,
        "mean_generated_length": 144.84375
    },
    {
        "step": 154,
        "reward": -1.4585838317871094,
        "gold_reward": -2.683258056640625,
        "kl_divergence": 16.1353702545166,
        "mean_generated_length": 124.75
    },
    {
        "step": 155,
        "reward": -2.340686798095703,
        "gold_reward": -2.620880126953125,
        "kl_divergence": 15.265151977539062,
        "mean_generated_length": 142.4375
    },
    {
        "step": 156,
        "reward": -1.3861827850341797,
        "gold_reward": -1.9759674072265625,
        "kl_divergence": 19.224924087524414,
        "mean_generated_length": 155.609375
    },
    {
        "step": 157,
        "reward": -1.2235107421875,
        "gold_reward": -2.5245361328125,
        "kl_divergence": 24.080307006835938,
        "mean_generated_length": 128.0
    },
    {
        "step": 158,
        "reward": -2.2223987579345703,
        "gold_reward": -2.891357421875,
        "kl_divergence": 15.331076622009277,
        "mean_generated_length": 141.953125
    },
    {
        "step": 159,
        "reward": -1.8819561004638672,
        "gold_reward": -2.616483688354492,
        "kl_divergence": 18.640718460083008,
        "mean_generated_length": 146.109375
    },
    {
        "step": 160,
        "reward": -1.80267333984375,
        "gold_reward": -2.324312210083008,
        "kl_divergence": 14.344903945922852,
        "mean_generated_length": 108.578125
    },
    {
        "step": 161,
        "reward": -2.091705322265625,
        "gold_reward": -2.6386795043945312,
        "kl_divergence": 19.2850284576416,
        "mean_generated_length": 154.015625
    },
    {
        "step": 162,
        "reward": -1.5039219856262207,
        "gold_reward": -2.6556854248046875,
        "kl_divergence": 22.186201095581055,
        "mean_generated_length": 159.140625
    },
    {
        "step": 163,
        "reward": -2.6382904052734375,
        "gold_reward": -2.7970657348632812,
        "kl_divergence": 20.056398391723633,
        "mean_generated_length": 144.03125
    },
    {
        "step": 164,
        "reward": -2.143951416015625,
        "gold_reward": -3.0684814453125,
        "kl_divergence": 18.630983352661133,
        "mean_generated_length": 164.640625
    },
    {
        "step": 165,
        "reward": -2.030902862548828,
        "gold_reward": -2.2210464477539062,
        "kl_divergence": 15.056314468383789,
        "mean_generated_length": 141.125
    },
    {
        "step": 166,
        "reward": -1.3082923889160156,
        "gold_reward": -2.8366479873657227,
        "kl_divergence": 20.70634651184082,
        "mean_generated_length": 149.890625
    },
    {
        "step": 167,
        "reward": -1.2670669555664062,
        "gold_reward": -2.0316295623779297,
        "kl_divergence": 17.303417205810547,
        "mean_generated_length": 135.203125
    },
    {
        "step": 168,
        "reward": -0.6512908935546875,
        "gold_reward": -1.7214202880859375,
        "kl_divergence": 18.348360061645508,
        "mean_generated_length": 148.765625
    },
    {
        "step": 169,
        "reward": -1.0878639221191406,
        "gold_reward": -2.50445556640625,
        "kl_divergence": 19.65845489501953,
        "mean_generated_length": 159.140625
    },
    {
        "step": 170,
        "reward": -1.260894775390625,
        "gold_reward": -2.455219268798828,
        "kl_divergence": 18.032670974731445,
        "mean_generated_length": 142.109375
    },
    {
        "step": 171,
        "reward": -1.31719970703125,
        "gold_reward": -1.762786865234375,
        "kl_divergence": 15.650522232055664,
        "mean_generated_length": 121.015625
    },
    {
        "step": 172,
        "reward": -1.770111083984375,
        "gold_reward": -2.1383304595947266,
        "kl_divergence": 18.53889274597168,
        "mean_generated_length": 155.453125
    },
    {
        "step": 173,
        "reward": -1.38580322265625,
        "gold_reward": -1.9973297119140625,
        "kl_divergence": 16.682661056518555,
        "mean_generated_length": 135.578125
    },
    {
        "step": 174,
        "reward": -2.133026123046875,
        "gold_reward": -2.257415771484375,
        "kl_divergence": 16.584514617919922,
        "mean_generated_length": 142.203125
    },
    {
        "step": 175,
        "reward": -1.2579631805419922,
        "gold_reward": -2.1757278442382812,
        "kl_divergence": 19.27725601196289,
        "mean_generated_length": 153.796875
    },
    {
        "step": 176,
        "reward": -2.1353931427001953,
        "gold_reward": -2.6591949462890625,
        "kl_divergence": 16.957683563232422,
        "mean_generated_length": 150.8125
    },
    {
        "step": 177,
        "reward": -1.3135032653808594,
        "gold_reward": -2.5953292846679688,
        "kl_divergence": 17.561309814453125,
        "mean_generated_length": 155.828125
    },
    {
        "step": 178,
        "reward": -1.9893715381622314,
        "gold_reward": -2.7750120162963867,
        "kl_divergence": 21.164342880249023,
        "mean_generated_length": 162.25
    },
    {
        "step": 179,
        "reward": -0.9539985656738281,
        "gold_reward": -2.205169677734375,
        "kl_divergence": 15.289754867553711,
        "mean_generated_length": 140.71875
    },
    {
        "step": 180,
        "reward": -1.8149185180664062,
        "gold_reward": -2.12841796875,
        "kl_divergence": 19.02739906311035,
        "mean_generated_length": 137.625
    },
    {
        "step": 181,
        "reward": -1.4253501892089844,
        "gold_reward": -2.0047988891601562,
        "kl_divergence": 17.539541244506836,
        "mean_generated_length": 132.8125
    },
    {
        "step": 182,
        "reward": -1.4812419414520264,
        "gold_reward": -2.419342041015625,
        "kl_divergence": 16.368755340576172,
        "mean_generated_length": 134.8125
    },
    {
        "step": 183,
        "reward": -1.6847114562988281,
        "gold_reward": -2.368332862854004,
        "kl_divergence": 15.296360969543457,
        "mean_generated_length": 142.75
    },
    {
        "step": 184,
        "reward": -2.1070165634155273,
        "gold_reward": -2.6422271728515625,
        "kl_divergence": 16.459199905395508,
        "mean_generated_length": 145.96875
    },
    {
        "step": 185,
        "reward": -1.7978515625,
        "gold_reward": -2.759368896484375,
        "kl_divergence": 19.06903076171875,
        "mean_generated_length": 169.734375
    },
    {
        "step": 186,
        "reward": -1.0770263671875,
        "gold_reward": -2.3033294677734375,
        "kl_divergence": 18.005929946899414,
        "mean_generated_length": 150.34375
    },
    {
        "step": 187,
        "reward": -1.7855987548828125,
        "gold_reward": -2.688950777053833,
        "kl_divergence": 15.2971830368042,
        "mean_generated_length": 140.5625
    },
    {
        "step": 188,
        "reward": -1.9756011962890625,
        "gold_reward": -2.2376785278320312,
        "kl_divergence": 17.04982566833496,
        "mean_generated_length": 146.140625
    },
    {
        "step": 189,
        "reward": -1.5616073608398438,
        "gold_reward": -2.560760498046875,
        "kl_divergence": 17.057893753051758,
        "mean_generated_length": 157.078125
    },
    {
        "step": 190,
        "reward": -1.8270339965820312,
        "gold_reward": -2.2901268005371094,
        "kl_divergence": 17.66437339782715,
        "mean_generated_length": 145.390625
    },
    {
        "step": 191,
        "reward": -1.2393932342529297,
        "gold_reward": -2.0008201599121094,
        "kl_divergence": 18.0104923248291,
        "mean_generated_length": 139.625
    },
    {
        "step": 192,
        "reward": -1.08074951171875,
        "gold_reward": -2.049285888671875,
        "kl_divergence": 20.652618408203125,
        "mean_generated_length": 157.75
    },
    {
        "step": 193,
        "reward": -1.5082588195800781,
        "gold_reward": -1.9307441711425781,
        "kl_divergence": 20.648014068603516,
        "mean_generated_length": 150.828125
    },
    {
        "step": 194,
        "reward": -0.9726276397705078,
        "gold_reward": -2.391145706176758,
        "kl_divergence": 20.86746597290039,
        "mean_generated_length": 175.3125
    },
    {
        "step": 195,
        "reward": -1.1705322265625,
        "gold_reward": -2.496551513671875,
        "kl_divergence": 18.639163970947266,
        "mean_generated_length": 147.375
    },
    {
        "step": 196,
        "reward": -1.286346435546875,
        "gold_reward": -2.6532440185546875,
        "kl_divergence": 20.185171127319336,
        "mean_generated_length": 177.671875
    },
    {
        "step": 197,
        "reward": -1.7261419296264648,
        "gold_reward": -2.66888427734375,
        "kl_divergence": 18.871044158935547,
        "mean_generated_length": 170.78125
    },
    {
        "step": 198,
        "reward": -2.0189971923828125,
        "gold_reward": -2.1955108642578125,
        "kl_divergence": 18.466270446777344,
        "mean_generated_length": 156.421875
    },
    {
        "step": 199,
        "reward": -1.6105270385742188,
        "gold_reward": -2.2058868408203125,
        "kl_divergence": 19.514707565307617,
        "mean_generated_length": 166.90625
    },
    {
        "step": 200,
        "reward": -1.4336929321289062,
        "gold_reward": -2.0847930908203125,
        "kl_divergence": 15.996776580810547,
        "mean_generated_length": 154.359375
    },
    {
        "step": 201,
        "reward": -2.1798629760742188,
        "gold_reward": -2.64801025390625,
        "kl_divergence": 15.710214614868164,
        "mean_generated_length": 141.5
    },
    {
        "step": 202,
        "reward": -1.4006805419921875,
        "gold_reward": -2.237823486328125,
        "kl_divergence": 15.304695129394531,
        "mean_generated_length": 147.734375
    },
    {
        "step": 203,
        "reward": -1.3996562957763672,
        "gold_reward": -2.37225341796875,
        "kl_divergence": 16.375396728515625,
        "mean_generated_length": 151.078125
    },
    {
        "step": 204,
        "reward": -1.8771438598632812,
        "gold_reward": -2.5546798706054688,
        "kl_divergence": 17.163562774658203,
        "mean_generated_length": 150.71875
    },
    {
        "step": 205,
        "reward": -1.2496452331542969,
        "gold_reward": -2.3910064697265625,
        "kl_divergence": 16.22626304626465,
        "mean_generated_length": 140.375
    },
    {
        "step": 206,
        "reward": -1.4643440246582031,
        "gold_reward": -2.267791748046875,
        "kl_divergence": 17.345989227294922,
        "mean_generated_length": 143.265625
    },
    {
        "step": 207,
        "reward": -1.6120223999023438,
        "gold_reward": -2.276123046875,
        "kl_divergence": 17.87652015686035,
        "mean_generated_length": 161.53125
    },
    {
        "step": 208,
        "reward": -0.9494400024414062,
        "gold_reward": -1.927276611328125,
        "kl_divergence": 16.415449142456055,
        "mean_generated_length": 125.640625
    },
    {
        "step": 209,
        "reward": -0.8885574340820312,
        "gold_reward": -2.320068359375,
        "kl_divergence": 20.878223419189453,
        "mean_generated_length": 165.109375
    },
    {
        "step": 210,
        "reward": -1.3000783920288086,
        "gold_reward": -2.4503555297851562,
        "kl_divergence": 17.328189849853516,
        "mean_generated_length": 152.828125
    },
    {
        "step": 211,
        "reward": -1.388275146484375,
        "gold_reward": -2.1866912841796875,
        "kl_divergence": 18.790904998779297,
        "mean_generated_length": 166.109375
    },
    {
        "step": 212,
        "reward": -1.4682388305664062,
        "gold_reward": -2.6613197326660156,
        "kl_divergence": 17.78118133544922,
        "mean_generated_length": 154.265625
    },
    {
        "step": 213,
        "reward": -1.0220670700073242,
        "gold_reward": -2.2497940063476562,
        "kl_divergence": 16.483905792236328,
        "mean_generated_length": 147.5
    },
    {
        "step": 214,
        "reward": -1.2180976867675781,
        "gold_reward": -2.0814285278320312,
        "kl_divergence": 20.143218994140625,
        "mean_generated_length": 170.703125
    },
    {
        "step": 215,
        "reward": -0.8541121482849121,
        "gold_reward": -1.9466705322265625,
        "kl_divergence": 15.376447677612305,
        "mean_generated_length": 129.375
    },
    {
        "step": 216,
        "reward": -1.0785064697265625,
        "gold_reward": -1.842498779296875,
        "kl_divergence": 18.433998107910156,
        "mean_generated_length": 150.578125
    },
    {
        "step": 217,
        "reward": -1.63189697265625,
        "gold_reward": -2.103829860687256,
        "kl_divergence": 16.26260757446289,
        "mean_generated_length": 140.4375
    },
    {
        "step": 218,
        "reward": -1.348388671875,
        "gold_reward": -2.2130508422851562,
        "kl_divergence": 15.508007049560547,
        "mean_generated_length": 146.921875
    },
    {
        "step": 219,
        "reward": -1.2969131469726562,
        "gold_reward": -2.1437225341796875,
        "kl_divergence": 19.602855682373047,
        "mean_generated_length": 153.015625
    },
    {
        "step": 220,
        "reward": -1.6509424448013306,
        "gold_reward": -2.2710494995117188,
        "kl_divergence": 17.163589477539062,
        "mean_generated_length": 149.296875
    },
    {
        "step": 221,
        "reward": -1.724954605102539,
        "gold_reward": -2.3448944091796875,
        "kl_divergence": 15.46618938446045,
        "mean_generated_length": 134.328125
    },
    {
        "step": 222,
        "reward": -1.46807861328125,
        "gold_reward": -2.0202789306640625,
        "kl_divergence": 18.163532257080078,
        "mean_generated_length": 137.453125
    },
    {
        "step": 223,
        "reward": -1.414520263671875,
        "gold_reward": -2.3555679321289062,
        "kl_divergence": 16.773366928100586,
        "mean_generated_length": 159.546875
    },
    {
        "step": 224,
        "reward": -1.4456100463867188,
        "gold_reward": -2.0982513427734375,
        "kl_divergence": 16.71678352355957,
        "mean_generated_length": 146.953125
    },
    {
        "step": 225,
        "reward": -1.5488643646240234,
        "gold_reward": -2.7279396057128906,
        "kl_divergence": 17.654020309448242,
        "mean_generated_length": 158.71875
    },
    {
        "step": 226,
        "reward": -0.7559871673583984,
        "gold_reward": -1.779550552368164,
        "kl_divergence": 15.054080963134766,
        "mean_generated_length": 134.203125
    },
    {
        "step": 227,
        "reward": -0.9643630981445312,
        "gold_reward": -2.4890594482421875,
        "kl_divergence": 16.108505249023438,
        "mean_generated_length": 142.59375
    },
    {
        "step": 228,
        "reward": -0.802660346031189,
        "gold_reward": -2.602132797241211,
        "kl_divergence": 18.040151596069336,
        "mean_generated_length": 147.546875
    },
    {
        "step": 229,
        "reward": -1.2740364074707031,
        "gold_reward": -2.2792015075683594,
        "kl_divergence": 17.312227249145508,
        "mean_generated_length": 140.4375
    },
    {
        "step": 230,
        "reward": -1.28643798828125,
        "gold_reward": -2.4160537719726562,
        "kl_divergence": 17.989837646484375,
        "mean_generated_length": 144.84375
    },
    {
        "step": 231,
        "reward": -1.263528823852539,
        "gold_reward": -1.9645271301269531,
        "kl_divergence": 18.43136978149414,
        "mean_generated_length": 150.828125
    },
    {
        "step": 232,
        "reward": -1.64324951171875,
        "gold_reward": -2.6764984130859375,
        "kl_divergence": 18.588693618774414,
        "mean_generated_length": 166.984375
    },
    {
        "step": 233,
        "reward": -0.8823471069335938,
        "gold_reward": -2.1973609924316406,
        "kl_divergence": 18.263444900512695,
        "mean_generated_length": 155.59375
    },
    {
        "step": 234,
        "reward": -1.9218635559082031,
        "gold_reward": -2.412872314453125,
        "kl_divergence": 16.570409774780273,
        "mean_generated_length": 150.140625
    },
    {
        "step": 235,
        "reward": -1.2019424438476562,
        "gold_reward": -2.309774398803711,
        "kl_divergence": 18.928945541381836,
        "mean_generated_length": 165.421875
    },
    {
        "step": 236,
        "reward": -0.10101318359375,
        "gold_reward": -1.93896484375,
        "kl_divergence": 18.551877975463867,
        "mean_generated_length": 154.375
    }
]