[
    {
        "step": 0,
        "reward": -3.7819061279296875,
        "gold_reward": -3.13934326171875,
        "kl_divergence": 0.0,
        "mean_generated_length": 133.28125
    },
    {
        "step": 1,
        "reward": -3.4259490966796875,
        "gold_reward": -2.917926788330078,
        "kl_divergence": 0.0,
        "mean_generated_length": 127.515625
    },
    {
        "step": 2,
        "reward": -3.3758621215820312,
        "gold_reward": -2.925752639770508,
        "kl_divergence": -0.020905308425426483,
        "mean_generated_length": 86.6875
    },
    {
        "step": 3,
        "reward": -4.009613037109375,
        "gold_reward": -3.4702911376953125,
        "kl_divergence": -0.008562659844756126,
        "mean_generated_length": 134.03125
    },
    {
        "step": 4,
        "reward": -3.0318145751953125,
        "gold_reward": -3.080810546875,
        "kl_divergence": -0.047395698726177216,
        "mean_generated_length": 143.46875
    },
    {
        "step": 5,
        "reward": -4.3480682373046875,
        "gold_reward": -3.2733917236328125,
        "kl_divergence": -0.021612979471683502,
        "mean_generated_length": 115.953125
    },
    {
        "step": 6,
        "reward": -3.2512054443359375,
        "gold_reward": -3.1492538452148438,
        "kl_divergence": 0.2161986082792282,
        "mean_generated_length": 140.890625
    },
    {
        "step": 7,
        "reward": -3.6445369720458984,
        "gold_reward": -2.7218780517578125,
        "kl_divergence": 0.1205640509724617,
        "mean_generated_length": 124.015625
    },
    {
        "step": 8,
        "reward": -2.2419586181640625,
        "gold_reward": -2.85296630859375,
        "kl_divergence": 0.7911969423294067,
        "mean_generated_length": 148.25
    },
    {
        "step": 9,
        "reward": -2.41082763671875,
        "gold_reward": -2.382080078125,
        "kl_divergence": 1.31269109249115,
        "mean_generated_length": 136.390625
    },
    {
        "step": 10,
        "reward": -1.5515899658203125,
        "gold_reward": -1.8934173583984375,
        "kl_divergence": 2.100008249282837,
        "mean_generated_length": 158.578125
    },
    {
        "step": 11,
        "reward": -2.766021728515625,
        "gold_reward": -3.0215301513671875,
        "kl_divergence": 3.385822057723999,
        "mean_generated_length": 164.21875
    },
    {
        "step": 12,
        "reward": -2.186366081237793,
        "gold_reward": -2.2952499389648438,
        "kl_divergence": 2.7319962978363037,
        "mean_generated_length": 164.390625
    },
    {
        "step": 13,
        "reward": -2.569244384765625,
        "gold_reward": -1.9475021362304688,
        "kl_divergence": 4.257086753845215,
        "mean_generated_length": 138.34375
    },
    {
        "step": 14,
        "reward": -2.5289764404296875,
        "gold_reward": -2.2180404663085938,
        "kl_divergence": 5.36950159072876,
        "mean_generated_length": 170.53125
    },
    {
        "step": 15,
        "reward": -2.1097755432128906,
        "gold_reward": -2.3932266235351562,
        "kl_divergence": 7.399336814880371,
        "mean_generated_length": 181.546875
    },
    {
        "step": 16,
        "reward": -2.2606887817382812,
        "gold_reward": -2.4676361083984375,
        "kl_divergence": 7.849730014801025,
        "mean_generated_length": 190.0
    },
    {
        "step": 17,
        "reward": -1.5785140991210938,
        "gold_reward": -2.299468994140625,
        "kl_divergence": 9.717005729675293,
        "mean_generated_length": 216.6875
    },
    {
        "step": 18,
        "reward": -2.6196422576904297,
        "gold_reward": -2.87542724609375,
        "kl_divergence": 10.628277778625488,
        "mean_generated_length": 220.015625
    },
    {
        "step": 19,
        "reward": -1.958261489868164,
        "gold_reward": -2.437896728515625,
        "kl_divergence": 14.230387687683105,
        "mean_generated_length": 241.1875
    },
    {
        "step": 20,
        "reward": -2.5076980590820312,
        "gold_reward": -2.8982925415039062,
        "kl_divergence": 17.713455200195312,
        "mean_generated_length": 253.890625
    },
    {
        "step": 21,
        "reward": -1.0965461730957031,
        "gold_reward": -2.4472198486328125,
        "kl_divergence": 20.277597427368164,
        "mean_generated_length": 260.59375
    },
    {
        "step": 22,
        "reward": -1.412384033203125,
        "gold_reward": -2.380401611328125,
        "kl_divergence": 19.71249771118164,
        "mean_generated_length": 246.984375
    },
    {
        "step": 23,
        "reward": -1.4933586120605469,
        "gold_reward": -2.3828277587890625,
        "kl_divergence": 19.720468521118164,
        "mean_generated_length": 246.203125
    },
    {
        "step": 24,
        "reward": -1.7266589403152466,
        "gold_reward": -2.633544921875,
        "kl_divergence": 22.757667541503906,
        "mean_generated_length": 232.5625
    },
    {
        "step": 25,
        "reward": -2.0128326416015625,
        "gold_reward": -2.6702423095703125,
        "kl_divergence": 27.892791748046875,
        "mean_generated_length": 274.015625
    },
    {
        "step": 26,
        "reward": -2.3368091583251953,
        "gold_reward": -2.614410400390625,
        "kl_divergence": 25.74697494506836,
        "mean_generated_length": 249.734375
    },
    {
        "step": 27,
        "reward": -2.061260223388672,
        "gold_reward": -2.9620285034179688,
        "kl_divergence": 25.061735153198242,
        "mean_generated_length": 275.28125
    },
    {
        "step": 28,
        "reward": -1.9746313095092773,
        "gold_reward": -2.58648681640625,
        "kl_divergence": 31.329326629638672,
        "mean_generated_length": 262.890625
    },
    {
        "step": 29,
        "reward": -1.2690963745117188,
        "gold_reward": -2.4862709045410156,
        "kl_divergence": 33.966590881347656,
        "mean_generated_length": 293.46875
    },
    {
        "step": 30,
        "reward": -2.3259124755859375,
        "gold_reward": -2.5691423416137695,
        "kl_divergence": 39.83039093017578,
        "mean_generated_length": 314.046875
    },
    {
        "step": 31,
        "reward": -1.5893516540527344,
        "gold_reward": -2.714691162109375,
        "kl_divergence": 36.081363677978516,
        "mean_generated_length": 300.65625
    },
    {
        "step": 32,
        "reward": -1.8038692474365234,
        "gold_reward": -2.443330764770508,
        "kl_divergence": 37.23667907714844,
        "mean_generated_length": 277.296875
    },
    {
        "step": 33,
        "reward": -1.9723052978515625,
        "gold_reward": -2.5556640625,
        "kl_divergence": 37.70909118652344,
        "mean_generated_length": 296.28125
    },
    {
        "step": 34,
        "reward": -1.558694839477539,
        "gold_reward": -2.3155975341796875,
        "kl_divergence": 38.773990631103516,
        "mean_generated_length": 302.796875
    },
    {
        "step": 35,
        "reward": -2.1547470092773438,
        "gold_reward": -2.0746021270751953,
        "kl_divergence": 43.48390197753906,
        "mean_generated_length": 319.40625
    },
    {
        "step": 36,
        "reward": -1.6438827514648438,
        "gold_reward": -3.0829200744628906,
        "kl_divergence": 38.699951171875,
        "mean_generated_length": 306.90625
    },
    {
        "step": 37,
        "reward": -1.3291015625,
        "gold_reward": -2.563323974609375,
        "kl_divergence": 40.133426666259766,
        "mean_generated_length": 274.28125
    },
    {
        "step": 38,
        "reward": -1.049652099609375,
        "gold_reward": -2.8554916381835938,
        "kl_divergence": 52.00505828857422,
        "mean_generated_length": 312.921875
    },
    {
        "step": 39,
        "reward": -1.70001220703125,
        "gold_reward": -2.48065185546875,
        "kl_divergence": 44.72224044799805,
        "mean_generated_length": 295.609375
    },
    {
        "step": 40,
        "reward": -1.7186331748962402,
        "gold_reward": -2.2768101692199707,
        "kl_divergence": 41.89999771118164,
        "mean_generated_length": 264.1875
    },
    {
        "step": 41,
        "reward": -1.2732086181640625,
        "gold_reward": -2.39324951171875,
        "kl_divergence": 32.406463623046875,
        "mean_generated_length": 237.984375
    },
    {
        "step": 42,
        "reward": -1.3873481750488281,
        "gold_reward": -2.3696136474609375,
        "kl_divergence": 36.177886962890625,
        "mean_generated_length": 219.28125
    },
    {
        "step": 43,
        "reward": -1.1435623168945312,
        "gold_reward": -2.3283920288085938,
        "kl_divergence": 33.50278854370117,
        "mean_generated_length": 208.703125
    },
    {
        "step": 44,
        "reward": -1.3732833862304688,
        "gold_reward": -2.4194564819335938,
        "kl_divergence": 31.236675262451172,
        "mean_generated_length": 198.171875
    },
    {
        "step": 45,
        "reward": -1.6128578186035156,
        "gold_reward": -2.553396224975586,
        "kl_divergence": 30.341873168945312,
        "mean_generated_length": 190.625
    },
    {
        "step": 46,
        "reward": -2.111652374267578,
        "gold_reward": -2.917022705078125,
        "kl_divergence": 34.300838470458984,
        "mean_generated_length": 188.921875
    },
    {
        "step": 47,
        "reward": -1.4697723388671875,
        "gold_reward": -2.745513916015625,
        "kl_divergence": 46.24647521972656,
        "mean_generated_length": 217.09375
    },
    {
        "step": 48,
        "reward": -1.5074615478515625,
        "gold_reward": -2.5324554443359375,
        "kl_divergence": 56.62043762207031,
        "mean_generated_length": 261.53125
    },
    {
        "step": 49,
        "reward": -1.3133087158203125,
        "gold_reward": -2.566009521484375,
        "kl_divergence": 70.59806823730469,
        "mean_generated_length": 300.5625
    },
    {
        "step": 50,
        "reward": -1.5853080749511719,
        "gold_reward": -2.4727783203125,
        "kl_divergence": 77.03129577636719,
        "mean_generated_length": 303.21875
    },
    {
        "step": 51,
        "reward": -1.3329315185546875,
        "gold_reward": -3.510223388671875,
        "kl_divergence": 86.58097839355469,
        "mean_generated_length": 321.390625
    },
    {
        "step": 52,
        "reward": -1.9344291687011719,
        "gold_reward": -3.57720947265625,
        "kl_divergence": 76.93954467773438,
        "mean_generated_length": 297.703125
    },
    {
        "step": 53,
        "reward": -1.73382568359375,
        "gold_reward": -3.41900634765625,
        "kl_divergence": 84.85421752929688,
        "mean_generated_length": 342.484375
    },
    {
        "step": 54,
        "reward": -2.5753374099731445,
        "gold_reward": -3.73626708984375,
        "kl_divergence": 89.59300994873047,
        "mean_generated_length": 310.5
    },
    {
        "step": 55,
        "reward": -2.68017578125,
        "gold_reward": -3.9559326171875,
        "kl_divergence": 98.19766998291016,
        "mean_generated_length": 315.375
    },
    {
        "step": 56,
        "reward": -2.668182373046875,
        "gold_reward": -3.6829490661621094,
        "kl_divergence": 95.62174224853516,
        "mean_generated_length": 309.125
    },
    {
        "step": 57,
        "reward": -1.8945999145507812,
        "gold_reward": -3.5259475708007812,
        "kl_divergence": 110.38279724121094,
        "mean_generated_length": 313.140625
    },
    {
        "step": 58,
        "reward": -2.8394393920898438,
        "gold_reward": -3.480926513671875,
        "kl_divergence": 109.13484191894531,
        "mean_generated_length": 312.25
    },
    {
        "step": 59,
        "reward": -1.7026290893554688,
        "gold_reward": -3.506908416748047,
        "kl_divergence": 113.44620513916016,
        "mean_generated_length": 314.53125
    },
    {
        "step": 60,
        "reward": -2.3627471923828125,
        "gold_reward": -3.5225830078125,
        "kl_divergence": 118.38542175292969,
        "mean_generated_length": 310.5
    },
    {
        "step": 61,
        "reward": -0.42777442932128906,
        "gold_reward": -3.1149749755859375,
        "kl_divergence": 128.896484375,
        "mean_generated_length": 324.75
    },
    {
        "step": 62,
        "reward": -0.7825698852539062,
        "gold_reward": -2.99725341796875,
        "kl_divergence": 120.60751342773438,
        "mean_generated_length": 326.25
    },
    {
        "step": 63,
        "reward": -0.38976478576660156,
        "gold_reward": -2.6241931915283203,
        "kl_divergence": 103.30377960205078,
        "mean_generated_length": 272.6875
    },
    {
        "step": 64,
        "reward": -0.2297821044921875,
        "gold_reward": -2.0828094482421875,
        "kl_divergence": 127.36524963378906,
        "mean_generated_length": 315.125
    },
    {
        "step": 65,
        "reward": -0.22290420532226562,
        "gold_reward": -3.0850486755371094,
        "kl_divergence": 104.41731262207031,
        "mean_generated_length": 302.40625
    },
    {
        "step": 66,
        "reward": -0.23904800415039062,
        "gold_reward": -2.54150390625,
        "kl_divergence": 112.89959716796875,
        "mean_generated_length": 307.625
    },
    {
        "step": 67,
        "reward": -0.09624099731445312,
        "gold_reward": -2.6630892753601074,
        "kl_divergence": 120.51789855957031,
        "mean_generated_length": 324.9375
    },
    {
        "step": 68,
        "reward": 0.7519798278808594,
        "gold_reward": -1.94189453125,
        "kl_divergence": 131.58963012695312,
        "mean_generated_length": 326.875
    },
    {
        "step": 69,
        "reward": -0.08282756805419922,
        "gold_reward": -2.666156768798828,
        "kl_divergence": 129.8408203125,
        "mean_generated_length": 336.8125
    },
    {
        "step": 70,
        "reward": 0.12827682495117188,
        "gold_reward": -2.78521728515625,
        "kl_divergence": 116.05213928222656,
        "mean_generated_length": 311.265625
    },
    {
        "step": 71,
        "reward": -0.7413597106933594,
        "gold_reward": -2.879608154296875,
        "kl_divergence": 127.00725555419922,
        "mean_generated_length": 317.65625
    },
    {
        "step": 72,
        "reward": -0.0540008544921875,
        "gold_reward": -2.622283935546875,
        "kl_divergence": 123.96282196044922,
        "mean_generated_length": 310.90625
    },
    {
        "step": 73,
        "reward": -1.2664432525634766,
        "gold_reward": -2.751007080078125,
        "kl_divergence": 107.30958557128906,
        "mean_generated_length": 277.890625
    },
    {
        "step": 74,
        "reward": -0.4630775451660156,
        "gold_reward": -3.18896484375,
        "kl_divergence": 106.1769027709961,
        "mean_generated_length": 282.703125
    },
    {
        "step": 75,
        "reward": -0.04845428466796875,
        "gold_reward": -2.783905029296875,
        "kl_divergence": 117.62135314941406,
        "mean_generated_length": 302.734375
    },
    {
        "step": 76,
        "reward": 0.009273529052734375,
        "gold_reward": -2.977203369140625,
        "kl_divergence": 113.29976654052734,
        "mean_generated_length": 295.609375
    },
    {
        "step": 77,
        "reward": 0.01190185546875,
        "gold_reward": -2.442233085632324,
        "kl_divergence": 129.61148071289062,
        "mean_generated_length": 349.125
    },
    {
        "step": 78,
        "reward": 0.245849609375,
        "gold_reward": -2.69537353515625,
        "kl_divergence": 149.96649169921875,
        "mean_generated_length": 382.625
    },
    {
        "step": 79,
        "reward": -0.49484705924987793,
        "gold_reward": -2.94879150390625,
        "kl_divergence": 121.7814712524414,
        "mean_generated_length": 301.359375
    },
    {
        "step": 80,
        "reward": -0.1394824981689453,
        "gold_reward": -2.6210174560546875,
        "kl_divergence": 124.62663269042969,
        "mean_generated_length": 305.609375
    },
    {
        "step": 81,
        "reward": -0.28951263427734375,
        "gold_reward": -2.453948974609375,
        "kl_divergence": 127.77508544921875,
        "mean_generated_length": 313.625
    },
    {
        "step": 82,
        "reward": -0.39798736572265625,
        "gold_reward": -2.8909788131713867,
        "kl_divergence": 133.00897216796875,
        "mean_generated_length": 321.125
    },
    {
        "step": 83,
        "reward": -0.5831451416015625,
        "gold_reward": -2.6796188354492188,
        "kl_divergence": 134.13552856445312,
        "mean_generated_length": 303.625
    },
    {
        "step": 84,
        "reward": -0.4977455139160156,
        "gold_reward": -3.3342723846435547,
        "kl_divergence": 128.55235290527344,
        "mean_generated_length": 306.375
    },
    {
        "step": 85,
        "reward": 0.10976028442382812,
        "gold_reward": -2.9727630615234375,
        "kl_divergence": 131.657958984375,
        "mean_generated_length": 316.375
    },
    {
        "step": 86,
        "reward": 0.045291900634765625,
        "gold_reward": -2.5914039611816406,
        "kl_divergence": 131.4830780029297,
        "mean_generated_length": 310.875
    },
    {
        "step": 87,
        "reward": 0.4622383117675781,
        "gold_reward": -3.18218994140625,
        "kl_divergence": 131.28475952148438,
        "mean_generated_length": 309.5
    },
    {
        "step": 88,
        "reward": 0.4600677490234375,
        "gold_reward": -2.4132232666015625,
        "kl_divergence": 152.54476928710938,
        "mean_generated_length": 343.75
    },
    {
        "step": 89,
        "reward": -0.18401527404785156,
        "gold_reward": -2.3174407482147217,
        "kl_divergence": 143.468505859375,
        "mean_generated_length": 313.46875
    },
    {
        "step": 90,
        "reward": 0.23489761352539062,
        "gold_reward": -2.8646240234375,
        "kl_divergence": 138.83889770507812,
        "mean_generated_length": 306.75
    },
    {
        "step": 91,
        "reward": 0.36168527603149414,
        "gold_reward": -2.6618118286132812,
        "kl_divergence": 134.339599609375,
        "mean_generated_length": 300.5
    },
    {
        "step": 92,
        "reward": 0.7018833160400391,
        "gold_reward": -2.374652862548828,
        "kl_divergence": 141.6238555908203,
        "mean_generated_length": 296.25
    },
    {
        "step": 93,
        "reward": 0.07955169677734375,
        "gold_reward": -2.855255126953125,
        "kl_divergence": 153.5548858642578,
        "mean_generated_length": 335.25
    },
    {
        "step": 94,
        "reward": -0.5021781921386719,
        "gold_reward": -2.697463035583496,
        "kl_divergence": 158.48500061035156,
        "mean_generated_length": 335.875
    },
    {
        "step": 95,
        "reward": -0.4204282760620117,
        "gold_reward": -2.90838623046875,
        "kl_divergence": 148.15679931640625,
        "mean_generated_length": 323.625
    },
    {
        "step": 96,
        "reward": 0.4286785125732422,
        "gold_reward": -2.6681976318359375,
        "kl_divergence": 166.98843383789062,
        "mean_generated_length": 344.375
    },
    {
        "step": 97,
        "reward": -0.0001277923583984375,
        "gold_reward": -3.1349639892578125,
        "kl_divergence": 136.23422241210938,
        "mean_generated_length": 300.25
    },
    {
        "step": 98,
        "reward": 1.025848388671875,
        "gold_reward": -2.7700119018554688,
        "kl_divergence": 155.69256591796875,
        "mean_generated_length": 328.875
    },
    {
        "step": 99,
        "reward": -0.06300544738769531,
        "gold_reward": -3.211543083190918,
        "kl_divergence": 154.72274780273438,
        "mean_generated_length": 312.84375
    },
    {
        "step": 100,
        "reward": 0.4976682662963867,
        "gold_reward": -2.9110374450683594,
        "kl_divergence": 161.9906768798828,
        "mean_generated_length": 327.5
    },
    {
        "step": 101,
        "reward": -0.22469329833984375,
        "gold_reward": -2.7340087890625,
        "kl_divergence": 164.036865234375,
        "mean_generated_length": 323.875
    },
    {
        "step": 102,
        "reward": 0.816925048828125,
        "gold_reward": -2.6075286865234375,
        "kl_divergence": 166.91871643066406,
        "mean_generated_length": 308.234375
    },
    {
        "step": 103,
        "reward": 0.4946403503417969,
        "gold_reward": -2.66357421875,
        "kl_divergence": 157.85055541992188,
        "mean_generated_length": 300.25
    },
    {
        "step": 104,
        "reward": 0.38262271881103516,
        "gold_reward": -2.848724365234375,
        "kl_divergence": 170.25018310546875,
        "mean_generated_length": 323.125
    },
    {
        "step": 105,
        "reward": 0.5129165649414062,
        "gold_reward": -3.0255126953125,
        "kl_divergence": 166.42396545410156,
        "mean_generated_length": 298.546875
    },
    {
        "step": 106,
        "reward": 0.8070888519287109,
        "gold_reward": -3.1757049560546875,
        "kl_divergence": 178.0335235595703,
        "mean_generated_length": 331.125
    },
    {
        "step": 107,
        "reward": 0.5226325988769531,
        "gold_reward": -3.1060791015625,
        "kl_divergence": 168.10516357421875,
        "mean_generated_length": 287.5
    },
    {
        "step": 108,
        "reward": 0.5873184204101562,
        "gold_reward": -3.0358829498291016,
        "kl_divergence": 195.26239013671875,
        "mean_generated_length": 324.375
    },
    {
        "step": 109,
        "reward": 0.4775657653808594,
        "gold_reward": -2.9759292602539062,
        "kl_divergence": 207.00827026367188,
        "mean_generated_length": 331.0
    },
    {
        "step": 110,
        "reward": 0.4078369140625,
        "gold_reward": -3.27386474609375,
        "kl_divergence": 180.20797729492188,
        "mean_generated_length": 306.453125
    },
    {
        "step": 111,
        "reward": 0.8971099853515625,
        "gold_reward": -2.9889602661132812,
        "kl_divergence": 195.27723693847656,
        "mean_generated_length": 299.875
    },
    {
        "step": 112,
        "reward": 0.8767032623291016,
        "gold_reward": -3.0581741333007812,
        "kl_divergence": 203.23370361328125,
        "mean_generated_length": 308.90625
    },
    {
        "step": 113,
        "reward": 0.9709396362304688,
        "gold_reward": -2.9345703125,
        "kl_divergence": 218.19741821289062,
        "mean_generated_length": 336.5
    },
    {
        "step": 114,
        "reward": 0.07151985168457031,
        "gold_reward": -3.526275634765625,
        "kl_divergence": 255.03656005859375,
        "mean_generated_length": 357.75
    },
    {
        "step": 115,
        "reward": 0.5786895751953125,
        "gold_reward": -3.7412567138671875,
        "kl_divergence": 215.2198028564453,
        "mean_generated_length": 320.5
    },
    {
        "step": 116,
        "reward": 0.614654541015625,
        "gold_reward": -3.41729736328125,
        "kl_divergence": 224.52377319335938,
        "mean_generated_length": 300.25
    },
    {
        "step": 117,
        "reward": 0.4507293701171875,
        "gold_reward": -3.5720062255859375,
        "kl_divergence": 237.91610717773438,
        "mean_generated_length": 328.375
    },
    {
        "step": 118,
        "reward": 1.3125600814819336,
        "gold_reward": -3.7000732421875,
        "kl_divergence": 258.11505126953125,
        "mean_generated_length": 326.0
    },
    {
        "step": 119,
        "reward": 0.6890144348144531,
        "gold_reward": -3.151092529296875,
        "kl_divergence": 268.1787414550781,
        "mean_generated_length": 331.375
    },
    {
        "step": 120,
        "reward": 0.40799617767333984,
        "gold_reward": -3.285968780517578,
        "kl_divergence": 284.2946472167969,
        "mean_generated_length": 351.75
    },
    {
        "step": 121,
        "reward": 1.2600250244140625,
        "gold_reward": -3.3295745849609375,
        "kl_divergence": 249.36825561523438,
        "mean_generated_length": 303.5
    },
    {
        "step": 122,
        "reward": 1.0965805053710938,
        "gold_reward": -3.5019378662109375,
        "kl_divergence": 298.4870300292969,
        "mean_generated_length": 346.125
    },
    {
        "step": 123,
        "reward": 0.6525917053222656,
        "gold_reward": -3.4796295166015625,
        "kl_divergence": 281.4522705078125,
        "mean_generated_length": 307.75
    },
    {
        "step": 124,
        "reward": 0.7279610633850098,
        "gold_reward": -3.9260482788085938,
        "kl_divergence": 262.95623779296875,
        "mean_generated_length": 290.375
    },
    {
        "step": 125,
        "reward": 1.288137435913086,
        "gold_reward": -3.0698471069335938,
        "kl_divergence": 296.7201232910156,
        "mean_generated_length": 323.625
    },
    {
        "step": 126,
        "reward": 1.2317981719970703,
        "gold_reward": -3.480865001678467,
        "kl_divergence": 267.7612609863281,
        "mean_generated_length": 300.0
    },
    {
        "step": 127,
        "reward": 0.9190654754638672,
        "gold_reward": -3.1584243774414062,
        "kl_divergence": 317.8988952636719,
        "mean_generated_length": 334.25
    },
    {
        "step": 128,
        "reward": 1.3662567138671875,
        "gold_reward": -3.3451080322265625,
        "kl_divergence": 309.0972595214844,
        "mean_generated_length": 338.5
    },
    {
        "step": 129,
        "reward": 1.38525390625,
        "gold_reward": -3.107067108154297,
        "kl_divergence": 292.4318542480469,
        "mean_generated_length": 311.125
    },
    {
        "step": 130,
        "reward": 1.3604917526245117,
        "gold_reward": -3.430023193359375,
        "kl_divergence": 289.6372375488281,
        "mean_generated_length": 321.75
    },
    {
        "step": 131,
        "reward": 1.1608409881591797,
        "gold_reward": -3.630523681640625,
        "kl_divergence": 263.47998046875,
        "mean_generated_length": 298.875
    },
    {
        "step": 132,
        "reward": 1.2034645080566406,
        "gold_reward": -3.472900390625,
        "kl_divergence": 307.33856201171875,
        "mean_generated_length": 347.0
    },
    {
        "step": 133,
        "reward": 1.3766961097717285,
        "gold_reward": -3.6004638671875,
        "kl_divergence": 270.8825378417969,
        "mean_generated_length": 310.5
    },
    {
        "step": 134,
        "reward": 2.261920928955078,
        "gold_reward": -3.1890487670898438,
        "kl_divergence": 271.87591552734375,
        "mean_generated_length": 315.375
    },
    {
        "step": 135,
        "reward": 1.7677078247070312,
        "gold_reward": -3.3105316162109375,
        "kl_divergence": 260.7760314941406,
        "mean_generated_length": 309.125
    },
    {
        "step": 136,
        "reward": 1.8235318660736084,
        "gold_reward": -2.9701995849609375,
        "kl_divergence": 265.3398132324219,
        "mean_generated_length": 322.0
    },
    {
        "step": 137,
        "reward": 1.621957778930664,
        "gold_reward": -2.9705657958984375,
        "kl_divergence": 250.6417236328125,
        "mean_generated_length": 312.25
    },
    {
        "step": 138,
        "reward": 2.157536506652832,
        "gold_reward": -2.98162841796875,
        "kl_divergence": 254.3010711669922,
        "mean_generated_length": 317.625
    },
    {
        "step": 139,
        "reward": 1.6294136047363281,
        "gold_reward": -2.8744049072265625,
        "kl_divergence": 255.30203247070312,
        "mean_generated_length": 310.5
    },
    {
        "step": 140,
        "reward": 1.7439460754394531,
        "gold_reward": -3.44873046875,
        "kl_divergence": 258.32562255859375,
        "mean_generated_length": 324.75
    },
    {
        "step": 141,
        "reward": 1.6145000457763672,
        "gold_reward": -3.2439708709716797,
        "kl_divergence": 246.3470001220703,
        "mean_generated_length": 326.25
    },
    {
        "step": 142,
        "reward": 1.0288772583007812,
        "gold_reward": -3.446075439453125,
        "kl_divergence": 210.43948364257812,
        "mean_generated_length": 276.125
    },
    {
        "step": 143,
        "reward": 2.0661473274230957,
        "gold_reward": -2.8650364875793457,
        "kl_divergence": 241.29766845703125,
        "mean_generated_length": 315.125
    },
    {
        "step": 144,
        "reward": 1.4669914245605469,
        "gold_reward": -3.38299560546875,
        "kl_divergence": 229.85008239746094,
        "mean_generated_length": 305.375
    },
    {
        "step": 145,
        "reward": 2.2005958557128906,
        "gold_reward": -2.7306671142578125,
        "kl_divergence": 228.67465209960938,
        "mean_generated_length": 307.625
    },
    {
        "step": 146,
        "reward": 1.7142715454101562,
        "gold_reward": -3.172607421875,
        "kl_divergence": 239.64688110351562,
        "mean_generated_length": 328.0
    },
    {
        "step": 147,
        "reward": 2.2995457649230957,
        "gold_reward": -2.6523265838623047,
        "kl_divergence": 248.2651824951172,
        "mean_generated_length": 326.875
    },
    {
        "step": 148,
        "reward": 1.88995361328125,
        "gold_reward": -3.115428924560547,
        "kl_divergence": 251.50537109375,
        "mean_generated_length": 337.375
    },
    {
        "step": 149,
        "reward": 1.9867095947265625,
        "gold_reward": -3.014087677001953,
        "kl_divergence": 225.0680694580078,
        "mean_generated_length": 312.75
    },
    {
        "step": 150,
        "reward": 1.8273544311523438,
        "gold_reward": -3.1415863037109375,
        "kl_divergence": 245.1300048828125,
        "mean_generated_length": 328.5
    },
    {
        "step": 151,
        "reward": 2.346306562423706,
        "gold_reward": -3.1399307250976562,
        "kl_divergence": 233.20767211914062,
        "mean_generated_length": 314.625
    },
    {
        "step": 152,
        "reward": 0.9227714538574219,
        "gold_reward": -3.3661956787109375,
        "kl_divergence": 204.39202880859375,
        "mean_generated_length": 284.25
    },
    {
        "step": 153,
        "reward": 1.307459831237793,
        "gold_reward": -3.545135498046875,
        "kl_divergence": 203.25697326660156,
        "mean_generated_length": 291.0
    },
    {
        "step": 154,
        "reward": 1.9141387939453125,
        "gold_reward": -3.383819580078125,
        "kl_divergence": 229.49798583984375,
        "mean_generated_length": 321.25
    },
    {
        "step": 155,
        "reward": 1.8413314819335938,
        "gold_reward": -3.3176517486572266,
        "kl_divergence": 227.1392822265625,
        "mean_generated_length": 314.5
    },
    {
        "step": 156,
        "reward": 2.3944931030273438,
        "gold_reward": -3.0893445014953613,
        "kl_divergence": 257.07574462890625,
        "mean_generated_length": 349.125
    },
    {
        "step": 157,
        "reward": 1.200927734375,
        "gold_reward": -3.06591796875,
        "kl_divergence": 304.894287109375,
        "mean_generated_length": 398.375
    },
    {
        "step": 158,
        "reward": 1.6659164428710938,
        "gold_reward": -3.3150177001953125,
        "kl_divergence": 239.61138916015625,
        "mean_generated_length": 311.25
    },
    {
        "step": 159,
        "reward": 1.8787078857421875,
        "gold_reward": -3.035430908203125,
        "kl_divergence": 235.42059326171875,
        "mean_generated_length": 314.25
    },
    {
        "step": 160,
        "reward": 2.3318939208984375,
        "gold_reward": -3.04595947265625,
        "kl_divergence": 242.73924255371094,
        "mean_generated_length": 313.625
    },
    {
        "step": 161,
        "reward": 2.3290557861328125,
        "gold_reward": -3.395904541015625,
        "kl_divergence": 252.54591369628906,
        "mean_generated_length": 321.125
    },
    {
        "step": 162,
        "reward": 2.015777587890625,
        "gold_reward": -3.44512939453125,
        "kl_divergence": 241.48626708984375,
        "mean_generated_length": 303.625
    },
    {
        "step": 163,
        "reward": 1.4699935913085938,
        "gold_reward": -3.3974151611328125,
        "kl_divergence": 234.01141357421875,
        "mean_generated_length": 306.375
    },
    {
        "step": 164,
        "reward": 2.0998220443725586,
        "gold_reward": -3.582061767578125,
        "kl_divergence": 248.6400604248047,
        "mean_generated_length": 316.375
    },
    {
        "step": 165,
        "reward": 2.028656005859375,
        "gold_reward": -3.2575531005859375,
        "kl_divergence": 241.9441680908203,
        "mean_generated_length": 310.875
    },
    {
        "step": 166,
        "reward": 2.25146484375,
        "gold_reward": -3.5630111694335938,
        "kl_divergence": 246.06109619140625,
        "mean_generated_length": 309.5
    },
    {
        "step": 167,
        "reward": 2.67242431640625,
        "gold_reward": -3.3051376342773438,
        "kl_divergence": 283.6163330078125,
        "mean_generated_length": 343.75
    },
    {
        "step": 168,
        "reward": 2.198139190673828,
        "gold_reward": -3.090057373046875,
        "kl_divergence": 258.5780029296875,
        "mean_generated_length": 317.0
    },
    {
        "step": 169,
        "reward": 2.0329818725585938,
        "gold_reward": -3.3957443237304688,
        "kl_divergence": 252.62794494628906,
        "mean_generated_length": 306.75
    },
    {
        "step": 170,
        "reward": 2.5371785163879395,
        "gold_reward": -3.449493408203125,
        "kl_divergence": 248.48837280273438,
        "mean_generated_length": 302.375
    },
    {
        "step": 171,
        "reward": 1.700164794921875,
        "gold_reward": -2.70343017578125,
        "kl_divergence": 252.47544860839844,
        "mean_generated_length": 296.25
    },
    {
        "step": 172,
        "reward": 1.8998489379882812,
        "gold_reward": -3.277099609375,
        "kl_divergence": 277.9421081542969,
        "mean_generated_length": 335.25
    },
    {
        "step": 173,
        "reward": 2.2802696228027344,
        "gold_reward": -3.324798583984375,
        "kl_divergence": 276.3580627441406,
        "mean_generated_length": 335.875
    },
    {
        "step": 174,
        "reward": 1.5899085998535156,
        "gold_reward": -3.2659759521484375,
        "kl_divergence": 259.162353515625,
        "mean_generated_length": 323.625
    },
    {
        "step": 175,
        "reward": 2.9222869873046875,
        "gold_reward": -3.230682373046875,
        "kl_divergence": 284.2518005371094,
        "mean_generated_length": 344.375
    },
    {
        "step": 176,
        "reward": 1.6358528137207031,
        "gold_reward": -3.6988143920898438,
        "kl_divergence": 243.87060546875,
        "mean_generated_length": 300.25
    },
    {
        "step": 177,
        "reward": 2.927886962890625,
        "gold_reward": -3.4473876953125,
        "kl_divergence": 278.4732971191406,
        "mean_generated_length": 328.875
    },
    {
        "step": 178,
        "reward": 2.2983779907226562,
        "gold_reward": -3.59307861328125,
        "kl_divergence": 265.5600891113281,
        "mean_generated_length": 316.625
    },
    {
        "step": 179,
        "reward": 2.561676025390625,
        "gold_reward": -3.3011856079101562,
        "kl_divergence": 270.43743896484375,
        "mean_generated_length": 327.5
    },
    {
        "step": 180,
        "reward": 3.016817092895508,
        "gold_reward": -3.0522217750549316,
        "kl_divergence": 264.3910827636719,
        "mean_generated_length": 323.875
    },
    {
        "step": 181,
        "reward": 2.0445327758789062,
        "gold_reward": -3.0692520141601562,
        "kl_divergence": 271.0049743652344,
        "mean_generated_length": 317.375
    },
    {
        "step": 182,
        "reward": 2.5009918212890625,
        "gold_reward": -3.333812713623047,
        "kl_divergence": 247.2896728515625,
        "mean_generated_length": 300.25
    },
    {
        "step": 183,
        "reward": 2.1888580322265625,
        "gold_reward": -3.3786849975585938,
        "kl_divergence": 259.7344665527344,
        "mean_generated_length": 323.125
    },
    {
        "step": 184,
        "reward": 2.148193359375,
        "gold_reward": -3.477447509765625,
        "kl_divergence": 246.31314086914062,
        "mean_generated_length": 303.25
    },
    {
        "step": 185,
        "reward": 2.269634246826172,
        "gold_reward": -3.6980743408203125,
        "kl_divergence": 273.3747863769531,
        "mean_generated_length": 331.125
    },
    {
        "step": 186,
        "reward": 2.233295440673828,
        "gold_reward": -3.4661865234375,
        "kl_divergence": 233.89016723632812,
        "mean_generated_length": 287.5
    },
    {
        "step": 187,
        "reward": 2.404052734375,
        "gold_reward": -3.45843505859375,
        "kl_divergence": 269.4273986816406,
        "mean_generated_length": 324.375
    },
    {
        "step": 188,
        "reward": 2.5778961181640625,
        "gold_reward": -3.283721923828125,
        "kl_divergence": 277.90142822265625,
        "mean_generated_length": 331.0
    },
    {
        "step": 189,
        "reward": 2.255125045776367,
        "gold_reward": -3.58721923828125,
        "kl_divergence": 247.7493133544922,
        "mean_generated_length": 311.5
    },
    {
        "step": 190,
        "reward": 2.1749420166015625,
        "gold_reward": -3.4512557983398438,
        "kl_divergence": 253.532470703125,
        "mean_generated_length": 299.875
    },
    {
        "step": 191,
        "reward": 2.325439453125,
        "gold_reward": -3.42266845703125,
        "kl_divergence": 263.708251953125,
        "mean_generated_length": 314.25
    },
    {
        "step": 192,
        "reward": 2.378976821899414,
        "gold_reward": -3.3367462158203125,
        "kl_divergence": 273.8762512207031,
        "mean_generated_length": 336.5
    },
    {
        "step": 193,
        "reward": 2.609832763671875,
        "gold_reward": -3.2378005981445312,
        "kl_divergence": 304.54248046875,
        "mean_generated_length": 357.75
    },
    {
        "step": 194,
        "reward": 2.0610122680664062,
        "gold_reward": -4.049224853515625,
        "kl_divergence": 262.62615966796875,
        "mean_generated_length": 320.5
    },
    {
        "step": 195,
        "reward": 2.1711959838867188,
        "gold_reward": -3.514404296875,
        "kl_divergence": 258.2840881347656,
        "mean_generated_length": 300.25
    },
    {
        "step": 196,
        "reward": 2.3698577880859375,
        "gold_reward": -3.92669677734375,
        "kl_divergence": 274.44384765625,
        "mean_generated_length": 328.375
    },
    {
        "step": 197,
        "reward": 2.6829051971435547,
        "gold_reward": -3.6490478515625,
        "kl_divergence": 266.2899169921875,
        "mean_generated_length": 324.3125
    },
    {
        "step": 198,
        "reward": 2.2658843994140625,
        "gold_reward": -3.340362548828125,
        "kl_divergence": 272.17974853515625,
        "mean_generated_length": 331.375
    },
    {
        "step": 199,
        "reward": 2.511209487915039,
        "gold_reward": -3.44500732421875,
        "kl_divergence": 291.370849609375,
        "mean_generated_length": 351.75
    },
    {
        "step": 200,
        "reward": 3.072998046875,
        "gold_reward": -3.4041595458984375,
        "kl_divergence": 254.0001678466797,
        "mean_generated_length": 303.5
    },
    {
        "step": 201,
        "reward": 2.411323070526123,
        "gold_reward": -3.54022216796875,
        "kl_divergence": 282.26727294921875,
        "mean_generated_length": 346.125
    },
    {
        "step": 202,
        "reward": 2.076547622680664,
        "gold_reward": -3.74688720703125,
        "kl_divergence": 263.2450256347656,
        "mean_generated_length": 307.75
    },
    {
        "step": 203,
        "reward": 2.21002197265625,
        "gold_reward": -3.8222503662109375,
        "kl_divergence": 238.24261474609375,
        "mean_generated_length": 290.375
    },
    {
        "step": 204,
        "reward": 2.3842926025390625,
        "gold_reward": -3.504669189453125,
        "kl_divergence": 265.4458923339844,
        "mean_generated_length": 323.625
    },
    {
        "step": 205,
        "reward": 2.1874828338623047,
        "gold_reward": -3.7308349609375,
        "kl_divergence": 249.38722229003906,
        "mean_generated_length": 300.0
    },
    {
        "step": 206,
        "reward": 2.842254638671875,
        "gold_reward": -3.1685791015625,
        "kl_divergence": 280.6466064453125,
        "mean_generated_length": 334.25
    },
    {
        "step": 207,
        "reward": 2.484661102294922,
        "gold_reward": -3.42352294921875,
        "kl_divergence": 281.8705139160156,
        "mean_generated_length": 338.5
    },
    {
        "step": 208,
        "reward": 2.65533447265625,
        "gold_reward": -2.9211196899414062,
        "kl_divergence": 264.70465087890625,
        "mean_generated_length": 311.125
    },
    {
        "step": 209,
        "reward": 2.1137237548828125,
        "gold_reward": -3.600921630859375,
        "kl_divergence": 263.9676513671875,
        "mean_generated_length": 321.75
    },
    {
        "step": 210,
        "reward": 1.5642127990722656,
        "gold_reward": -3.878326416015625,
        "kl_divergence": 250.72763061523438,
        "mean_generated_length": 298.875
    },
    {
        "step": 211,
        "reward": 1.952178955078125,
        "gold_reward": -3.6640625,
        "kl_divergence": 284.2773132324219,
        "mean_generated_length": 347.0
    },
    {
        "step": 212,
        "reward": 2.3057708740234375,
        "gold_reward": -3.7965269088745117,
        "kl_divergence": 257.458984375,
        "mean_generated_length": 310.5
    },
    {
        "step": 213,
        "reward": 1.9691238403320312,
        "gold_reward": -3.35821533203125,
        "kl_divergence": 258.9440002441406,
        "mean_generated_length": 315.375
    },
    {
        "step": 214,
        "reward": 1.9468402862548828,
        "gold_reward": -3.5679931640625,
        "kl_divergence": 257.2422180175781,
        "mean_generated_length": 309.125
    },
    {
        "step": 215,
        "reward": 2.771484375,
        "gold_reward": -3.222198486328125,
        "kl_divergence": 267.238525390625,
        "mean_generated_length": 322.0
    },
    {
        "step": 216,
        "reward": 2.951181411743164,
        "gold_reward": -3.445629119873047,
        "kl_divergence": 264.3818664550781,
        "mean_generated_length": 312.25
    },
    {
        "step": 217,
        "reward": 2.256406784057617,
        "gold_reward": -3.465519428253174,
        "kl_divergence": 265.4328308105469,
        "mean_generated_length": 317.625
    },
    {
        "step": 218,
        "reward": 2.187671661376953,
        "gold_reward": -3.3281707763671875,
        "kl_divergence": 270.8213195800781,
        "mean_generated_length": 310.5
    },
    {
        "step": 219,
        "reward": 2.9546103477478027,
        "gold_reward": -3.5872039794921875,
        "kl_divergence": 270.78240966796875,
        "mean_generated_length": 324.75
    },
    {
        "step": 220,
        "reward": 2.4250869750976562,
        "gold_reward": -3.451324462890625,
        "kl_divergence": 267.94842529296875,
        "mean_generated_length": 326.25
    },
    {
        "step": 221,
        "reward": 2.1781210899353027,
        "gold_reward": -3.42578125,
        "kl_divergence": 238.1891632080078,
        "mean_generated_length": 276.125
    },
    {
        "step": 222,
        "reward": 2.5295333862304688,
        "gold_reward": -3.07391357421875,
        "kl_divergence": 270.9518737792969,
        "mean_generated_length": 315.125
    },
    {
        "step": 223,
        "reward": 1.7177925109863281,
        "gold_reward": -4.011566162109375,
        "kl_divergence": 255.16966247558594,
        "mean_generated_length": 305.375
    },
    {
        "step": 224,
        "reward": 1.9611968994140625,
        "gold_reward": -3.30584716796875,
        "kl_divergence": 248.30526733398438,
        "mean_generated_length": 307.625
    },
    {
        "step": 225,
        "reward": 2.8765907287597656,
        "gold_reward": -3.6201906204223633,
        "kl_divergence": 267.5470886230469,
        "mean_generated_length": 328.0
    },
    {
        "step": 226,
        "reward": 2.77301025390625,
        "gold_reward": -3.456911087036133,
        "kl_divergence": 278.5325927734375,
        "mean_generated_length": 326.875
    },
    {
        "step": 227,
        "reward": 2.8765296936035156,
        "gold_reward": -3.5505828857421875,
        "kl_divergence": 270.5703125,
        "mean_generated_length": 337.375
    },
    {
        "step": 228,
        "reward": 2.2795791625976562,
        "gold_reward": -3.5350914001464844,
        "kl_divergence": 265.1873474121094,
        "mean_generated_length": 312.75
    },
    {
        "step": 229,
        "reward": 2.3776683807373047,
        "gold_reward": -3.5353546142578125,
        "kl_divergence": 280.9463195800781,
        "mean_generated_length": 328.5
    },
    {
        "step": 230,
        "reward": 2.7612762451171875,
        "gold_reward": -3.4912109375,
        "kl_divergence": 258.0873107910156,
        "mean_generated_length": 314.625
    },
    {
        "step": 231,
        "reward": 1.7410736083984375,
        "gold_reward": -3.505523681640625,
        "kl_divergence": 241.38616943359375,
        "mean_generated_length": 284.25
    },
    {
        "step": 232,
        "reward": 2.127033233642578,
        "gold_reward": -3.9916000366210938,
        "kl_divergence": 241.91903686523438,
        "mean_generated_length": 291.0
    },
    {
        "step": 233,
        "reward": 2.5951385498046875,
        "gold_reward": -3.6336898803710938,
        "kl_divergence": 261.396728515625,
        "mean_generated_length": 321.25
    },
    {
        "step": 234,
        "reward": 2.088409423828125,
        "gold_reward": -3.6663818359375,
        "kl_divergence": 251.512939453125,
        "mean_generated_length": 314.5
    },
    {
        "step": 235,
        "reward": 2.3279342651367188,
        "gold_reward": -3.3265466690063477,
        "kl_divergence": 278.9927978515625,
        "mean_generated_length": 349.125
    },
    {
        "step": 236,
        "reward": 2.7851715087890625,
        "gold_reward": -3.580078125,
        "kl_divergence": 325.47381591796875,
        "mean_generated_length": 398.375
    }
]