[
    {
        "step": 0,
        "reward": -3.7819061279296875,
        "gold_reward": -3.13934326171875,
        "kl_divergence": 0.0,
        "mean_generated_length": 133.28125
    },
    {
        "step": 1,
        "reward": -3.4259490966796875,
        "gold_reward": -2.917926788330078,
        "kl_divergence": 0.0,
        "mean_generated_length": 127.515625
    },
    {
        "step": 2,
        "reward": -2.942087173461914,
        "gold_reward": -2.863901138305664,
        "kl_divergence": -0.10112552344799042,
        "mean_generated_length": 93.140625
    },
    {
        "step": 3,
        "reward": -3.4418907165527344,
        "gold_reward": -3.2739715576171875,
        "kl_divergence": 0.018151190131902695,
        "mean_generated_length": 130.40625
    },
    {
        "step": 4,
        "reward": -3.2274932861328125,
        "gold_reward": -3.016632080078125,
        "kl_divergence": 0.0332208052277565,
        "mean_generated_length": 124.703125
    },
    {
        "step": 5,
        "reward": -4.105865478515625,
        "gold_reward": -3.33929443359375,
        "kl_divergence": 0.03889075666666031,
        "mean_generated_length": 116.28125
    },
    {
        "step": 6,
        "reward": -3.8859710693359375,
        "gold_reward": -3.3250732421875,
        "kl_divergence": 0.014576919376850128,
        "mean_generated_length": 132.078125
    },
    {
        "step": 7,
        "reward": -3.5267105102539062,
        "gold_reward": -2.9154891967773438,
        "kl_divergence": 0.18012291193008423,
        "mean_generated_length": 124.234375
    },
    {
        "step": 8,
        "reward": -2.6624755859375,
        "gold_reward": -3.1268310546875,
        "kl_divergence": 0.8393707275390625,
        "mean_generated_length": 158.890625
    },
    {
        "step": 9,
        "reward": -3.0726518630981445,
        "gold_reward": -2.725597381591797,
        "kl_divergence": 1.4303672313690186,
        "mean_generated_length": 149.71875
    },
    {
        "step": 10,
        "reward": -2.0628509521484375,
        "gold_reward": -1.8209228515625,
        "kl_divergence": 1.9610316753387451,
        "mean_generated_length": 148.234375
    },
    {
        "step": 11,
        "reward": -2.549468994140625,
        "gold_reward": -2.8828415870666504,
        "kl_divergence": 2.255621910095215,
        "mean_generated_length": 149.234375
    },
    {
        "step": 12,
        "reward": -3.2961082458496094,
        "gold_reward": -2.69384765625,
        "kl_divergence": 2.441132068634033,
        "mean_generated_length": 137.328125
    },
    {
        "step": 13,
        "reward": -1.902475357055664,
        "gold_reward": -2.012115478515625,
        "kl_divergence": 4.036214351654053,
        "mean_generated_length": 132.3125
    },
    {
        "step": 14,
        "reward": -1.901186466217041,
        "gold_reward": -2.272247314453125,
        "kl_divergence": 4.756113529205322,
        "mean_generated_length": 172.265625
    },
    {
        "step": 15,
        "reward": -1.8327522277832031,
        "gold_reward": -2.311359405517578,
        "kl_divergence": 5.946868896484375,
        "mean_generated_length": 165.53125
    },
    {
        "step": 16,
        "reward": -2.8223705291748047,
        "gold_reward": -2.2901229858398438,
        "kl_divergence": 5.849390983581543,
        "mean_generated_length": 167.1875
    },
    {
        "step": 17,
        "reward": -1.9482421875,
        "gold_reward": -2.3909149169921875,
        "kl_divergence": 6.893039226531982,
        "mean_generated_length": 181.546875
    },
    {
        "step": 18,
        "reward": -2.5092849731445312,
        "gold_reward": -2.7340240478515625,
        "kl_divergence": 10.368586540222168,
        "mean_generated_length": 186.796875
    },
    {
        "step": 19,
        "reward": -1.82440185546875,
        "gold_reward": -2.639129638671875,
        "kl_divergence": 10.317743301391602,
        "mean_generated_length": 175.15625
    },
    {
        "step": 20,
        "reward": -2.3094305992126465,
        "gold_reward": -2.454010009765625,
        "kl_divergence": 14.126115798950195,
        "mean_generated_length": 210.53125
    },
    {
        "step": 21,
        "reward": -1.2886695861816406,
        "gold_reward": -1.8406105041503906,
        "kl_divergence": 13.686912536621094,
        "mean_generated_length": 178.921875
    },
    {
        "step": 22,
        "reward": -1.6991424560546875,
        "gold_reward": -1.821502685546875,
        "kl_divergence": 16.00798797607422,
        "mean_generated_length": 190.078125
    },
    {
        "step": 23,
        "reward": -1.480194091796875,
        "gold_reward": -2.163604736328125,
        "kl_divergence": 17.60043716430664,
        "mean_generated_length": 207.625
    },
    {
        "step": 24,
        "reward": -2.129852294921875,
        "gold_reward": -2.26104736328125,
        "kl_divergence": 19.844175338745117,
        "mean_generated_length": 197.859375
    },
    {
        "step": 25,
        "reward": -1.4664669036865234,
        "gold_reward": -2.436248779296875,
        "kl_divergence": 28.133155822753906,
        "mean_generated_length": 254.125
    },
    {
        "step": 26,
        "reward": -1.932891845703125,
        "gold_reward": -2.4501800537109375,
        "kl_divergence": 25.071392059326172,
        "mean_generated_length": 232.75
    },
    {
        "step": 27,
        "reward": -0.856353759765625,
        "gold_reward": -2.3194198608398438,
        "kl_divergence": 32.777000427246094,
        "mean_generated_length": 268.28125
    },
    {
        "step": 28,
        "reward": -1.4160022735595703,
        "gold_reward": -2.313507080078125,
        "kl_divergence": 33.95976257324219,
        "mean_generated_length": 244.78125
    },
    {
        "step": 29,
        "reward": -1.3033409118652344,
        "gold_reward": -2.282726287841797,
        "kl_divergence": 36.250160217285156,
        "mean_generated_length": 274.453125
    },
    {
        "step": 30,
        "reward": -1.6526603698730469,
        "gold_reward": -2.557281494140625,
        "kl_divergence": 45.055294036865234,
        "mean_generated_length": 308.359375
    },
    {
        "step": 31,
        "reward": -1.372197151184082,
        "gold_reward": -2.4918212890625,
        "kl_divergence": 37.14872741699219,
        "mean_generated_length": 291.1875
    },
    {
        "step": 32,
        "reward": -1.72894287109375,
        "gold_reward": -2.6040725708007812,
        "kl_divergence": 41.20378112792969,
        "mean_generated_length": 294.796875
    },
    {
        "step": 33,
        "reward": -1.2940330505371094,
        "gold_reward": -2.0692138671875,
        "kl_divergence": 43.25074768066406,
        "mean_generated_length": 288.390625
    },
    {
        "step": 34,
        "reward": -1.3062744140625,
        "gold_reward": -2.3810501098632812,
        "kl_divergence": 43.51288604736328,
        "mean_generated_length": 316.34375
    },
    {
        "step": 35,
        "reward": -1.9979476928710938,
        "gold_reward": -2.363677978515625,
        "kl_divergence": 52.33977508544922,
        "mean_generated_length": 330.1875
    },
    {
        "step": 36,
        "reward": -1.2786474227905273,
        "gold_reward": -3.00091552734375,
        "kl_divergence": 38.18295669555664,
        "mean_generated_length": 303.890625
    },
    {
        "step": 37,
        "reward": -1.6795024871826172,
        "gold_reward": -2.8555908203125,
        "kl_divergence": 42.763916015625,
        "mean_generated_length": 285.75
    },
    {
        "step": 38,
        "reward": -1.4542465209960938,
        "gold_reward": -2.9466552734375,
        "kl_divergence": 44.415443420410156,
        "mean_generated_length": 308.6875
    },
    {
        "step": 39,
        "reward": -0.96820068359375,
        "gold_reward": -2.4169235229492188,
        "kl_divergence": 43.248138427734375,
        "mean_generated_length": 300.875
    },
    {
        "step": 40,
        "reward": -1.876861572265625,
        "gold_reward": -2.6024420261383057,
        "kl_divergence": 53.08441925048828,
        "mean_generated_length": 300.390625
    },
    {
        "step": 41,
        "reward": -1.4191617965698242,
        "gold_reward": -2.0666046142578125,
        "kl_divergence": 44.40256118774414,
        "mean_generated_length": 287.84375
    },
    {
        "step": 42,
        "reward": -1.2355823516845703,
        "gold_reward": -2.2382583618164062,
        "kl_divergence": 38.68030548095703,
        "mean_generated_length": 265.53125
    },
    {
        "step": 43,
        "reward": -0.928098201751709,
        "gold_reward": -2.3570632934570312,
        "kl_divergence": 39.96193313598633,
        "mean_generated_length": 270.984375
    },
    {
        "step": 44,
        "reward": -1.0511398315429688,
        "gold_reward": -1.90789794921875,
        "kl_divergence": 35.461578369140625,
        "mean_generated_length": 217.625
    },
    {
        "step": 45,
        "reward": -0.7481880187988281,
        "gold_reward": -2.46978759765625,
        "kl_divergence": 33.129364013671875,
        "mean_generated_length": 227.078125
    },
    {
        "step": 46,
        "reward": -1.5903987884521484,
        "gold_reward": -2.3651657104492188,
        "kl_divergence": 33.99646759033203,
        "mean_generated_length": 217.296875
    },
    {
        "step": 47,
        "reward": -0.9342937469482422,
        "gold_reward": -2.3063621520996094,
        "kl_divergence": 30.593826293945312,
        "mean_generated_length": 215.390625
    },
    {
        "step": 48,
        "reward": -0.8026962280273438,
        "gold_reward": -1.7902498245239258,
        "kl_divergence": 35.94743728637695,
        "mean_generated_length": 221.3125
    },
    {
        "step": 49,
        "reward": -1.3890361785888672,
        "gold_reward": -2.1924972534179688,
        "kl_divergence": 34.904109954833984,
        "mean_generated_length": 225.4375
    },
    {
        "step": 50,
        "reward": -1.513763427734375,
        "gold_reward": -1.7301902770996094,
        "kl_divergence": 29.188779830932617,
        "mean_generated_length": 174.265625
    },
    {
        "step": 51,
        "reward": -1.478668212890625,
        "gold_reward": -2.5700645446777344,
        "kl_divergence": 31.097759246826172,
        "mean_generated_length": 218.265625
    },
    {
        "step": 52,
        "reward": -1.766719102859497,
        "gold_reward": -2.7664794921875,
        "kl_divergence": 33.267677307128906,
        "mean_generated_length": 212.0625
    },
    {
        "step": 53,
        "reward": -1.1942977905273438,
        "gold_reward": -1.8514366149902344,
        "kl_divergence": 27.899066925048828,
        "mean_generated_length": 209.0625
    },
    {
        "step": 54,
        "reward": -1.4883346557617188,
        "gold_reward": -2.375701904296875,
        "kl_divergence": 29.451873779296875,
        "mean_generated_length": 190.609375
    },
    {
        "step": 55,
        "reward": -1.2828369140625,
        "gold_reward": -2.1248931884765625,
        "kl_divergence": 26.99425506591797,
        "mean_generated_length": 191.984375
    },
    {
        "step": 56,
        "reward": -1.5489978790283203,
        "gold_reward": -2.3253087997436523,
        "kl_divergence": 31.042251586914062,
        "mean_generated_length": 216.375
    },
    {
        "step": 57,
        "reward": -1.8552093505859375,
        "gold_reward": -2.3557968139648438,
        "kl_divergence": 23.241382598876953,
        "mean_generated_length": 157.953125
    },
    {
        "step": 58,
        "reward": -1.189478874206543,
        "gold_reward": -2.292144775390625,
        "kl_divergence": 26.533634185791016,
        "mean_generated_length": 181.859375
    },
    {
        "step": 59,
        "reward": -1.4849853515625,
        "gold_reward": -2.0836715698242188,
        "kl_divergence": 26.18781089782715,
        "mean_generated_length": 174.765625
    },
    {
        "step": 60,
        "reward": -1.3517379760742188,
        "gold_reward": -1.860392451286316,
        "kl_divergence": 28.400976181030273,
        "mean_generated_length": 178.546875
    },
    {
        "step": 61,
        "reward": -1.1102943420410156,
        "gold_reward": -2.0396957397460938,
        "kl_divergence": 31.180179595947266,
        "mean_generated_length": 211.171875
    },
    {
        "step": 62,
        "reward": -1.31597900390625,
        "gold_reward": -2.3054351806640625,
        "kl_divergence": 32.90956115722656,
        "mean_generated_length": 201.390625
    },
    {
        "step": 63,
        "reward": -1.6158599853515625,
        "gold_reward": -2.700655937194824,
        "kl_divergence": 34.39444351196289,
        "mean_generated_length": 193.0625
    },
    {
        "step": 64,
        "reward": -0.9317455291748047,
        "gold_reward": -1.675374984741211,
        "kl_divergence": 36.02251434326172,
        "mean_generated_length": 196.875
    },
    {
        "step": 65,
        "reward": -0.729435920715332,
        "gold_reward": -2.5096054077148438,
        "kl_divergence": 36.35591506958008,
        "mean_generated_length": 222.1875
    },
    {
        "step": 66,
        "reward": -1.062723159790039,
        "gold_reward": -2.004413604736328,
        "kl_divergence": 37.337738037109375,
        "mean_generated_length": 207.0
    },
    {
        "step": 67,
        "reward": -0.6669921875,
        "gold_reward": -2.3693695068359375,
        "kl_divergence": 41.27640151977539,
        "mean_generated_length": 239.28125
    },
    {
        "step": 68,
        "reward": -0.3501548767089844,
        "gold_reward": -1.5313186645507812,
        "kl_divergence": 43.74783706665039,
        "mean_generated_length": 230.1875
    },
    {
        "step": 69,
        "reward": -0.7942962646484375,
        "gold_reward": -2.0571136474609375,
        "kl_divergence": 40.359947204589844,
        "mean_generated_length": 236.90625
    },
    {
        "step": 70,
        "reward": -0.4692840576171875,
        "gold_reward": -2.3637237548828125,
        "kl_divergence": 41.29473876953125,
        "mean_generated_length": 242.859375
    },
    {
        "step": 71,
        "reward": -1.1563472747802734,
        "gold_reward": -2.2179336547851562,
        "kl_divergence": 48.29375457763672,
        "mean_generated_length": 266.734375
    },
    {
        "step": 72,
        "reward": -0.8139528036117554,
        "gold_reward": -2.179443359375,
        "kl_divergence": 50.93128204345703,
        "mean_generated_length": 251.203125
    },
    {
        "step": 73,
        "reward": -0.7670783996582031,
        "gold_reward": -2.3312716484069824,
        "kl_divergence": 50.88926696777344,
        "mean_generated_length": 241.40625
    },
    {
        "step": 74,
        "reward": -0.8307647705078125,
        "gold_reward": -2.548551559448242,
        "kl_divergence": 41.72731018066406,
        "mean_generated_length": 264.109375
    },
    {
        "step": 75,
        "reward": -0.1179351806640625,
        "gold_reward": -2.1122055053710938,
        "kl_divergence": 48.72281265258789,
        "mean_generated_length": 261.671875
    },
    {
        "step": 76,
        "reward": -0.9520225524902344,
        "gold_reward": -2.358356475830078,
        "kl_divergence": 47.112548828125,
        "mean_generated_length": 264.953125
    },
    {
        "step": 77,
        "reward": 0.15090179443359375,
        "gold_reward": -1.835357666015625,
        "kl_divergence": 49.4852180480957,
        "mean_generated_length": 308.53125
    },
    {
        "step": 78,
        "reward": -0.977783203125,
        "gold_reward": -1.5350341796875,
        "kl_divergence": 64.81128692626953,
        "mean_generated_length": 337.0
    },
    {
        "step": 79,
        "reward": -1.3050804138183594,
        "gold_reward": -2.435701847076416,
        "kl_divergence": 49.05644989013672,
        "mean_generated_length": 286.21875
    },
    {
        "step": 80,
        "reward": -0.2644786834716797,
        "gold_reward": -2.3304595947265625,
        "kl_divergence": 51.62613296508789,
        "mean_generated_length": 286.65625
    },
    {
        "step": 81,
        "reward": -1.36126708984375,
        "gold_reward": -2.1849021911621094,
        "kl_divergence": 56.12855529785156,
        "mean_generated_length": 292.109375
    },
    {
        "step": 82,
        "reward": -1.0072784423828125,
        "gold_reward": -2.609405517578125,
        "kl_divergence": 55.688140869140625,
        "mean_generated_length": 297.1875
    },
    {
        "step": 83,
        "reward": -0.9306907653808594,
        "gold_reward": -2.61065673828125,
        "kl_divergence": 52.49040222167969,
        "mean_generated_length": 280.640625
    },
    {
        "step": 84,
        "reward": -1.4883289337158203,
        "gold_reward": -2.5523509979248047,
        "kl_divergence": 54.271060943603516,
        "mean_generated_length": 271.390625
    },
    {
        "step": 85,
        "reward": -1.59442138671875,
        "gold_reward": -2.78167724609375,
        "kl_divergence": 51.98044967651367,
        "mean_generated_length": 293.46875
    },
    {
        "step": 86,
        "reward": -1.0232429504394531,
        "gold_reward": -2.3869476318359375,
        "kl_divergence": 47.285789489746094,
        "mean_generated_length": 256.75
    },
    {
        "step": 87,
        "reward": -0.52069091796875,
        "gold_reward": -2.424072265625,
        "kl_divergence": 42.78155517578125,
        "mean_generated_length": 260.34375
    },
    {
        "step": 88,
        "reward": -1.099508285522461,
        "gold_reward": -1.8910980224609375,
        "kl_divergence": 48.01165771484375,
        "mean_generated_length": 247.015625
    },
    {
        "step": 89,
        "reward": -0.3725242614746094,
        "gold_reward": -1.6226730346679688,
        "kl_divergence": 36.89651107788086,
        "mean_generated_length": 223.4375
    },
    {
        "step": 90,
        "reward": -0.9625043869018555,
        "gold_reward": -2.5684814453125,
        "kl_divergence": 41.8519172668457,
        "mean_generated_length": 227.71875
    },
    {
        "step": 91,
        "reward": -1.4509544372558594,
        "gold_reward": -2.0609779357910156,
        "kl_divergence": 42.5269889831543,
        "mean_generated_length": 230.015625
    },
    {
        "step": 92,
        "reward": -0.46323394775390625,
        "gold_reward": -1.64739990234375,
        "kl_divergence": 42.23017883300781,
        "mean_generated_length": 207.359375
    },
    {
        "step": 93,
        "reward": -1.26298189163208,
        "gold_reward": -2.320281982421875,
        "kl_divergence": 43.910484313964844,
        "mean_generated_length": 237.171875
    },
    {
        "step": 94,
        "reward": -1.0178985595703125,
        "gold_reward": -2.4927597045898438,
        "kl_divergence": 48.46390151977539,
        "mean_generated_length": 237.546875
    },
    {
        "step": 95,
        "reward": -1.023688793182373,
        "gold_reward": -2.237813949584961,
        "kl_divergence": 46.7187385559082,
        "mean_generated_length": 237.328125
    },
    {
        "step": 96,
        "reward": -0.5098028182983398,
        "gold_reward": -2.0853500366210938,
        "kl_divergence": 43.88642120361328,
        "mean_generated_length": 246.015625
    },
    {
        "step": 97,
        "reward": -1.4850006103515625,
        "gold_reward": -2.7342529296875,
        "kl_divergence": 47.1498908996582,
        "mean_generated_length": 253.40625
    },
    {
        "step": 98,
        "reward": -1.2466344833374023,
        "gold_reward": -2.664764404296875,
        "kl_divergence": 41.54400634765625,
        "mean_generated_length": 215.59375
    },
    {
        "step": 99,
        "reward": -1.087860107421875,
        "gold_reward": -2.3921661376953125,
        "kl_divergence": 45.70686340332031,
        "mean_generated_length": 241.359375
    },
    {
        "step": 100,
        "reward": -0.19153308868408203,
        "gold_reward": -2.0021743774414062,
        "kl_divergence": 39.29399108886719,
        "mean_generated_length": 205.34375
    },
    {
        "step": 101,
        "reward": -0.975433349609375,
        "gold_reward": -1.945425033569336,
        "kl_divergence": 42.285003662109375,
        "mean_generated_length": 210.421875
    },
    {
        "step": 102,
        "reward": -0.4961700439453125,
        "gold_reward": -1.8003082275390625,
        "kl_divergence": 35.91581344604492,
        "mean_generated_length": 205.3125
    },
    {
        "step": 103,
        "reward": -1.2935352325439453,
        "gold_reward": -2.182586669921875,
        "kl_divergence": 35.07163619995117,
        "mean_generated_length": 188.171875
    },
    {
        "step": 104,
        "reward": -1.0602705478668213,
        "gold_reward": -2.3313159942626953,
        "kl_divergence": 42.195655822753906,
        "mean_generated_length": 224.890625
    },
    {
        "step": 105,
        "reward": -1.02752685546875,
        "gold_reward": -2.361541748046875,
        "kl_divergence": 39.03120040893555,
        "mean_generated_length": 193.546875
    },
    {
        "step": 106,
        "reward": -1.4151573181152344,
        "gold_reward": -2.6866531372070312,
        "kl_divergence": 38.60729217529297,
        "mean_generated_length": 222.53125
    },
    {
        "step": 107,
        "reward": -0.6449737548828125,
        "gold_reward": -2.0293426513671875,
        "kl_divergence": 45.45258712768555,
        "mean_generated_length": 213.59375
    },
    {
        "step": 108,
        "reward": -0.9819583892822266,
        "gold_reward": -2.330322265625,
        "kl_divergence": 42.815818786621094,
        "mean_generated_length": 213.9375
    },
    {
        "step": 109,
        "reward": -1.1153755187988281,
        "gold_reward": -2.0556869506835938,
        "kl_divergence": 53.091453552246094,
        "mean_generated_length": 236.40625
    },
    {
        "step": 110,
        "reward": -0.8603553771972656,
        "gold_reward": -2.300511360168457,
        "kl_divergence": 51.94776916503906,
        "mean_generated_length": 249.796875
    },
    {
        "step": 111,
        "reward": -0.911407470703125,
        "gold_reward": -2.0142059326171875,
        "kl_divergence": 53.08163070678711,
        "mean_generated_length": 221.96875
    },
    {
        "step": 112,
        "reward": -1.0499420166015625,
        "gold_reward": -2.4998779296875,
        "kl_divergence": 53.86075973510742,
        "mean_generated_length": 245.21875
    },
    {
        "step": 113,
        "reward": -1.0023422241210938,
        "gold_reward": -1.9786376953125,
        "kl_divergence": 61.668190002441406,
        "mean_generated_length": 279.3125
    },
    {
        "step": 114,
        "reward": -1.5706634521484375,
        "gold_reward": -2.1989707946777344,
        "kl_divergence": 60.0496826171875,
        "mean_generated_length": 247.796875
    },
    {
        "step": 115,
        "reward": -0.463897705078125,
        "gold_reward": -2.6076202392578125,
        "kl_divergence": 58.64997863769531,
        "mean_generated_length": 280.40625
    },
    {
        "step": 116,
        "reward": -0.712266206741333,
        "gold_reward": -2.3252811431884766,
        "kl_divergence": 58.07959747314453,
        "mean_generated_length": 241.125
    },
    {
        "step": 117,
        "reward": -1.1063976287841797,
        "gold_reward": -2.7503814697265625,
        "kl_divergence": 62.350807189941406,
        "mean_generated_length": 295.328125
    },
    {
        "step": 118,
        "reward": -1.0860252380371094,
        "gold_reward": -2.2749481201171875,
        "kl_divergence": 59.98896408081055,
        "mean_generated_length": 253.53125
    },
    {
        "step": 119,
        "reward": -1.601593017578125,
        "gold_reward": -2.7477760314941406,
        "kl_divergence": 62.631622314453125,
        "mean_generated_length": 271.90625
    },
    {
        "step": 120,
        "reward": -1.3246231079101562,
        "gold_reward": -2.2208251953125,
        "kl_divergence": 60.61607360839844,
        "mean_generated_length": 276.84375
    },
    {
        "step": 121,
        "reward": -1.0414886474609375,
        "gold_reward": -2.5518798828125,
        "kl_divergence": 54.90904235839844,
        "mean_generated_length": 259.890625
    },
    {
        "step": 122,
        "reward": -1.3132343292236328,
        "gold_reward": -2.7633209228515625,
        "kl_divergence": 70.81959533691406,
        "mean_generated_length": 309.546875
    },
    {
        "step": 123,
        "reward": -0.6058082580566406,
        "gold_reward": -2.6854171752929688,
        "kl_divergence": 63.910362243652344,
        "mean_generated_length": 279.078125
    },
    {
        "step": 124,
        "reward": -0.3263707160949707,
        "gold_reward": -2.3821487426757812,
        "kl_divergence": 52.46712112426758,
        "mean_generated_length": 239.78125
    },
    {
        "step": 125,
        "reward": -0.9031753540039062,
        "gold_reward": -2.1901321411132812,
        "kl_divergence": 49.53785705566406,
        "mean_generated_length": 213.734375
    },
    {
        "step": 126,
        "reward": -0.5709686279296875,
        "gold_reward": -2.1377334594726562,
        "kl_divergence": 43.4412841796875,
        "mean_generated_length": 194.921875
    },
    {
        "step": 127,
        "reward": -0.7204666137695312,
        "gold_reward": -2.0358710289001465,
        "kl_divergence": 36.999549865722656,
        "mean_generated_length": 176.015625
    },
    {
        "step": 128,
        "reward": -0.775421142578125,
        "gold_reward": -1.875082015991211,
        "kl_divergence": 35.8177375793457,
        "mean_generated_length": 212.984375
    },
    {
        "step": 129,
        "reward": -0.6702709197998047,
        "gold_reward": -1.7743802070617676,
        "kl_divergence": 37.02277374267578,
        "mean_generated_length": 170.125
    },
    {
        "step": 130,
        "reward": -1.2687339782714844,
        "gold_reward": -2.4037322998046875,
        "kl_divergence": 42.567813873291016,
        "mean_generated_length": 213.171875
    },
    {
        "step": 131,
        "reward": -0.6964874267578125,
        "gold_reward": -2.103531837463379,
        "kl_divergence": 34.471214294433594,
        "mean_generated_length": 182.84375
    },
    {
        "step": 132,
        "reward": -1.0170249938964844,
        "gold_reward": -2.0699939727783203,
        "kl_divergence": 32.81340026855469,
        "mean_generated_length": 176.6875
    },
    {
        "step": 133,
        "reward": -1.0056343078613281,
        "gold_reward": -2.2649612426757812,
        "kl_divergence": 37.90582275390625,
        "mean_generated_length": 171.875
    },
    {
        "step": 134,
        "reward": -0.5042533874511719,
        "gold_reward": -1.6807022094726562,
        "kl_divergence": 32.983829498291016,
        "mean_generated_length": 183.25
    },
    {
        "step": 135,
        "reward": -1.3537750244140625,
        "gold_reward": -2.5078506469726562,
        "kl_divergence": 37.51255798339844,
        "mean_generated_length": 185.046875
    },
    {
        "step": 136,
        "reward": -0.911834716796875,
        "gold_reward": -2.019075870513916,
        "kl_divergence": 33.46611785888672,
        "mean_generated_length": 175.734375
    },
    {
        "step": 137,
        "reward": -1.0513153076171875,
        "gold_reward": -1.7406110763549805,
        "kl_divergence": 32.77268981933594,
        "mean_generated_length": 177.234375
    },
    {
        "step": 138,
        "reward": -1.4026689529418945,
        "gold_reward": -2.010772705078125,
        "kl_divergence": 36.20962905883789,
        "mean_generated_length": 172.796875
    },
    {
        "step": 139,
        "reward": -0.9345383644104004,
        "gold_reward": -1.8307723999023438,
        "kl_divergence": 36.72357177734375,
        "mean_generated_length": 184.28125
    },
    {
        "step": 140,
        "reward": -0.6115798950195312,
        "gold_reward": -2.0753402709960938,
        "kl_divergence": 38.62977981567383,
        "mean_generated_length": 196.484375
    },
    {
        "step": 141,
        "reward": -1.5910263061523438,
        "gold_reward": -2.516510009765625,
        "kl_divergence": 34.434844970703125,
        "mean_generated_length": 169.09375
    },
    {
        "step": 142,
        "reward": -0.8889007568359375,
        "gold_reward": -2.3536376953125,
        "kl_divergence": 32.52626419067383,
        "mean_generated_length": 171.625
    },
    {
        "step": 143,
        "reward": -0.896575927734375,
        "gold_reward": -1.7881927490234375,
        "kl_divergence": 37.43942642211914,
        "mean_generated_length": 181.25
    },
    {
        "step": 144,
        "reward": -0.9808273315429688,
        "gold_reward": -2.9388999938964844,
        "kl_divergence": 42.81034851074219,
        "mean_generated_length": 202.78125
    },
    {
        "step": 145,
        "reward": -1.1192712783813477,
        "gold_reward": -2.3589515686035156,
        "kl_divergence": 37.42396545410156,
        "mean_generated_length": 191.203125
    },
    {
        "step": 146,
        "reward": -1.2168312072753906,
        "gold_reward": -2.817584991455078,
        "kl_divergence": 44.1016845703125,
        "mean_generated_length": 220.25
    },
    {
        "step": 147,
        "reward": -0.07152366638183594,
        "gold_reward": -2.0691375732421875,
        "kl_divergence": 42.143463134765625,
        "mean_generated_length": 180.921875
    },
    {
        "step": 148,
        "reward": -0.6763763427734375,
        "gold_reward": -2.418609619140625,
        "kl_divergence": 39.58769989013672,
        "mean_generated_length": 189.453125
    },
    {
        "step": 149,
        "reward": -0.7630767822265625,
        "gold_reward": -2.328695297241211,
        "kl_divergence": 34.23202896118164,
        "mean_generated_length": 173.90625
    },
    {
        "step": 150,
        "reward": -1.4552154541015625,
        "gold_reward": -2.2437591552734375,
        "kl_divergence": 36.54349136352539,
        "mean_generated_length": 173.640625
    },
    {
        "step": 151,
        "reward": -0.5171079635620117,
        "gold_reward": -1.8425140380859375,
        "kl_divergence": 38.730384826660156,
        "mean_generated_length": 188.6875
    },
    {
        "step": 152,
        "reward": -1.1095542907714844,
        "gold_reward": -2.0851821899414062,
        "kl_divergence": 34.556705474853516,
        "mean_generated_length": 156.15625
    },
    {
        "step": 153,
        "reward": -0.9790191650390625,
        "gold_reward": -2.560029983520508,
        "kl_divergence": 31.69156265258789,
        "mean_generated_length": 191.484375
    },
    {
        "step": 154,
        "reward": -0.4107217788696289,
        "gold_reward": -2.521697998046875,
        "kl_divergence": 32.74375534057617,
        "mean_generated_length": 173.921875
    },
    {
        "step": 155,
        "reward": -1.4845123291015625,
        "gold_reward": -2.3016128540039062,
        "kl_divergence": 35.95720672607422,
        "mean_generated_length": 178.9375
    },
    {
        "step": 156,
        "reward": -1.0025272369384766,
        "gold_reward": -1.829498291015625,
        "kl_divergence": 39.24026870727539,
        "mean_generated_length": 223.421875
    },
    {
        "step": 157,
        "reward": 0.945556640625,
        "gold_reward": -1.304443359375,
        "kl_divergence": 31.0733642578125,
        "mean_generated_length": 169.125
    },
    {
        "step": 158,
        "reward": -1.0373802185058594,
        "gold_reward": -2.17620849609375,
        "kl_divergence": 39.1625862121582,
        "mean_generated_length": 191.703125
    },
    {
        "step": 159,
        "reward": -1.2415428161621094,
        "gold_reward": -2.419921875,
        "kl_divergence": 40.165958404541016,
        "mean_generated_length": 191.796875
    },
    {
        "step": 160,
        "reward": -0.4534912109375,
        "gold_reward": -1.8757781982421875,
        "kl_divergence": 41.44477081298828,
        "mean_generated_length": 180.546875
    },
    {
        "step": 161,
        "reward": -0.9453811645507812,
        "gold_reward": -2.4071502685546875,
        "kl_divergence": 41.79751968383789,
        "mean_generated_length": 207.609375
    },
    {
        "step": 162,
        "reward": -0.8989801406860352,
        "gold_reward": -2.4893035888671875,
        "kl_divergence": 40.43056106567383,
        "mean_generated_length": 188.484375
    },
    {
        "step": 163,
        "reward": -1.3911170959472656,
        "gold_reward": -2.6073732376098633,
        "kl_divergence": 43.93703079223633,
        "mean_generated_length": 194.40625
    },
    {
        "step": 164,
        "reward": -0.8231201171875,
        "gold_reward": -2.6922988891601562,
        "kl_divergence": 41.08013153076172,
        "mean_generated_length": 210.34375
    },
    {
        "step": 165,
        "reward": -0.6897487640380859,
        "gold_reward": -2.0385971069335938,
        "kl_divergence": 39.12681198120117,
        "mean_generated_length": 191.3125
    },
    {
        "step": 166,
        "reward": -0.22110939025878906,
        "gold_reward": -2.2643213272094727,
        "kl_divergence": 40.78740310668945,
        "mean_generated_length": 207.125
    },
    {
        "step": 167,
        "reward": 0.079437255859375,
        "gold_reward": -1.3073844909667969,
        "kl_divergence": 42.47435760498047,
        "mean_generated_length": 196.671875
    },
    {
        "step": 168,
        "reward": -0.09696197509765625,
        "gold_reward": -1.3095808029174805,
        "kl_divergence": 39.022193908691406,
        "mean_generated_length": 184.734375
    },
    {
        "step": 169,
        "reward": -0.21198034286499023,
        "gold_reward": -2.1209678649902344,
        "kl_divergence": 38.56840515136719,
        "mean_generated_length": 189.359375
    },
    {
        "step": 170,
        "reward": -0.75201416015625,
        "gold_reward": -1.7763404846191406,
        "kl_divergence": 41.0496711730957,
        "mean_generated_length": 206.140625
    },
    {
        "step": 171,
        "reward": -0.01247406005859375,
        "gold_reward": -1.0599517822265625,
        "kl_divergence": 43.77490234375,
        "mean_generated_length": 170.171875
    },
    {
        "step": 172,
        "reward": -0.420867919921875,
        "gold_reward": -1.824066162109375,
        "kl_divergence": 42.765682220458984,
        "mean_generated_length": 204.21875
    },
    {
        "step": 173,
        "reward": -0.6044950485229492,
        "gold_reward": -1.9654731750488281,
        "kl_divergence": 46.118873596191406,
        "mean_generated_length": 189.78125
    },
    {
        "step": 174,
        "reward": -0.4166259765625,
        "gold_reward": -2.1412696838378906,
        "kl_divergence": 44.85832595825195,
        "mean_generated_length": 209.9375
    },
    {
        "step": 175,
        "reward": -0.058940887451171875,
        "gold_reward": -1.710390567779541,
        "kl_divergence": 44.508888244628906,
        "mean_generated_length": 205.984375
    },
    {
        "step": 176,
        "reward": -0.9166412353515625,
        "gold_reward": -2.53619384765625,
        "kl_divergence": 46.061004638671875,
        "mean_generated_length": 217.703125
    },
    {
        "step": 177,
        "reward": -0.334686279296875,
        "gold_reward": -2.478607177734375,
        "kl_divergence": 49.18852233886719,
        "mean_generated_length": 214.703125
    },
    {
        "step": 178,
        "reward": -0.4894866943359375,
        "gold_reward": -2.020172119140625,
        "kl_divergence": 52.30464172363281,
        "mean_generated_length": 223.4375
    },
    {
        "step": 179,
        "reward": 0.5794939994812012,
        "gold_reward": -1.661346435546875,
        "kl_divergence": 39.644813537597656,
        "mean_generated_length": 191.46875
    },
    {
        "step": 180,
        "reward": -0.5012931823730469,
        "gold_reward": -1.9822845458984375,
        "kl_divergence": 52.036476135253906,
        "mean_generated_length": 210.078125
    },
    {
        "step": 181,
        "reward": 0.31268787384033203,
        "gold_reward": -1.2061767578125,
        "kl_divergence": 42.10980224609375,
        "mean_generated_length": 183.640625
    },
    {
        "step": 182,
        "reward": -0.4081108570098877,
        "gold_reward": -1.8748550415039062,
        "kl_divergence": 44.28519821166992,
        "mean_generated_length": 192.234375
    },
    {
        "step": 183,
        "reward": 0.17246437072753906,
        "gold_reward": -1.7714920043945312,
        "kl_divergence": 44.30254364013672,
        "mean_generated_length": 204.84375
    },
    {
        "step": 184,
        "reward": -0.7697219848632812,
        "gold_reward": -2.20989990234375,
        "kl_divergence": 43.906044006347656,
        "mean_generated_length": 199.15625
    },
    {
        "step": 185,
        "reward": -0.4843597412109375,
        "gold_reward": -2.267354965209961,
        "kl_divergence": 43.3767204284668,
        "mean_generated_length": 213.03125
    },
    {
        "step": 186,
        "reward": -0.207855224609375,
        "gold_reward": -1.8126220703125,
        "kl_divergence": 43.13969039916992,
        "mean_generated_length": 182.265625
    },
    {
        "step": 187,
        "reward": 0.16030144691467285,
        "gold_reward": -1.8829460144042969,
        "kl_divergence": 45.96183776855469,
        "mean_generated_length": 226.5
    },
    {
        "step": 188,
        "reward": -0.1029815673828125,
        "gold_reward": -1.6970491409301758,
        "kl_divergence": 47.12307357788086,
        "mean_generated_length": 203.953125
    },
    {
        "step": 189,
        "reward": -0.4950294494628906,
        "gold_reward": -2.0119476318359375,
        "kl_divergence": 47.256290435791016,
        "mean_generated_length": 219.921875
    },
    {
        "step": 190,
        "reward": 0.036407470703125,
        "gold_reward": -1.713958740234375,
        "kl_divergence": 47.8005256652832,
        "mean_generated_length": 207.5
    },
    {
        "step": 191,
        "reward": -0.2420552372932434,
        "gold_reward": -1.9484134912490845,
        "kl_divergence": 49.30671691894531,
        "mean_generated_length": 207.71875
    },
    {
        "step": 192,
        "reward": 0.3845405578613281,
        "gold_reward": -1.4977188110351562,
        "kl_divergence": 46.81734848022461,
        "mean_generated_length": 216.5
    },
    {
        "step": 193,
        "reward": -0.45336151123046875,
        "gold_reward": -1.822174072265625,
        "kl_divergence": 47.549198150634766,
        "mean_generated_length": 210.03125
    },
    {
        "step": 194,
        "reward": 0.3674888610839844,
        "gold_reward": -2.4274110794067383,
        "kl_divergence": 47.00420379638672,
        "mean_generated_length": 231.78125
    },
    {
        "step": 195,
        "reward": 0.13455772399902344,
        "gold_reward": -2.2048325538635254,
        "kl_divergence": 41.41944885253906,
        "mean_generated_length": 197.984375
    },
    {
        "step": 196,
        "reward": -0.12329578399658203,
        "gold_reward": -2.593536376953125,
        "kl_divergence": 51.91798782348633,
        "mean_generated_length": 228.078125
    },
    {
        "step": 197,
        "reward": 0.2620878219604492,
        "gold_reward": -1.5892562866210938,
        "kl_divergence": 49.406394958496094,
        "mean_generated_length": 210.390625
    },
    {
        "step": 198,
        "reward": 0.1623363494873047,
        "gold_reward": -1.7926025390625,
        "kl_divergence": 49.82994079589844,
        "mean_generated_length": 226.71875
    },
    {
        "step": 199,
        "reward": -0.0937042236328125,
        "gold_reward": -1.6418228149414062,
        "kl_divergence": 52.48539352416992,
        "mean_generated_length": 234.71875
    },
    {
        "step": 200,
        "reward": 0.43262481689453125,
        "gold_reward": -1.6741790771484375,
        "kl_divergence": 47.762916564941406,
        "mean_generated_length": 228.296875
    },
    {
        "step": 201,
        "reward": -0.4520416259765625,
        "gold_reward": -1.95849609375,
        "kl_divergence": 45.76906204223633,
        "mean_generated_length": 222.25
    },
    {
        "step": 202,
        "reward": -0.12378501892089844,
        "gold_reward": -1.8098561763763428,
        "kl_divergence": 47.5084228515625,
        "mean_generated_length": 201.703125
    },
    {
        "step": 203,
        "reward": 0.7539215087890625,
        "gold_reward": -2.2196044921875,
        "kl_divergence": 46.13774490356445,
        "mean_generated_length": 215.75
    },
    {
        "step": 204,
        "reward": -0.11349105834960938,
        "gold_reward": -1.9544601440429688,
        "kl_divergence": 50.467227935791016,
        "mean_generated_length": 218.53125
    },
    {
        "step": 205,
        "reward": 0.3895263671875,
        "gold_reward": -1.7122039794921875,
        "kl_divergence": 45.82936477661133,
        "mean_generated_length": 210.71875
    },
    {
        "step": 206,
        "reward": 0.005126953125,
        "gold_reward": -1.829498291015625,
        "kl_divergence": 53.794132232666016,
        "mean_generated_length": 203.953125
    },
    {
        "step": 207,
        "reward": -0.38486480712890625,
        "gold_reward": -1.9625568389892578,
        "kl_divergence": 54.95743942260742,
        "mean_generated_length": 244.03125
    },
    {
        "step": 208,
        "reward": 0.527752161026001,
        "gold_reward": -1.0639801025390625,
        "kl_divergence": 50.4287109375,
        "mean_generated_length": 188.34375
    },
    {
        "step": 209,
        "reward": -0.2510490417480469,
        "gold_reward": -2.1682891845703125,
        "kl_divergence": 52.419918060302734,
        "mean_generated_length": 231.390625
    },
    {
        "step": 210,
        "reward": 0.00385284423828125,
        "gold_reward": -2.2631072998046875,
        "kl_divergence": 47.94871139526367,
        "mean_generated_length": 216.640625
    },
    {
        "step": 211,
        "reward": -0.49019622802734375,
        "gold_reward": -1.8976631164550781,
        "kl_divergence": 44.23621368408203,
        "mean_generated_length": 212.359375
    },
    {
        "step": 212,
        "reward": 0.5534095764160156,
        "gold_reward": -1.9519500732421875,
        "kl_divergence": 51.73306655883789,
        "mean_generated_length": 224.21875
    },
    {
        "step": 213,
        "reward": 0.6819076538085938,
        "gold_reward": -1.3863248825073242,
        "kl_divergence": 49.195045471191406,
        "mean_generated_length": 223.625
    },
    {
        "step": 214,
        "reward": -0.18286895751953125,
        "gold_reward": -2.0702247619628906,
        "kl_divergence": 48.615478515625,
        "mean_generated_length": 213.953125
    },
    {
        "step": 215,
        "reward": 0.124755859375,
        "gold_reward": -1.6957550048828125,
        "kl_divergence": 50.665618896484375,
        "mean_generated_length": 228.28125
    },
    {
        "step": 216,
        "reward": 0.10132884979248047,
        "gold_reward": -1.5910792350769043,
        "kl_divergence": 46.224327087402344,
        "mean_generated_length": 204.671875
    },
    {
        "step": 217,
        "reward": 0.14131927490234375,
        "gold_reward": -1.6233272552490234,
        "kl_divergence": 51.47059631347656,
        "mean_generated_length": 218.359375
    },
    {
        "step": 218,
        "reward": 0.19496631622314453,
        "gold_reward": -1.2423629760742188,
        "kl_divergence": 48.13006591796875,
        "mean_generated_length": 205.421875
    },
    {
        "step": 219,
        "reward": 0.38767051696777344,
        "gold_reward": -1.837982177734375,
        "kl_divergence": 47.370384216308594,
        "mean_generated_length": 218.890625
    },
    {
        "step": 220,
        "reward": -0.3428611755371094,
        "gold_reward": -2.0443115234375,
        "kl_divergence": 48.36191177368164,
        "mean_generated_length": 225.390625
    },
    {
        "step": 221,
        "reward": -0.5075149536132812,
        "gold_reward": -1.927490234375,
        "kl_divergence": 44.21537399291992,
        "mean_generated_length": 203.15625
    },
    {
        "step": 222,
        "reward": -0.21914291381835938,
        "gold_reward": -1.6122589111328125,
        "kl_divergence": 49.076595306396484,
        "mean_generated_length": 211.3125
    },
    {
        "step": 223,
        "reward": 0.48853302001953125,
        "gold_reward": -1.8802490234375,
        "kl_divergence": 46.381893157958984,
        "mean_generated_length": 224.125
    },
    {
        "step": 224,
        "reward": -0.4014549255371094,
        "gold_reward": -1.91693115234375,
        "kl_divergence": 49.343963623046875,
        "mean_generated_length": 224.0625
    },
    {
        "step": 225,
        "reward": 0.0354769229888916,
        "gold_reward": -2.2229080200195312,
        "kl_divergence": 49.0798225402832,
        "mean_generated_length": 228.765625
    },
    {
        "step": 226,
        "reward": 0.6632003784179688,
        "gold_reward": -1.48919677734375,
        "kl_divergence": 50.28592300415039,
        "mean_generated_length": 210.890625
    },
    {
        "step": 227,
        "reward": -0.32146263122558594,
        "gold_reward": -1.8997917175292969,
        "kl_divergence": 53.20832061767578,
        "mean_generated_length": 222.546875
    },
    {
        "step": 228,
        "reward": 0.4425363540649414,
        "gold_reward": -2.0962862968444824,
        "kl_divergence": 46.25506591796875,
        "mean_generated_length": 223.140625
    },
    {
        "step": 229,
        "reward": -0.3233757019042969,
        "gold_reward": -1.987722396850586,
        "kl_divergence": 47.402198791503906,
        "mean_generated_length": 199.71875
    },
    {
        "step": 230,
        "reward": 0.16159820556640625,
        "gold_reward": -1.6984977722167969,
        "kl_divergence": 46.675933837890625,
        "mean_generated_length": 213.75
    },
    {
        "step": 231,
        "reward": 0.2831459045410156,
        "gold_reward": -1.72314453125,
        "kl_divergence": 46.217811584472656,
        "mean_generated_length": 202.734375
    },
    {
        "step": 232,
        "reward": -0.22560501098632812,
        "gold_reward": -2.3350448608398438,
        "kl_divergence": 45.475433349609375,
        "mean_generated_length": 232.859375
    },
    {
        "step": 233,
        "reward": 0.3861236572265625,
        "gold_reward": -1.8922195434570312,
        "kl_divergence": 49.51545715332031,
        "mean_generated_length": 220.21875
    },
    {
        "step": 234,
        "reward": -0.5226449966430664,
        "gold_reward": -2.218231201171875,
        "kl_divergence": 48.69343185424805,
        "mean_generated_length": 236.359375
    },
    {
        "step": 235,
        "reward": 0.17539691925048828,
        "gold_reward": -1.6758880615234375,
        "kl_divergence": 51.33269119262695,
        "mean_generated_length": 246.765625
    },
    {
        "step": 236,
        "reward": 0.677978515625,
        "gold_reward": -1.5682373046875,
        "kl_divergence": 47.463504791259766,
        "mean_generated_length": 195.5
    }
]