{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9997740696510989,
  "eval_steps": 100,
  "global_step": 1936,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "learning_rate": 2.5773195876288662e-08,
      "logps/chosen": -212.1134033203125,
      "logps/rejected": -166.82583618164062,
      "loss": 0.6931,
      "mask/mask_ratio": 0.4512827694416046,
      "regularization/forward_KL": 0.0,
      "regularization/policy_data_loss": 1.335125207901001,
      "regularization/policy_ref_data_loss_gap": 0.0,
      "regularization/reference_data_loss": 1.335125207901001,
      "regularization/reverse_KL": 0.0,
      "rewards/accuracies": 0.0,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/rejected": 0.0,
      "step": 1,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 16.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.01,
      "learning_rate": 2.577319587628866e-07,
      "logps/chosen": -326.1533508300781,
      "logps/rejected": -244.8445587158203,
      "loss": 0.6929,
      "mask/mask_ratio": 0.5241280794143677,
      "regularization/forward_KL": 0.0001769196824170649,
      "regularization/policy_data_loss": 1.241602897644043,
      "regularization/policy_ref_data_loss_gap": -3.196681791450828e-05,
      "regularization/reference_data_loss": 1.2416348457336426,
      "regularization/reverse_KL": 0.00017688308435026556,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.00025799646391533315,
      "rewards/margins": 0.0005369112477637827,
      "rewards/rejected": -0.00027891475474461913,
      "step": 10,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 144.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.01,
      "learning_rate": 5.154639175257732e-07,
      "logps/chosen": -280.6797790527344,
      "logps/rejected": -227.4728240966797,
      "loss": 0.6931,
      "mask/mask_ratio": 0.4665634036064148,
      "regularization/forward_KL": 0.00019728089682757854,
      "regularization/policy_data_loss": 1.3743268251419067,
      "regularization/policy_ref_data_loss_gap": 1.679910383245442e-05,
      "regularization/reference_data_loss": 1.3743098974227905,
      "regularization/reverse_KL": 0.00019725369929801673,
      "rewards/accuracies": 0.4906249940395355,
      "rewards/chosen": -3.195634781150147e-05,
      "rewards/margins": 2.279781074321363e-05,
      "rewards/rejected": -5.475413490785286e-05,
      "step": 20,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.02,
      "learning_rate": 7.731958762886599e-07,
      "logps/chosen": -289.6597900390625,
      "logps/rejected": -239.7221221923828,
      "loss": 0.6929,
      "mask/mask_ratio": 0.48561492562294006,
      "regularization/forward_KL": 0.00020340974151622504,
      "regularization/policy_data_loss": 1.3848317861557007,
      "regularization/policy_ref_data_loss_gap": 0.00010587237920844927,
      "regularization/reference_data_loss": 1.3847260475158691,
      "regularization/reverse_KL": 0.0002034321951214224,
      "rewards/accuracies": 0.534375011920929,
      "rewards/chosen": 0.0002578829589765519,
      "rewards/margins": 0.0005824099062010646,
      "rewards/rejected": -0.0003245268890168518,
      "step": 30,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.02,
      "learning_rate": 1.0309278350515464e-06,
      "logps/chosen": -297.84466552734375,
      "logps/rejected": -258.29998779296875,
      "loss": 0.6926,
      "mask/mask_ratio": 0.4868837893009186,
      "regularization/forward_KL": 0.00020532039343379438,
      "regularization/policy_data_loss": 1.3027656078338623,
      "regularization/policy_ref_data_loss_gap": 0.0003734443453140557,
      "regularization/reference_data_loss": 1.3023921251296997,
      "regularization/reverse_KL": 0.0002053794014500454,
      "rewards/accuracies": 0.534375011920929,
      "rewards/chosen": 0.0006858176784589887,
      "rewards/margins": 0.0010306112235412002,
      "rewards/rejected": -0.0003447936032898724,
      "step": 40,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.03,
      "learning_rate": 1.288659793814433e-06,
      "logps/chosen": -278.8190002441406,
      "logps/rejected": -218.8843231201172,
      "loss": 0.6923,
      "mask/mask_ratio": 0.4644516408443451,
      "regularization/forward_KL": 0.00022548329434357584,
      "regularization/policy_data_loss": 1.3266862630844116,
      "regularization/policy_ref_data_loss_gap": 0.0006853954982943833,
      "regularization/reference_data_loss": 1.3260008096694946,
      "regularization/reverse_KL": 0.00022541460930369794,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": 0.0007073446176946163,
      "rewards/margins": 0.0016571009764447808,
      "rewards/rejected": -0.0009497563587501645,
      "step": 50,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.03,
      "learning_rate": 1.5463917525773197e-06,
      "logps/chosen": -270.2308654785156,
      "logps/rejected": -231.6517791748047,
      "loss": 0.6919,
      "mask/mask_ratio": 0.4796411097049713,
      "regularization/forward_KL": 0.0002527556789573282,
      "regularization/policy_data_loss": 1.2803740501403809,
      "regularization/policy_ref_data_loss_gap": 0.0008799933129921556,
      "regularization/reference_data_loss": 1.279494047164917,
      "regularization/reverse_KL": 0.00025264121359214187,
      "rewards/accuracies": 0.684374988079071,
      "rewards/chosen": 0.0009641913929954171,
      "rewards/margins": 0.0024960762821137905,
      "rewards/rejected": -0.0015318848891183734,
      "step": 60,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.04,
      "learning_rate": 1.8041237113402063e-06,
      "logps/chosen": -269.181640625,
      "logps/rejected": -229.4257049560547,
      "loss": 0.6919,
      "mask/mask_ratio": 0.4748091697692871,
      "regularization/forward_KL": 0.00031470030080527067,
      "regularization/policy_data_loss": 1.3252737522125244,
      "regularization/policy_ref_data_loss_gap": 0.0016052561113610864,
      "regularization/reference_data_loss": 1.3236685991287231,
      "regularization/reverse_KL": 0.0003143385984003544,
      "rewards/accuracies": 0.606249988079071,
      "rewards/chosen": 0.001416604733094573,
      "rewards/margins": 0.0024156190920621157,
      "rewards/rejected": -0.0009990143589675426,
      "step": 70,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.04,
      "learning_rate": 2.061855670103093e-06,
      "logps/chosen": -295.0655212402344,
      "logps/rejected": -246.9297332763672,
      "loss": 0.6908,
      "mask/mask_ratio": 0.4832943379878998,
      "regularization/forward_KL": 0.00038695387775078416,
      "regularization/policy_data_loss": 1.3166722059249878,
      "regularization/policy_ref_data_loss_gap": 0.0022860420867800713,
      "regularization/reference_data_loss": 1.3143861293792725,
      "regularization/reverse_KL": 0.0003858749405480921,
      "rewards/accuracies": 0.7093750238418579,
      "rewards/chosen": 0.0029345352668315172,
      "rewards/margins": 0.004682451952248812,
      "rewards/rejected": -0.0017479164525866508,
      "step": 80,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.05,
      "learning_rate": 2.3195876288659796e-06,
      "logps/chosen": -287.1355895996094,
      "logps/rejected": -237.8474578857422,
      "loss": 0.6904,
      "mask/mask_ratio": 0.4867839813232422,
      "regularization/forward_KL": 0.0005562350270338356,
      "regularization/policy_data_loss": 1.3840858936309814,
      "regularization/policy_ref_data_loss_gap": 0.004764406476169825,
      "regularization/reference_data_loss": 1.379321575164795,
      "regularization/reverse_KL": 0.0005529638146981597,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": 0.002136844675987959,
      "rewards/margins": 0.005438755266368389,
      "rewards/rejected": -0.0033019105903804302,
      "step": 90,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.05,
      "learning_rate": 2.577319587628866e-06,
      "logps/chosen": -270.9361267089844,
      "logps/rejected": -225.43197631835938,
      "loss": 0.6892,
      "mask/mask_ratio": 0.4581800103187561,
      "regularization/forward_KL": 0.0007667395402677357,
      "regularization/policy_data_loss": 1.4045231342315674,
      "regularization/policy_ref_data_loss_gap": 0.006632406264543533,
      "regularization/reference_data_loss": 1.3978906869888306,
      "regularization/reverse_KL": 0.0007600841927342117,
      "rewards/accuracies": 0.715624988079071,
      "rewards/chosen": 0.0039849793538451195,
      "rewards/margins": 0.007900616154074669,
      "rewards/rejected": -0.003915636334568262,
      "step": 100,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.05,
      "eval_logps/chosen": -274.760498046875,
      "eval_logps/rejected": -233.80404663085938,
      "eval_loss": 0.6880948543548584,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 0.0009262289968319237,
      "eval_regularization/policy_data_loss": 1.3405098915100098,
      "eval_regularization/policy_ref_data_loss_gap": 0.006788152735680342,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.0009145565563812852,
      "eval_rewards/accuracies": 0.7145000100135803,
      "eval_rewards/chosen": 0.0039162905886769295,
      "eval_rewards/margins": 0.010215412825345993,
      "eval_rewards/rejected": -0.006299122702330351,
      "eval_runtime": 679.0408,
      "eval_samples_per_second": 2.945,
      "eval_steps_per_second": 1.473,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 100
    },
    {
      "epoch": 0.06,
      "learning_rate": 2.8350515463917527e-06,
      "logps/chosen": -267.4963684082031,
      "logps/rejected": -238.74264526367188,
      "loss": 0.6878,
      "mask/mask_ratio": 0.4973033368587494,
      "regularization/forward_KL": 0.001136863837018609,
      "regularization/policy_data_loss": 1.3376984596252441,
      "regularization/policy_ref_data_loss_gap": 0.007283567450940609,
      "regularization/reference_data_loss": 1.330414891242981,
      "regularization/reverse_KL": 0.001120448112487793,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": 0.0032392642460763454,
      "rewards/margins": 0.010863055475056171,
      "rewards/rejected": -0.007623790297657251,
      "step": 110,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.06,
      "learning_rate": 3.0927835051546395e-06,
      "logps/chosen": -266.2042541503906,
      "logps/rejected": -250.436279296875,
      "loss": 0.6847,
      "mask/mask_ratio": 0.4924188256263733,
      "regularization/forward_KL": 0.0017974560614675283,
      "regularization/policy_data_loss": 1.319218397140503,
      "regularization/policy_ref_data_loss_gap": 0.011516178026795387,
      "regularization/reference_data_loss": 1.3077023029327393,
      "regularization/reverse_KL": 0.0017597066471353173,
      "rewards/accuracies": 0.721875011920929,
      "rewards/chosen": 0.005417727865278721,
      "rewards/margins": 0.01713070645928383,
      "rewards/rejected": -0.011712977662682533,
      "step": 120,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.07,
      "learning_rate": 3.350515463917526e-06,
      "logps/chosen": -298.1942443847656,
      "logps/rejected": -260.88787841796875,
      "loss": 0.6825,
      "mask/mask_ratio": 0.49585145711898804,
      "regularization/forward_KL": 0.003197314217686653,
      "regularization/policy_data_loss": 1.3534491062164307,
      "regularization/policy_ref_data_loss_gap": 0.01773560419678688,
      "regularization/reference_data_loss": 1.3357136249542236,
      "regularization/reverse_KL": 0.0030930046923458576,
      "rewards/accuracies": 0.715624988079071,
      "rewards/chosen": 0.008045461028814316,
      "rewards/margins": 0.021784009411931038,
      "rewards/rejected": -0.013738548383116722,
      "step": 130,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.07,
      "learning_rate": 3.6082474226804126e-06,
      "logps/chosen": -304.853759765625,
      "logps/rejected": -263.5474548339844,
      "loss": 0.6788,
      "mask/mask_ratio": 0.5004404783248901,
      "regularization/forward_KL": 0.005552223883569241,
      "regularization/policy_data_loss": 1.302833080291748,
      "regularization/policy_ref_data_loss_gap": 0.02031988836824894,
      "regularization/reference_data_loss": 1.28251314163208,
      "regularization/reverse_KL": 0.005343655589967966,
      "rewards/accuracies": 0.734375,
      "rewards/chosen": 0.007048692554235458,
      "rewards/margins": 0.029550602659583092,
      "rewards/rejected": -0.022501910105347633,
      "step": 140,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.08,
      "learning_rate": 3.865979381443299e-06,
      "logps/chosen": -271.67205810546875,
      "logps/rejected": -241.82235717773438,
      "loss": 0.6744,
      "mask/mask_ratio": 0.4779718518257141,
      "regularization/forward_KL": 0.01054485235363245,
      "regularization/policy_data_loss": 1.3759247064590454,
      "regularization/policy_ref_data_loss_gap": 0.03953505679965019,
      "regularization/reference_data_loss": 1.3363895416259766,
      "regularization/reverse_KL": 0.009854511357843876,
      "rewards/accuracies": 0.7437499761581421,
      "rewards/chosen": 0.005561177618801594,
      "rewards/margins": 0.0392768494784832,
      "rewards/rejected": -0.03371566906571388,
      "step": 150,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.08,
      "learning_rate": 4.123711340206186e-06,
      "logps/chosen": -301.7065124511719,
      "logps/rejected": -253.6079864501953,
      "loss": 0.6667,
      "mask/mask_ratio": 0.49007949233055115,
      "regularization/forward_KL": 0.017336122691631317,
      "regularization/policy_data_loss": 1.3653223514556885,
      "regularization/policy_ref_data_loss_gap": 0.044966112822294235,
      "regularization/reference_data_loss": 1.3203563690185547,
      "regularization/reverse_KL": 0.015951037406921387,
      "rewards/accuracies": 0.7437499761581421,
      "rewards/chosen": 0.0057354886084795,
      "rewards/margins": 0.05589023977518082,
      "rewards/rejected": -0.05015474557876587,
      "step": 160,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.09,
      "learning_rate": 4.381443298969073e-06,
      "logps/chosen": -282.05084228515625,
      "logps/rejected": -228.6247100830078,
      "loss": 0.6588,
      "mask/mask_ratio": 0.474916934967041,
      "regularization/forward_KL": 0.03297495096921921,
      "regularization/policy_data_loss": 1.4297138452529907,
      "regularization/policy_ref_data_loss_gap": 0.09214094281196594,
      "regularization/reference_data_loss": 1.3375728130340576,
      "regularization/reverse_KL": 0.029324505478143692,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": -0.005311681888997555,
      "rewards/margins": 0.07334191352128983,
      "rewards/rejected": -0.07865358889102936,
      "step": 170,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.09,
      "learning_rate": 4.639175257731959e-06,
      "logps/chosen": -255.3544464111328,
      "logps/rejected": -215.35073852539062,
      "loss": 0.6493,
      "mask/mask_ratio": 0.4415830969810486,
      "regularization/forward_KL": 0.05764538049697876,
      "regularization/policy_data_loss": 1.5159928798675537,
      "regularization/policy_ref_data_loss_gap": 0.1430792510509491,
      "regularization/reference_data_loss": 1.3729135990142822,
      "regularization/reverse_KL": 0.04929365590214729,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.024316953495144844,
      "rewards/margins": 0.0962405651807785,
      "rewards/rejected": -0.120557501912117,
      "step": 180,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.1,
      "learning_rate": 4.8969072164948455e-06,
      "logps/chosen": -297.7956237792969,
      "logps/rejected": -251.384521484375,
      "loss": 0.645,
      "mask/mask_ratio": 0.4943568706512451,
      "regularization/forward_KL": 0.08729816228151321,
      "regularization/policy_data_loss": 1.475376844406128,
      "regularization/policy_ref_data_loss_gap": 0.16268345713615417,
      "regularization/reference_data_loss": 1.3126932382583618,
      "regularization/reverse_KL": 0.07220248132944107,
      "rewards/accuracies": 0.734375,
      "rewards/chosen": -0.05386154726147652,
      "rewards/margins": 0.11115912348031998,
      "rewards/rejected": -0.1650206744670868,
      "step": 190,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.1,
      "learning_rate": 4.999853643599349e-06,
      "logps/chosen": -283.1971740722656,
      "logps/rejected": -243.7222900390625,
      "loss": 0.6259,
      "mask/mask_ratio": 0.45931917428970337,
      "regularization/forward_KL": 0.14040490984916687,
      "regularization/policy_data_loss": 1.6028587818145752,
      "regularization/policy_ref_data_loss_gap": 0.2657029628753662,
      "regularization/reference_data_loss": 1.3371559381484985,
      "regularization/reverse_KL": 0.10716424137353897,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -0.08315013349056244,
      "rewards/margins": 0.15731294453144073,
      "rewards/rejected": -0.24046309292316437,
      "step": 200,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.1,
      "eval_logps/chosen": -287.937255859375,
      "eval_logps/rejected": -262.2265625,
      "eval_loss": 0.6257880926132202,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 0.17268812656402588,
      "eval_regularization/policy_data_loss": 1.6331102848052979,
      "eval_regularization/policy_ref_data_loss_gap": 0.2993886470794678,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.12886792421340942,
      "eval_rewards/accuracies": 0.7145000100135803,
      "eval_rewards/chosen": -0.1278514266014099,
      "eval_rewards/margins": 0.16267289221286774,
      "eval_rewards/rejected": -0.29052433371543884,
      "eval_runtime": 679.2062,
      "eval_samples_per_second": 2.945,
      "eval_steps_per_second": 1.472,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 200
    },
    {
      "epoch": 0.11,
      "learning_rate": 4.998959305429261e-06,
      "logps/chosen": -282.1123046875,
      "logps/rejected": -287.3293762207031,
      "loss": 0.6302,
      "mask/mask_ratio": 0.4907340407371521,
      "regularization/forward_KL": 0.18937243521213531,
      "regularization/policy_data_loss": 1.5821033716201782,
      "regularization/policy_ref_data_loss_gap": 0.2877712845802307,
      "regularization/reference_data_loss": 1.2943320274353027,
      "regularization/reverse_KL": 0.14068856835365295,
      "rewards/accuracies": 0.703125,
      "rewards/chosen": -0.17428387701511383,
      "rewards/margins": 0.161749929189682,
      "rewards/rejected": -0.33603379130363464,
      "step": 210,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.11,
      "learning_rate": 4.997252228714279e-06,
      "logps/chosen": -304.8265686035156,
      "logps/rejected": -274.01947021484375,
      "loss": 0.6055,
      "mask/mask_ratio": 0.4929993748664856,
      "regularization/forward_KL": 0.26238906383514404,
      "regularization/policy_data_loss": 1.6742712259292603,
      "regularization/policy_ref_data_loss_gap": 0.4014686644077301,
      "regularization/reference_data_loss": 1.2728025913238525,
      "regularization/reverse_KL": 0.1850946992635727,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": -0.2259730100631714,
      "rewards/margins": 0.22483393549919128,
      "rewards/rejected": -0.4508069157600403,
      "step": 220,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.12,
      "learning_rate": 4.994732968648336e-06,
      "logps/chosen": -298.37725830078125,
      "logps/rejected": -290.5216979980469,
      "loss": 0.5937,
      "mask/mask_ratio": 0.4748677611351013,
      "regularization/forward_KL": 0.3333453834056854,
      "regularization/policy_data_loss": 1.8790830373764038,
      "regularization/policy_ref_data_loss_gap": 0.5127241015434265,
      "regularization/reference_data_loss": 1.3663588762283325,
      "regularization/reverse_KL": 0.2211645543575287,
      "rewards/accuracies": 0.7406250238418579,
      "rewards/chosen": -0.26156681776046753,
      "rewards/margins": 0.27300310134887695,
      "rewards/rejected": -0.5345699191093445,
      "step": 230,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.12,
      "learning_rate": 4.991402344572409e-06,
      "logps/chosen": -326.3783264160156,
      "logps/rejected": -291.57733154296875,
      "loss": 0.5943,
      "mask/mask_ratio": 0.4752369821071625,
      "regularization/forward_KL": 0.40553778409957886,
      "regularization/policy_data_loss": 1.9021514654159546,
      "regularization/policy_ref_data_loss_gap": 0.5666393041610718,
      "regularization/reference_data_loss": 1.3355120420455933,
      "regularization/reverse_KL": 0.26898378133773804,
      "rewards/accuracies": 0.734375,
      "rewards/chosen": -0.34149032831192017,
      "rewards/margins": 0.28985053300857544,
      "rewards/rejected": -0.6313409209251404,
      "step": 240,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.13,
      "learning_rate": 4.987261439708047e-06,
      "logps/chosen": -311.4671936035156,
      "logps/rejected": -313.52081298828125,
      "loss": 0.605,
      "mask/mask_ratio": 0.4988730549812317,
      "regularization/forward_KL": 0.43238434195518494,
      "regularization/policy_data_loss": 1.9205989837646484,
      "regularization/policy_ref_data_loss_gap": 0.5820193886756897,
      "regularization/reference_data_loss": 1.3385794162750244,
      "regularization/reverse_KL": 0.2932417392730713,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -0.389052152633667,
      "rewards/margins": 0.2913573384284973,
      "rewards/rejected": -0.6804095506668091,
      "step": 250,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.13,
      "learning_rate": 4.982311600805066e-06,
      "logps/chosen": -342.9245910644531,
      "logps/rejected": -307.47979736328125,
      "loss": 0.5942,
      "mask/mask_ratio": 0.4794413447380066,
      "regularization/forward_KL": 0.45548295974731445,
      "regularization/policy_data_loss": 1.9704278707504272,
      "regularization/policy_ref_data_loss_gap": 0.6466713547706604,
      "regularization/reference_data_loss": 1.3237565755844116,
      "regularization/reverse_KL": 0.3007846176624298,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -0.3935951292514801,
      "rewards/margins": 0.314331591129303,
      "rewards/rejected": -0.7079266905784607,
      "step": 260,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.14,
      "learning_rate": 4.976554437703559e-06,
      "logps/chosen": -308.3914489746094,
      "logps/rejected": -292.6021423339844,
      "loss": 0.579,
      "mask/mask_ratio": 0.4852737784385681,
      "regularization/forward_KL": 0.45756810903549194,
      "regularization/policy_data_loss": 1.8738495111465454,
      "regularization/policy_ref_data_loss_gap": 0.622580349445343,
      "regularization/reference_data_loss": 1.2512691020965576,
      "regularization/reverse_KL": 0.2927935719490051,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -0.35448208451271057,
      "rewards/margins": 0.34564077854156494,
      "rewards/rejected": -0.7001228928565979,
      "step": 270,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.14,
      "learning_rate": 4.969991822810307e-06,
      "logps/chosen": -309.0791931152344,
      "logps/rejected": -301.0000305175781,
      "loss": 0.5727,
      "mask/mask_ratio": 0.4700326919555664,
      "regularization/forward_KL": 0.49474477767944336,
      "regularization/policy_data_loss": 2.048013210296631,
      "regularization/policy_ref_data_loss_gap": 0.6938953399658203,
      "regularization/reference_data_loss": 1.3541176319122314,
      "regularization/reverse_KL": 0.3161839544773102,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.33057111501693726,
      "rewards/margins": 0.3817325234413147,
      "rewards/rejected": -0.7123036980628967,
      "step": 280,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.15,
      "learning_rate": 4.962625890489834e-06,
      "logps/chosen": -319.30169677734375,
      "logps/rejected": -312.2832946777344,
      "loss": 0.5699,
      "mask/mask_ratio": 0.49438172578811646,
      "regularization/forward_KL": 0.5500718355178833,
      "regularization/policy_data_loss": 2.1421239376068115,
      "regularization/policy_ref_data_loss_gap": 0.8315132856369019,
      "regularization/reference_data_loss": 1.3106107711791992,
      "regularization/reverse_KL": 0.32790082693099976,
      "rewards/accuracies": 0.7437499761581421,
      "rewards/chosen": -0.42027369141578674,
      "rewards/margins": 0.37420108914375305,
      "rewards/rejected": -0.794474720954895,
      "step": 290,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.15,
      "learning_rate": 4.954459036370232e-06,
      "logps/chosen": -289.519287109375,
      "logps/rejected": -324.66534423828125,
      "loss": 0.5436,
      "mask/mask_ratio": 0.47517338395118713,
      "regularization/forward_KL": 0.6539136171340942,
      "regularization/policy_data_loss": 2.2663440704345703,
      "regularization/policy_ref_data_loss_gap": 0.9426689147949219,
      "regularization/reference_data_loss": 1.3236749172210693,
      "regularization/reverse_KL": 0.36934933066368103,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -0.4274187684059143,
      "rewards/margins": 0.46259841322898865,
      "rewards/rejected": -0.8900171518325806,
      "step": 300,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.15,
      "eval_logps/chosen": -322.51251220703125,
      "eval_logps/rejected": -327.1224060058594,
      "eval_loss": 0.549545168876648,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 0.6903554201126099,
      "eval_regularization/policy_data_loss": 2.2939770221710205,
      "eval_regularization/policy_ref_data_loss_gap": 0.9602554440498352,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.3994987905025482,
      "eval_rewards/accuracies": 0.7415000200271606,
      "eval_rewards/chosen": -0.47360387444496155,
      "eval_rewards/margins": 0.4658789336681366,
      "eval_rewards/rejected": -0.9394828677177429,
      "eval_runtime": 681.196,
      "eval_samples_per_second": 2.936,
      "eval_steps_per_second": 1.468,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 300
    },
    {
      "epoch": 0.16,
      "learning_rate": 4.945493916564034e-06,
      "logps/chosen": -316.70281982421875,
      "logps/rejected": -315.42303466796875,
      "loss": 0.5617,
      "mask/mask_ratio": 0.4630749225616455,
      "regularization/forward_KL": 0.728979229927063,
      "regularization/policy_data_loss": 2.2659270763397217,
      "regularization/policy_ref_data_loss_gap": 0.937238335609436,
      "regularization/reference_data_loss": 1.3286888599395752,
      "regularization/reverse_KL": 0.4182661473751068,
      "rewards/accuracies": 0.7093750238418579,
      "rewards/chosen": -0.481964111328125,
      "rewards/margins": 0.4231549799442291,
      "rewards/rejected": -0.9051190614700317,
      "step": 310,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.17,
      "learning_rate": 4.9357334468043675e-06,
      "logps/chosen": -350.0586853027344,
      "logps/rejected": -328.50799560546875,
      "loss": 0.5355,
      "mask/mask_ratio": 0.4867240786552429,
      "regularization/forward_KL": 0.799272894859314,
      "regularization/policy_data_loss": 2.3319547176361084,
      "regularization/policy_ref_data_loss_gap": 1.0592420101165771,
      "regularization/reference_data_loss": 1.2727129459381104,
      "regularization/reverse_KL": 0.42697662115097046,
      "rewards/accuracies": 0.768750011920929,
      "rewards/chosen": -0.5991016626358032,
      "rewards/margins": 0.5477563142776489,
      "rewards/rejected": -1.1468579769134521,
      "step": 320,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.17,
      "learning_rate": 4.92518080149666e-06,
      "logps/chosen": -323.0548095703125,
      "logps/rejected": -335.94873046875,
      "loss": 0.5237,
      "mask/mask_ratio": 0.4889696538448334,
      "regularization/forward_KL": 0.8510375022888184,
      "regularization/policy_data_loss": 2.433258533477783,
      "regularization/policy_ref_data_loss_gap": 1.1440953016281128,
      "regularization/reference_data_loss": 1.2891628742218018,
      "regularization/reverse_KL": 0.44076618552207947,
      "rewards/accuracies": 0.746874988079071,
      "rewards/chosen": -0.60528564453125,
      "rewards/margins": 0.5634106397628784,
      "rewards/rejected": -1.168696403503418,
      "step": 330,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.18,
      "learning_rate": 4.913839412686238e-06,
      "logps/chosen": -352.9324645996094,
      "logps/rejected": -370.67041015625,
      "loss": 0.5447,
      "mask/mask_ratio": 0.5029616355895996,
      "regularization/forward_KL": 0.8693239092826843,
      "regularization/policy_data_loss": 2.5582780838012695,
      "regularization/policy_ref_data_loss_gap": 1.2380913496017456,
      "regularization/reference_data_loss": 1.3201866149902344,
      "regularization/reverse_KL": 0.4576171338558197,
      "rewards/accuracies": 0.7281249761581421,
      "rewards/chosen": -0.6309934854507446,
      "rewards/margins": 0.6106182336807251,
      "rewards/rejected": -1.2416117191314697,
      "step": 340,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.18,
      "learning_rate": 4.901712968942101e-06,
      "logps/chosen": -336.093505859375,
      "logps/rejected": -360.5519104003906,
      "loss": 0.5329,
      "mask/mask_ratio": 0.47186803817749023,
      "regularization/forward_KL": 0.9556936025619507,
      "regularization/policy_data_loss": 2.735114574432373,
      "regularization/policy_ref_data_loss_gap": 1.3816810846328735,
      "regularization/reference_data_loss": 1.3534337282180786,
      "regularization/reverse_KL": 0.489070326089859,
      "rewards/accuracies": 0.746874988079071,
      "rewards/chosen": -0.621803879737854,
      "rewards/margins": 0.582781195640564,
      "rewards/rejected": -1.204585075378418,
      "step": 350,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.19,
      "learning_rate": 4.888805414157304e-06,
      "logps/chosen": -338.7613220214844,
      "logps/rejected": -345.40423583984375,
      "loss": 0.5195,
      "mask/mask_ratio": 0.4770136773586273,
      "regularization/forward_KL": 0.9781936407089233,
      "regularization/policy_data_loss": 2.580606698989868,
      "regularization/policy_ref_data_loss_gap": 1.2824639081954956,
      "regularization/reference_data_loss": 1.298142671585083,
      "regularization/reverse_KL": 0.5209106206893921,
      "rewards/accuracies": 0.784375011920929,
      "rewards/chosen": -0.5665292143821716,
      "rewards/margins": 0.6311505436897278,
      "rewards/rejected": -1.1976797580718994,
      "step": 360,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.19,
      "learning_rate": 4.875120946266272e-06,
      "logps/chosen": -346.84576416015625,
      "logps/rejected": -362.7752990722656,
      "loss": 0.5097,
      "mask/mask_ratio": 0.4865848422050476,
      "regularization/forward_KL": 0.9661129117012024,
      "regularization/policy_data_loss": 2.6348929405212402,
      "regularization/policy_ref_data_loss_gap": 1.3101383447647095,
      "regularization/reference_data_loss": 1.3247545957565308,
      "regularization/reverse_KL": 0.4974105954170227,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -0.5897595882415771,
      "rewards/margins": 0.6567105054855347,
      "rewards/rejected": -1.2464700937271118,
      "step": 370,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.2,
      "learning_rate": 4.8606640158795034e-06,
      "logps/chosen": -343.9723815917969,
      "logps/rejected": -364.3437194824219,
      "loss": 0.5535,
      "mask/mask_ratio": 0.478668212890625,
      "regularization/forward_KL": 0.9982131123542786,
      "regularization/policy_data_loss": 2.7511227130889893,
      "regularization/policy_ref_data_loss_gap": 1.4281729459762573,
      "regularization/reference_data_loss": 1.322949767112732,
      "regularization/reverse_KL": 0.5359824895858765,
      "rewards/accuracies": 0.71875,
      "rewards/chosen": -0.6767465472221375,
      "rewards/margins": 0.5853181481361389,
      "rewards/rejected": -1.2620646953582764,
      "step": 380,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.2,
      "learning_rate": 4.845439324836097e-06,
      "logps/chosen": -327.82135009765625,
      "logps/rejected": -353.2066345214844,
      "loss": 0.4909,
      "mask/mask_ratio": 0.47248950600624084,
      "regularization/forward_KL": 0.9170303344726562,
      "regularization/policy_data_loss": 2.356508731842041,
      "regularization/policy_ref_data_loss_gap": 1.049116849899292,
      "regularization/reference_data_loss": 1.3073920011520386,
      "regularization/reverse_KL": 0.49607810378074646,
      "rewards/accuracies": 0.765625,
      "rewards/chosen": -0.5266287326812744,
      "rewards/margins": 0.683322548866272,
      "rewards/rejected": -1.2099512815475464,
      "step": 390,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.21,
      "learning_rate": 4.829451824674565e-06,
      "logps/chosen": -330.2622985839844,
      "logps/rejected": -348.4248962402344,
      "loss": 0.5492,
      "mask/mask_ratio": 0.4730641841888428,
      "regularization/forward_KL": 1.0259394645690918,
      "regularization/policy_data_loss": 2.765868663787842,
      "regularization/policy_ref_data_loss_gap": 1.4018539190292358,
      "regularization/reference_data_loss": 1.364014983177185,
      "regularization/reverse_KL": 0.5556864738464355,
      "rewards/accuracies": 0.706250011920929,
      "rewards/chosen": -0.5864711999893188,
      "rewards/margins": 0.5495952367782593,
      "rewards/rejected": -1.1360664367675781,
      "step": 400,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.21,
      "eval_logps/chosen": -332.980712890625,
      "eval_logps/rejected": -353.32232666015625,
      "eval_loss": 0.5161077976226807,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 0.9794394373893738,
      "eval_regularization/policy_data_loss": 2.7573928833007812,
      "eval_regularization/policy_ref_data_loss_gap": 1.423671007156372,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.5145657062530518,
      "eval_rewards/accuracies": 0.7544999718666077,
      "eval_rewards/chosen": -0.5782856345176697,
      "eval_rewards/margins": 0.6231963634490967,
      "eval_rewards/rejected": -1.2014819383621216,
      "eval_runtime": 681.3898,
      "eval_samples_per_second": 2.935,
      "eval_steps_per_second": 1.468,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 400
    },
    {
      "epoch": 0.21,
      "learning_rate": 4.812706715022445e-06,
      "logps/chosen": -332.74517822265625,
      "logps/rejected": -341.34893798828125,
      "loss": 0.5075,
      "mask/mask_ratio": 0.47742366790771484,
      "regularization/forward_KL": 0.9588286280632019,
      "regularization/policy_data_loss": 2.6995902061462402,
      "regularization/policy_ref_data_loss_gap": 1.4055753946304321,
      "regularization/reference_data_loss": 1.2940146923065186,
      "regularization/reverse_KL": 0.4842945635318756,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.543470025062561,
      "rewards/margins": 0.6269677877426147,
      "rewards/rejected": -1.1704375743865967,
      "step": 410,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.22,
      "learning_rate": 4.7952094419052174e-06,
      "logps/chosen": -346.64385986328125,
      "logps/rejected": -342.3882751464844,
      "loss": 0.5111,
      "mask/mask_ratio": 0.4781650900840759,
      "regularization/forward_KL": 0.9628578424453735,
      "regularization/policy_data_loss": 2.6919660568237305,
      "regularization/policy_ref_data_loss_gap": 1.4117763042449951,
      "regularization/reference_data_loss": 1.280190110206604,
      "regularization/reverse_KL": 0.5150149464607239,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": -0.6061595678329468,
      "rewards/margins": 0.6440192461013794,
      "rewards/rejected": -1.2501788139343262,
      "step": 420,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.22,
      "learning_rate": 4.776965695975092e-06,
      "logps/chosen": -325.66461181640625,
      "logps/rejected": -355.48846435546875,
      "loss": 0.4999,
      "mask/mask_ratio": 0.4640035033226013,
      "regularization/forward_KL": 1.2023184299468994,
      "regularization/policy_data_loss": 3.3096261024475098,
      "regularization/policy_ref_data_loss_gap": 1.9992198944091797,
      "regularization/reference_data_loss": 1.3104063272476196,
      "regularization/reverse_KL": 0.5834243297576904,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.6900832653045654,
      "rewards/margins": 0.7149588465690613,
      "rewards/rejected": -1.4050421714782715,
      "step": 430,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.23,
      "learning_rate": 4.757981410660232e-06,
      "logps/chosen": -348.6309814453125,
      "logps/rejected": -372.73992919921875,
      "loss": 0.4997,
      "mask/mask_ratio": 0.49740973114967346,
      "regularization/forward_KL": 1.043830156326294,
      "regularization/policy_data_loss": 2.916534900665283,
      "regularization/policy_ref_data_loss_gap": 1.591304898262024,
      "regularization/reference_data_loss": 1.3252300024032593,
      "regularization/reverse_KL": 0.5300347805023193,
      "rewards/accuracies": 0.784375011920929,
      "rewards/chosen": -0.6902838945388794,
      "rewards/margins": 0.7124063372612,
      "rewards/rejected": -1.4026902914047241,
      "step": 440,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.23,
      "learning_rate": 4.73826276023502e-06,
      "logps/chosen": -328.333984375,
      "logps/rejected": -356.65277099609375,
      "loss": 0.518,
      "mask/mask_ratio": 0.48409780859947205,
      "regularization/forward_KL": 1.035197377204895,
      "regularization/policy_data_loss": 2.8669793605804443,
      "regularization/policy_ref_data_loss_gap": 1.520315408706665,
      "regularization/reference_data_loss": 1.3466639518737793,
      "regularization/reverse_KL": 0.5027146935462952,
      "rewards/accuracies": 0.7406250238418579,
      "rewards/chosen": -0.5930204391479492,
      "rewards/margins": 0.6177513003349304,
      "rewards/rejected": -1.2107717990875244,
      "step": 450,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.24,
      "learning_rate": 4.717816157811993e-06,
      "logps/chosen": -304.90118408203125,
      "logps/rejected": -319.99493408203125,
      "loss": 0.5422,
      "mask/mask_ratio": 0.45479053258895874,
      "regularization/forward_KL": 1.0553574562072754,
      "regularization/policy_data_loss": 2.755803108215332,
      "regularization/policy_ref_data_loss_gap": 1.4146441221237183,
      "regularization/reference_data_loss": 1.3411591053009033,
      "regularization/reverse_KL": 0.5408639907836914,
      "rewards/accuracies": 0.721875011920929,
      "rewards/chosen": -0.5867568254470825,
      "rewards/margins": 0.5797747373580933,
      "rewards/rejected": -1.1665315628051758,
      "step": 460,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.24,
      "learning_rate": 4.6966482532561035e-06,
      "logps/chosen": -347.4236755371094,
      "logps/rejected": -336.7795715332031,
      "loss": 0.4944,
      "mask/mask_ratio": 0.482670396566391,
      "regularization/forward_KL": 1.026317834854126,
      "regularization/policy_data_loss": 2.8399059772491455,
      "regularization/policy_ref_data_loss_gap": 1.497495412826538,
      "regularization/reference_data_loss": 1.3424103260040283,
      "regularization/reverse_KL": 0.49248355627059937,
      "rewards/accuracies": 0.8062499761581421,
      "rewards/chosen": -0.552312970161438,
      "rewards/margins": 0.6807326674461365,
      "rewards/rejected": -1.2330455780029297,
      "step": 470,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.25,
      "learning_rate": 4.674765931021976e-06,
      "logps/chosen": -320.6754455566406,
      "logps/rejected": -388.69427490234375,
      "loss": 0.4839,
      "mask/mask_ratio": 0.4728547930717468,
      "regularization/forward_KL": 1.1917495727539062,
      "regularization/policy_data_loss": 3.298811435699463,
      "regularization/policy_ref_data_loss_gap": 1.9288368225097656,
      "regularization/reference_data_loss": 1.3699743747711182,
      "regularization/reverse_KL": 0.5374084711074829,
      "rewards/accuracies": 0.768750011920929,
      "rewards/chosen": -0.6181408762931824,
      "rewards/margins": 0.7905126810073853,
      "rewards/rejected": -1.4086534976959229,
      "step": 480,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.25,
      "learning_rate": 4.652176307914872e-06,
      "logps/chosen": -333.15765380859375,
      "logps/rejected": -369.8273620605469,
      "loss": 0.4802,
      "mask/mask_ratio": 0.458822101354599,
      "regularization/forward_KL": 1.169818639755249,
      "regularization/policy_data_loss": 3.1087582111358643,
      "regularization/policy_ref_data_loss_gap": 1.7305198907852173,
      "regularization/reference_data_loss": 1.3782384395599365,
      "regularization/reverse_KL": 0.5508654117584229,
      "rewards/accuracies": 0.7906249761581421,
      "rewards/chosen": -0.6308334469795227,
      "rewards/margins": 0.7607764005661011,
      "rewards/rejected": -1.3916099071502686,
      "step": 490,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.26,
      "learning_rate": 4.628886730776084e-06,
      "logps/chosen": -323.81951904296875,
      "logps/rejected": -369.5110778808594,
      "loss": 0.521,
      "mask/mask_ratio": 0.467355340719223,
      "regularization/forward_KL": 1.2495887279510498,
      "regularization/policy_data_loss": 2.996243953704834,
      "regularization/policy_ref_data_loss_gap": 1.66107976436615,
      "regularization/reference_data_loss": 1.3351640701293945,
      "regularization/reverse_KL": 0.5917181968688965,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -0.7246882319450378,
      "rewards/margins": 0.7109832763671875,
      "rewards/rejected": -1.4356714487075806,
      "step": 500,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.26,
      "eval_logps/chosen": -347.7219543457031,
      "eval_logps/rejected": -383.1716003417969,
      "eval_loss": 0.4981560707092285,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 1.2015578746795654,
      "eval_regularization/policy_data_loss": 3.0006484985351562,
      "eval_regularization/policy_ref_data_loss_gap": 1.6669267416000366,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.5622037649154663,
      "eval_rewards/accuracies": 0.7595000267028809,
      "eval_rewards/chosen": -0.7256983518600464,
      "eval_rewards/margins": 0.7742762565612793,
      "eval_rewards/rejected": -1.4999746084213257,
      "eval_runtime": 678.102,
      "eval_samples_per_second": 2.949,
      "eval_steps_per_second": 1.475,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 500
    },
    {
      "epoch": 0.26,
      "learning_rate": 4.604904774093517e-06,
      "logps/chosen": -356.4052734375,
      "logps/rejected": -378.45367431640625,
      "loss": 0.5292,
      "mask/mask_ratio": 0.47553128004074097,
      "regularization/forward_KL": 1.2459790706634521,
      "regularization/policy_data_loss": 2.91737699508667,
      "regularization/policy_ref_data_loss_gap": 1.5836849212646484,
      "regularization/reference_data_loss": 1.333691954612732,
      "regularization/reverse_KL": 0.5698737502098083,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -0.7802181839942932,
      "rewards/margins": 0.688677191734314,
      "rewards/rejected": -1.4688953161239624,
      "step": 510,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.27,
      "learning_rate": 4.580238237538232e-06,
      "logps/chosen": -355.7960205078125,
      "logps/rejected": -362.36859130859375,
      "loss": 0.535,
      "mask/mask_ratio": 0.47554054856300354,
      "regularization/forward_KL": 1.2537510395050049,
      "regularization/policy_data_loss": 3.1183278560638428,
      "regularization/policy_ref_data_loss_gap": 1.7516872882843018,
      "regularization/reference_data_loss": 1.3666408061981201,
      "regularization/reverse_KL": 0.6034277081489563,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": -0.8022186160087585,
      "rewards/margins": 0.6797652840614319,
      "rewards/rejected": -1.4819839000701904,
      "step": 520,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.27,
      "learning_rate": 4.554895143427754e-06,
      "logps/chosen": -340.38836669921875,
      "logps/rejected": -367.66558837890625,
      "loss": 0.5123,
      "mask/mask_ratio": 0.47033509612083435,
      "regularization/forward_KL": 1.1848121881484985,
      "regularization/policy_data_loss": 2.921915054321289,
      "regularization/policy_ref_data_loss_gap": 1.594711184501648,
      "regularization/reference_data_loss": 1.327203631401062,
      "regularization/reverse_KL": 0.5440836548805237,
      "rewards/accuracies": 0.7406250238418579,
      "rewards/chosen": -0.7266508936882019,
      "rewards/margins": 0.6950263977050781,
      "rewards/rejected": -1.4216772317886353,
      "step": 530,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.28,
      "learning_rate": 4.528883734116963e-06,
      "logps/chosen": -335.19384765625,
      "logps/rejected": -375.2444152832031,
      "loss": 0.5333,
      "mask/mask_ratio": 0.47666770219802856,
      "regularization/forward_KL": 1.0705522298812866,
      "regularization/policy_data_loss": 3.119077444076538,
      "regularization/policy_ref_data_loss_gap": 1.7445042133331299,
      "regularization/reference_data_loss": 1.3745734691619873,
      "regularization/reverse_KL": 0.5051863193511963,
      "rewards/accuracies": 0.7562500238418579,
      "rewards/chosen": -0.6640281677246094,
      "rewards/margins": 0.6933831572532654,
      "rewards/rejected": -1.357411503791809,
      "step": 540,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.28,
      "learning_rate": 4.502212469317433e-06,
      "logps/chosen": -321.9615478515625,
      "logps/rejected": -349.9061584472656,
      "loss": 0.4983,
      "mask/mask_ratio": 0.4487527310848236,
      "regularization/forward_KL": 1.1229972839355469,
      "regularization/policy_data_loss": 2.9346470832824707,
      "regularization/policy_ref_data_loss_gap": 1.6309387683868408,
      "regularization/reference_data_loss": 1.3037080764770508,
      "regularization/reverse_KL": 0.5186377763748169,
      "rewards/accuracies": 0.7406250238418579,
      "rewards/chosen": -0.5824334621429443,
      "rewards/margins": 0.7225069403648376,
      "rewards/rejected": -1.3049404621124268,
      "step": 550,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.29,
      "learning_rate": 4.474890023346066e-06,
      "logps/chosen": -347.91571044921875,
      "logps/rejected": -368.62750244140625,
      "loss": 0.512,
      "mask/mask_ratio": 0.48511743545532227,
      "regularization/forward_KL": 0.9346854090690613,
      "regularization/policy_data_loss": 2.6233322620391846,
      "regularization/policy_ref_data_loss_gap": 1.3477128744125366,
      "regularization/reference_data_loss": 1.2756195068359375,
      "regularization/reverse_KL": 0.4565669596195221,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -0.59528648853302,
      "rewards/margins": 0.6829525232315063,
      "rewards/rejected": -1.2782390117645264,
      "step": 560,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.29,
      "learning_rate": 4.446925282303942e-06,
      "logps/chosen": -341.92926025390625,
      "logps/rejected": -356.52520751953125,
      "loss": 0.5248,
      "mask/mask_ratio": 0.49277886748313904,
      "regularization/forward_KL": 0.975311279296875,
      "regularization/policy_data_loss": 2.7457008361816406,
      "regularization/policy_ref_data_loss_gap": 1.4526615142822266,
      "regularization/reference_data_loss": 1.293039321899414,
      "regularization/reverse_KL": 0.4731406569480896,
      "rewards/accuracies": 0.7562500238418579,
      "rewards/chosen": -0.5710417032241821,
      "rewards/margins": 0.6229602694511414,
      "rewards/rejected": -1.1940020322799683,
      "step": 570,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.3,
      "learning_rate": 4.4183273411862825e-06,
      "logps/chosen": -340.62518310546875,
      "logps/rejected": -364.96832275390625,
      "loss": 0.5041,
      "mask/mask_ratio": 0.48822325468063354,
      "regularization/forward_KL": 1.0792992115020752,
      "regularization/policy_data_loss": 2.8711328506469727,
      "regularization/policy_ref_data_loss_gap": 1.5819146633148193,
      "regularization/reference_data_loss": 1.2892177104949951,
      "regularization/reverse_KL": 0.5447771549224854,
      "rewards/accuracies": 0.765625,
      "rewards/chosen": -0.6432851552963257,
      "rewards/margins": 0.7538820505142212,
      "rewards/rejected": -1.3971672058105469,
      "step": 580,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.3,
      "learning_rate": 4.38910550092448e-06,
      "logps/chosen": -333.5392150878906,
      "logps/rejected": -394.31390380859375,
      "loss": 0.4716,
      "mask/mask_ratio": 0.4824071526527405,
      "regularization/forward_KL": 1.1998240947723389,
      "regularization/policy_data_loss": 3.144530773162842,
      "regularization/policy_ref_data_loss_gap": 1.808225393295288,
      "regularization/reference_data_loss": 1.3363056182861328,
      "regularization/reverse_KL": 0.5876916646957397,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": -0.666220486164093,
      "rewards/margins": 0.834365725517273,
      "rewards/rejected": -1.5005860328674316,
      "step": 590,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.31,
      "learning_rate": 4.359269265361147e-06,
      "logps/chosen": -326.3905334472656,
      "logps/rejected": -373.6546936035156,
      "loss": 0.5152,
      "mask/mask_ratio": 0.4569844603538513,
      "regularization/forward_KL": 1.312604308128357,
      "regularization/policy_data_loss": 3.4166579246520996,
      "regularization/policy_ref_data_loss_gap": 2.0448861122131348,
      "regularization/reference_data_loss": 1.3717725276947021,
      "regularization/reverse_KL": 0.6144155263900757,
      "rewards/accuracies": 0.7718750238418579,
      "rewards/chosen": -0.7360419631004333,
      "rewards/margins": 0.7235785126686096,
      "rewards/rejected": -1.459620475769043,
      "step": 600,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.31,
      "eval_logps/chosen": -341.0960693359375,
      "eval_logps/rejected": -378.1453857421875,
      "eval_loss": 0.48873645067214966,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 1.2195603847503662,
      "eval_regularization/policy_data_loss": 3.0234711170196533,
      "eval_regularization/policy_ref_data_loss_gap": 1.6897492408752441,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.6044374704360962,
      "eval_rewards/accuracies": 0.7684999704360962,
      "eval_rewards/chosen": -0.6594394445419312,
      "eval_rewards/margins": 0.7902729511260986,
      "eval_rewards/rejected": -1.4497122764587402,
      "eval_runtime": 683.6621,
      "eval_samples_per_second": 2.925,
      "eval_steps_per_second": 1.463,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 600
    },
    {
      "epoch": 0.32,
      "learning_rate": 4.328828338159173e-06,
      "logps/chosen": -367.79779052734375,
      "logps/rejected": -380.0319519042969,
      "loss": 0.5199,
      "mask/mask_ratio": 0.495175302028656,
      "regularization/forward_KL": 1.1747848987579346,
      "regularization/policy_data_loss": 2.8271260261535645,
      "regularization/policy_ref_data_loss_gap": 1.5280870199203491,
      "regularization/reference_data_loss": 1.2990391254425049,
      "regularization/reverse_KL": 0.5893415212631226,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -0.6887508630752563,
      "rewards/margins": 0.7675926089286804,
      "rewards/rejected": -1.456343412399292,
      "step": 610,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.32,
      "learning_rate": 4.297792619645797e-06,
      "logps/chosen": -322.62579345703125,
      "logps/rejected": -365.09686279296875,
      "loss": 0.5268,
      "mask/mask_ratio": 0.4479925036430359,
      "regularization/forward_KL": 1.2744532823562622,
      "regularization/policy_data_loss": 3.409987688064575,
      "regularization/policy_ref_data_loss_gap": 2.009315013885498,
      "regularization/reference_data_loss": 1.4006729125976562,
      "regularization/reverse_KL": 0.6040331721305847,
      "rewards/accuracies": 0.753125011920929,
      "rewards/chosen": -0.6801968216896057,
      "rewards/margins": 0.7011739611625671,
      "rewards/rejected": -1.3813707828521729,
      "step": 620,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.33,
      "learning_rate": 4.266172203592715e-06,
      "logps/chosen": -333.278564453125,
      "logps/rejected": -365.46282958984375,
      "loss": 0.4727,
      "mask/mask_ratio": 0.48246487975120544,
      "regularization/forward_KL": 1.2218214273452759,
      "regularization/policy_data_loss": 3.3435986042022705,
      "regularization/policy_ref_data_loss_gap": 2.0043203830718994,
      "regularization/reference_data_loss": 1.339278221130371,
      "regularization/reverse_KL": 0.5864050984382629,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": -0.6374204158782959,
      "rewards/margins": 0.8242694139480591,
      "rewards/rejected": -1.4616897106170654,
      "step": 630,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.33,
      "learning_rate": 4.233977373933271e-06,
      "logps/chosen": -341.19427490234375,
      "logps/rejected": -372.0128479003906,
      "loss": 0.4854,
      "mask/mask_ratio": 0.4671974182128906,
      "regularization/forward_KL": 1.3450483083724976,
      "regularization/policy_data_loss": 3.43389892578125,
      "regularization/policy_ref_data_loss_gap": 2.067518472671509,
      "regularization/reference_data_loss": 1.3663806915283203,
      "regularization/reverse_KL": 0.6209944486618042,
      "rewards/accuracies": 0.793749988079071,
      "rewards/chosen": -0.7502115964889526,
      "rewards/margins": 0.8481420278549194,
      "rewards/rejected": -1.598353624343872,
      "step": 640,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.34,
      "learning_rate": 4.201218601417812e-06,
      "logps/chosen": -343.30450439453125,
      "logps/rejected": -366.1689147949219,
      "loss": 0.5417,
      "mask/mask_ratio": 0.4709855914115906,
      "regularization/forward_KL": 1.5192339420318604,
      "regularization/policy_data_loss": 3.8282554149627686,
      "regularization/policy_ref_data_loss_gap": 2.464409351348877,
      "regularization/reference_data_loss": 1.363845944404602,
      "regularization/reverse_KL": 0.6902705430984497,
      "rewards/accuracies": 0.7281249761581421,
      "rewards/chosen": -0.8570802807807922,
      "rewards/margins": 0.698925793170929,
      "rewards/rejected": -1.5560060739517212,
      "step": 650,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.34,
      "learning_rate": 4.167906540208273e-06,
      "logps/chosen": -356.71868896484375,
      "logps/rejected": -418.1748046875,
      "loss": 0.4877,
      "mask/mask_ratio": 0.4977082312107086,
      "regularization/forward_KL": 1.2412792444229126,
      "regularization/policy_data_loss": 3.193763494491577,
      "regularization/policy_ref_data_loss_gap": 1.9463058710098267,
      "regularization/reference_data_loss": 1.2474576234817505,
      "regularization/reverse_KL": 0.5598865747451782,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.782666802406311,
      "rewards/margins": 0.8660524487495422,
      "rewards/rejected": -1.6487191915512085,
      "step": 660,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.35,
      "learning_rate": 4.134052024413112e-06,
      "logps/chosen": -355.59954833984375,
      "logps/rejected": -408.8647155761719,
      "loss": 0.5287,
      "mask/mask_ratio": 0.47458505630493164,
      "regularization/forward_KL": 1.4208606481552124,
      "regularization/policy_data_loss": 3.4105117321014404,
      "regularization/policy_ref_data_loss_gap": 2.0673727989196777,
      "regularization/reference_data_loss": 1.3431388139724731,
      "regularization/reverse_KL": 0.6910916566848755,
      "rewards/accuracies": 0.7593749761581421,
      "rewards/chosen": -0.8828868865966797,
      "rewards/margins": 0.7657599449157715,
      "rewards/rejected": -1.6486469507217407,
      "step": 670,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.35,
      "learning_rate": 4.099666064563735e-06,
      "logps/chosen": -362.94903564453125,
      "logps/rejected": -407.28228759765625,
      "loss": 0.4975,
      "mask/mask_ratio": 0.48845115303993225,
      "regularization/forward_KL": 1.3998740911483765,
      "regularization/policy_data_loss": 3.189467430114746,
      "regularization/policy_ref_data_loss_gap": 1.8578462600708008,
      "regularization/reference_data_loss": 1.3316209316253662,
      "regularization/reverse_KL": 0.6738228797912598,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -0.8711546659469604,
      "rewards/margins": 0.8668910264968872,
      "rewards/rejected": -1.7380456924438477,
      "step": 680,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.36,
      "learning_rate": 4.064759844033519e-06,
      "logps/chosen": -356.92974853515625,
      "logps/rejected": -405.5035400390625,
      "loss": 0.482,
      "mask/mask_ratio": 0.49066147208213806,
      "regularization/forward_KL": 1.284687876701355,
      "regularization/policy_data_loss": 2.940232753753662,
      "regularization/policy_ref_data_loss_gap": 1.6675087213516235,
      "regularization/reference_data_loss": 1.2727240324020386,
      "regularization/reverse_KL": 0.6358739733695984,
      "rewards/accuracies": 0.7718750238418579,
      "rewards/chosen": -0.7684090733528137,
      "rewards/margins": 0.9362057447433472,
      "rewards/rejected": -1.7046148777008057,
      "step": 690,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.36,
      "learning_rate": 4.029344715400643e-06,
      "logps/chosen": -344.7096252441406,
      "logps/rejected": -395.9715270996094,
      "loss": 0.4862,
      "mask/mask_ratio": 0.4926759600639343,
      "regularization/forward_KL": 1.0987943410873413,
      "regularization/policy_data_loss": 2.8303959369659424,
      "regularization/policy_ref_data_loss_gap": 1.5772391557693481,
      "regularization/reference_data_loss": 1.2531569004058838,
      "regularization/reverse_KL": 0.5733307600021362,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.7127381563186646,
      "rewards/margins": 0.8454931974411011,
      "rewards/rejected": -1.5582313537597656,
      "step": 700,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.36,
      "eval_logps/chosen": -345.79388427734375,
      "eval_logps/rejected": -387.5947570800781,
      "eval_loss": 0.48566192388534546,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 1.256845474243164,
      "eval_regularization/policy_data_loss": 3.2214090824127197,
      "eval_regularization/policy_ref_data_loss_gap": 1.8876878023147583,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.6230572462081909,
      "eval_rewards/accuracies": 0.765500009059906,
      "eval_rewards/chosen": -0.706417441368103,
      "eval_rewards/margins": 0.8377891182899475,
      "eval_rewards/rejected": -1.5442065000534058,
      "eval_runtime": 681.3295,
      "eval_samples_per_second": 2.935,
      "eval_steps_per_second": 1.468,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 700
    },
    {
      "epoch": 0.37,
      "learning_rate": 3.99343219675588e-06,
      "logps/chosen": -343.87188720703125,
      "logps/rejected": -362.69342041015625,
      "loss": 0.4811,
      "mask/mask_ratio": 0.45240649580955505,
      "regularization/forward_KL": 1.3504002094268799,
      "regularization/policy_data_loss": 3.408869504928589,
      "regularization/policy_ref_data_loss_gap": 2.0502231121063232,
      "regularization/reference_data_loss": 1.3586465120315552,
      "regularization/reverse_KL": 0.6377977728843689,
      "rewards/accuracies": 0.7562500238418579,
      "rewards/chosen": -0.6941612958908081,
      "rewards/margins": 0.8676016926765442,
      "rewards/rejected": -1.561763048171997,
      "step": 710,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.37,
      "learning_rate": 3.957033967956553e-06,
      "logps/chosen": -356.5559997558594,
      "logps/rejected": -406.48712158203125,
      "loss": 0.4811,
      "mask/mask_ratio": 0.47855791449546814,
      "regularization/forward_KL": 1.283085584640503,
      "regularization/policy_data_loss": 3.000786304473877,
      "regularization/policy_ref_data_loss_gap": 1.7088797092437744,
      "regularization/reference_data_loss": 1.2919063568115234,
      "regularization/reverse_KL": 0.6491702795028687,
      "rewards/accuracies": 0.793749988079071,
      "rewards/chosen": -0.7780593633651733,
      "rewards/margins": 0.9397061467170715,
      "rewards/rejected": -1.7177655696868896,
      "step": 720,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.38,
      "learning_rate": 3.92016186682789e-06,
      "logps/chosen": -369.33087158203125,
      "logps/rejected": -426.2369079589844,
      "loss": 0.4822,
      "mask/mask_ratio": 0.5093369483947754,
      "regularization/forward_KL": 1.1465028524398804,
      "regularization/policy_data_loss": 2.7899231910705566,
      "regularization/policy_ref_data_loss_gap": 1.4980227947235107,
      "regularization/reference_data_loss": 1.2919002771377563,
      "regularization/reverse_KL": 0.6002532839775085,
      "rewards/accuracies": 0.765625,
      "rewards/chosen": -0.7466357350349426,
      "rewards/margins": 0.8792537450790405,
      "rewards/rejected": -1.625889539718628,
      "step": 730,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.38,
      "learning_rate": 3.882827885312999e-06,
      "logps/chosen": -363.90423583984375,
      "logps/rejected": -399.9363098144531,
      "loss": 0.5046,
      "mask/mask_ratio": 0.47545701265335083,
      "regularization/forward_KL": 1.291245698928833,
      "regularization/policy_data_loss": 3.1819405555725098,
      "regularization/policy_ref_data_loss_gap": 1.8675496578216553,
      "regularization/reference_data_loss": 1.3143912553787231,
      "regularization/reverse_KL": 0.6366375684738159,
      "rewards/accuracies": 0.7281249761581421,
      "rewards/chosen": -0.7687762975692749,
      "rewards/margins": 0.8431981205940247,
      "rewards/rejected": -1.6119743585586548,
      "step": 740,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.39,
      "learning_rate": 3.845044165572717e-06,
      "logps/chosen": -348.00177001953125,
      "logps/rejected": -390.28741455078125,
      "loss": 0.5279,
      "mask/mask_ratio": 0.4756258428096771,
      "regularization/forward_KL": 1.4463456869125366,
      "regularization/policy_data_loss": 3.525700330734253,
      "regularization/policy_ref_data_loss_gap": 2.1599984169006348,
      "regularization/reference_data_loss": 1.3657023906707764,
      "regularization/reverse_KL": 0.6651198267936707,
      "rewards/accuracies": 0.765625,
      "rewards/chosen": -0.8505264520645142,
      "rewards/margins": 0.7642674446105957,
      "rewards/rejected": -1.6147940158843994,
      "step": 750,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.39,
      "learning_rate": 3.8068229960366055e-06,
      "logps/chosen": -357.22967529296875,
      "logps/rejected": -411.5762634277344,
      "loss": 0.496,
      "mask/mask_ratio": 0.5056766271591187,
      "regularization/forward_KL": 1.310390591621399,
      "regularization/policy_data_loss": 3.0280632972717285,
      "regularization/policy_ref_data_loss_gap": 1.7238391637802124,
      "regularization/reference_data_loss": 1.3042237758636475,
      "regularization/reverse_KL": 0.6277633905410767,
      "rewards/accuracies": 0.765625,
      "rewards/chosen": -0.7892125248908997,
      "rewards/margins": 0.8471924662590027,
      "rewards/rejected": -1.6364049911499023,
      "step": 760,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.4,
      "learning_rate": 3.7681768074063764e-06,
      "logps/chosen": -362.23992919921875,
      "logps/rejected": -417.4954528808594,
      "loss": 0.4828,
      "mask/mask_ratio": 0.4853920042514801,
      "regularization/forward_KL": 1.3521738052368164,
      "regularization/policy_data_loss": 3.1221251487731934,
      "regularization/policy_ref_data_loss_gap": 1.8501752614974976,
      "regularization/reference_data_loss": 1.2719498872756958,
      "regularization/reverse_KL": 0.6747928261756897,
      "rewards/accuracies": 0.7437499761581421,
      "rewards/chosen": -0.8785581588745117,
      "rewards/margins": 0.888287365436554,
      "rewards/rejected": -1.766845703125,
      "step": 770,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.4,
      "learning_rate": 3.72911816861304e-06,
      "logps/chosen": -371.3994445800781,
      "logps/rejected": -422.09930419921875,
      "loss": 0.5315,
      "mask/mask_ratio": 0.46706581115722656,
      "regularization/forward_KL": 1.6266229152679443,
      "regularization/policy_data_loss": 3.6997389793395996,
      "regularization/policy_ref_data_loss_gap": 2.339890956878662,
      "regularization/reference_data_loss": 1.359847903251648,
      "regularization/reverse_KL": 0.8343355059623718,
      "rewards/accuracies": 0.746874988079071,
      "rewards/chosen": -0.9813128709793091,
      "rewards/margins": 0.8385257720947266,
      "rewards/rejected": -1.8198387622833252,
      "step": 780,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.41,
      "learning_rate": 3.689659782729109e-06,
      "logps/chosen": -375.5843811035156,
      "logps/rejected": -404.5010070800781,
      "loss": 0.4965,
      "mask/mask_ratio": 0.5026835203170776,
      "regularization/forward_KL": 1.3997784852981567,
      "regularization/policy_data_loss": 3.216503143310547,
      "regularization/policy_ref_data_loss_gap": 1.925689935684204,
      "regularization/reference_data_loss": 1.2908129692077637,
      "regularization/reverse_KL": 0.6800588965415955,
      "rewards/accuracies": 0.784375011920929,
      "rewards/chosen": -0.7785552144050598,
      "rewards/margins": 0.8097039461135864,
      "rewards/rejected": -1.5882593393325806,
      "step": 790,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.41,
      "learning_rate": 3.6498144828371608e-06,
      "logps/chosen": -343.2922668457031,
      "logps/rejected": -389.92181396484375,
      "loss": 0.4632,
      "mask/mask_ratio": 0.4879623353481293,
      "regularization/forward_KL": 1.231044054031372,
      "regularization/policy_data_loss": 2.6133131980895996,
      "regularization/policy_ref_data_loss_gap": 1.3093647956848145,
      "regularization/reference_data_loss": 1.3039485216140747,
      "regularization/reverse_KL": 0.6323789358139038,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": -0.5944793820381165,
      "rewards/margins": 0.8981904983520508,
      "rewards/rejected": -1.4926698207855225,
      "step": 800,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.41,
      "eval_logps/chosen": -338.13031005859375,
      "eval_logps/rejected": -379.7145080566406,
      "eval_loss": 0.4803018271923065,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 1.3127697706222534,
      "eval_regularization/policy_data_loss": 2.832995891571045,
      "eval_regularization/policy_ref_data_loss_gap": 1.4992741346359253,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.7040627598762512,
      "eval_rewards/accuracies": 0.7754999995231628,
      "eval_rewards/chosen": -0.6297821998596191,
      "eval_rewards/margins": 0.8356214761734009,
      "eval_rewards/rejected": -1.4654037952423096,
      "eval_runtime": 678.8336,
      "eval_samples_per_second": 2.946,
      "eval_steps_per_second": 1.473,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 800
    },
    {
      "epoch": 0.42,
      "learning_rate": 3.609595227856129e-06,
      "logps/chosen": -345.6280212402344,
      "logps/rejected": -399.32366943359375,
      "loss": 0.5069,
      "mask/mask_ratio": 0.49153923988342285,
      "regularization/forward_KL": 1.3226337432861328,
      "regularization/policy_data_loss": 2.8889212608337402,
      "regularization/policy_ref_data_loss_gap": 1.5795973539352417,
      "regularization/reference_data_loss": 1.3093236684799194,
      "regularization/reverse_KL": 0.7099554538726807,
      "rewards/accuracies": 0.7406250238418579,
      "rewards/chosen": -0.7102999687194824,
      "rewards/margins": 0.7738053202629089,
      "rewards/rejected": -1.4841053485870361,
      "step": 810,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.42,
      "learning_rate": 3.5690150983266603e-06,
      "logps/chosen": -360.8568115234375,
      "logps/rejected": -404.18621826171875,
      "loss": 0.5267,
      "mask/mask_ratio": 0.4976044297218323,
      "regularization/forward_KL": 1.2635257244110107,
      "regularization/policy_data_loss": 2.899925708770752,
      "regularization/policy_ref_data_loss_gap": 1.6364638805389404,
      "regularization/reference_data_loss": 1.2634621858596802,
      "regularization/reverse_KL": 0.7208901047706604,
      "rewards/accuracies": 0.765625,
      "rewards/chosen": -0.7742083668708801,
      "rewards/margins": 0.856910228729248,
      "rewards/rejected": -1.6311185359954834,
      "step": 820,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.43,
      "learning_rate": 3.528087292156921e-06,
      "logps/chosen": -343.64312744140625,
      "logps/rejected": -362.50799560546875,
      "loss": 0.5146,
      "mask/mask_ratio": 0.47928208112716675,
      "regularization/forward_KL": 1.318047285079956,
      "regularization/policy_data_loss": 3.145505905151367,
      "regularization/policy_ref_data_loss_gap": 1.8186956644058228,
      "regularization/reference_data_loss": 1.326810359954834,
      "regularization/reverse_KL": 0.7080180048942566,
      "rewards/accuracies": 0.765625,
      "rewards/chosen": -0.7002137899398804,
      "rewards/margins": 0.7404533624649048,
      "rewards/rejected": -1.4406672716140747,
      "step": 830,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.43,
      "learning_rate": 3.4868251203302318e-06,
      "logps/chosen": -328.5467224121094,
      "logps/rejected": -371.68939208984375,
      "loss": 0.5094,
      "mask/mask_ratio": 0.45424968004226685,
      "regularization/forward_KL": 1.399877905845642,
      "regularization/policy_data_loss": 3.455820083618164,
      "regularization/policy_ref_data_loss_gap": 2.039046287536621,
      "regularization/reference_data_loss": 1.416774034500122,
      "regularization/reverse_KL": 0.7845249772071838,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -0.6490459442138672,
      "rewards/margins": 0.7915527820587158,
      "rewards/rejected": -1.440598726272583,
      "step": 840,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.44,
      "learning_rate": 3.4452420025759237e-06,
      "logps/chosen": -337.8177795410156,
      "logps/rejected": -387.95904541015625,
      "loss": 0.469,
      "mask/mask_ratio": 0.48190468549728394,
      "regularization/forward_KL": 1.2552311420440674,
      "regularization/policy_data_loss": 2.8996694087982178,
      "regularization/policy_ref_data_loss_gap": 1.535031795501709,
      "regularization/reference_data_loss": 1.364637851715088,
      "regularization/reverse_KL": 0.7165506482124329,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -0.5783897042274475,
      "rewards/margins": 0.8547646403312683,
      "rewards/rejected": -1.4331544637680054,
      "step": 850,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.44,
      "learning_rate": 3.4033514630048316e-06,
      "logps/chosen": -342.36358642578125,
      "logps/rejected": -355.90875244140625,
      "loss": 0.5217,
      "mask/mask_ratio": 0.4786810278892517,
      "regularization/forward_KL": 1.1329619884490967,
      "regularization/policy_data_loss": 2.6222116947174072,
      "regularization/policy_ref_data_loss_gap": 1.325603723526001,
      "regularization/reference_data_loss": 1.2966079711914062,
      "regularization/reverse_KL": 0.645989716053009,
      "rewards/accuracies": 0.7406250238418579,
      "rewards/chosen": -0.6148379445075989,
      "rewards/margins": 0.6830765604972839,
      "rewards/rejected": -1.2979143857955933,
      "step": 860,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.45,
      "learning_rate": 3.3611671257108323e-06,
      "logps/chosen": -337.91864013671875,
      "logps/rejected": -373.461669921875,
      "loss": 0.4567,
      "mask/mask_ratio": 0.4909645915031433,
      "regularization/forward_KL": 1.295549988746643,
      "regularization/policy_data_loss": 3.1944375038146973,
      "regularization/policy_ref_data_loss_gap": 1.884466528892517,
      "regularization/reference_data_loss": 1.3099709749221802,
      "regularization/reverse_KL": 0.6998555064201355,
      "rewards/accuracies": 0.8062499761581421,
      "rewards/chosen": -0.6917039752006531,
      "rewards/margins": 0.8882933855056763,
      "rewards/rejected": -1.5799973011016846,
      "step": 870,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.45,
      "learning_rate": 3.3187027103398758e-06,
      "logps/chosen": -383.78375244140625,
      "logps/rejected": -423.7245178222656,
      "loss": 0.4485,
      "mask/mask_ratio": 0.4875253736972809,
      "regularization/forward_KL": 1.5167449712753296,
      "regularization/policy_data_loss": 3.5482678413391113,
      "regularization/policy_ref_data_loss_gap": 2.2048988342285156,
      "regularization/reference_data_loss": 1.3433692455291748,
      "regularization/reverse_KL": 0.818142294883728,
      "rewards/accuracies": 0.8062499761581421,
      "rewards/chosen": -0.8259013891220093,
      "rewards/margins": 0.9597400426864624,
      "rewards/rejected": -1.7856414318084717,
      "step": 880,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.46,
      "learning_rate": 3.275972027627928e-06,
      "logps/chosen": -335.466064453125,
      "logps/rejected": -397.6490478515625,
      "loss": 0.4982,
      "mask/mask_ratio": 0.4754369258880615,
      "regularization/forward_KL": 1.57364821434021,
      "regularization/policy_data_loss": 3.4425339698791504,
      "regularization/policy_ref_data_loss_gap": 2.077803134918213,
      "regularization/reference_data_loss": 1.3647313117980957,
      "regularization/reverse_KL": 0.789146900177002,
      "rewards/accuracies": 0.7437499761581421,
      "rewards/chosen": -0.8045094609260559,
      "rewards/margins": 0.8786141276359558,
      "rewards/rejected": -1.6831235885620117,
      "step": 890,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.46,
      "learning_rate": 3.2329889749092956e-06,
      "logps/chosen": -330.7471923828125,
      "logps/rejected": -385.07257080078125,
      "loss": 0.4912,
      "mask/mask_ratio": 0.46347999572753906,
      "regularization/forward_KL": 1.618843674659729,
      "regularization/policy_data_loss": 3.309026002883911,
      "regularization/policy_ref_data_loss_gap": 1.9539740085601807,
      "regularization/reference_data_loss": 1.3550512790679932,
      "regularization/reverse_KL": 0.7985564470291138,
      "rewards/accuracies": 0.768750011920929,
      "rewards/chosen": -0.7571079134941101,
      "rewards/margins": 0.8538058996200562,
      "rewards/rejected": -1.610913872718811,
      "step": 900,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.46,
      "eval_logps/chosen": -346.800048828125,
      "eval_logps/rejected": -398.0345458984375,
      "eval_loss": 0.47069013118743896,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 1.4120253324508667,
      "eval_regularization/policy_data_loss": 3.0682380199432373,
      "eval_regularization/policy_ref_data_loss_gap": 1.7345163822174072,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.7159961462020874,
      "eval_rewards/accuracies": 0.7749999761581421,
      "eval_rewards/chosen": -0.7164793014526367,
      "eval_rewards/margins": 0.9321244359016418,
      "eval_rewards/rejected": -1.6486037969589233,
      "eval_runtime": 678.0979,
      "eval_samples_per_second": 2.949,
      "eval_steps_per_second": 1.475,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 900
    },
    {
      "epoch": 0.47,
      "learning_rate": 3.189767531596789e-06,
      "logps/chosen": -375.0126953125,
      "logps/rejected": -423.1815490722656,
      "loss": 0.4824,
      "mask/mask_ratio": 0.4672268033027649,
      "regularization/forward_KL": 1.4426562786102295,
      "regularization/policy_data_loss": 3.1939220428466797,
      "regularization/policy_ref_data_loss_gap": 1.8321936130523682,
      "regularization/reference_data_loss": 1.3617280721664429,
      "regularization/reverse_KL": 0.7393046617507935,
      "rewards/accuracies": 0.746874988079071,
      "rewards/chosen": -0.791134238243103,
      "rewards/margins": 0.9590598344802856,
      "rewards/rejected": -1.7501941919326782,
      "step": 910,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.48,
      "learning_rate": 3.1463217546351805e-06,
      "logps/chosen": -355.55438232421875,
      "logps/rejected": -428.2572326660156,
      "loss": 0.4637,
      "mask/mask_ratio": 0.4832921028137207,
      "regularization/forward_KL": 1.5445311069488525,
      "regularization/policy_data_loss": 3.399376630783081,
      "regularization/policy_ref_data_loss_gap": 2.075806140899658,
      "regularization/reference_data_loss": 1.3235702514648438,
      "regularization/reverse_KL": 0.8096310496330261,
      "rewards/accuracies": 0.768750011920929,
      "rewards/chosen": -0.8389447331428528,
      "rewards/margins": 1.0589849948883057,
      "rewards/rejected": -1.8979297876358032,
      "step": 920,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.48,
      "learning_rate": 3.1026657739294545e-06,
      "logps/chosen": -359.7148132324219,
      "logps/rejected": -403.741943359375,
      "loss": 0.4623,
      "mask/mask_ratio": 0.4793354868888855,
      "regularization/forward_KL": 1.5487029552459717,
      "regularization/policy_data_loss": 3.3267149925231934,
      "regularization/policy_ref_data_loss_gap": 1.9900939464569092,
      "regularization/reference_data_loss": 1.3366214036941528,
      "regularization/reverse_KL": 0.7889060378074646,
      "rewards/accuracies": 0.7718750238418579,
      "rewards/chosen": -0.7435846328735352,
      "rewards/margins": 1.0394331216812134,
      "rewards/rejected": -1.7830177545547485,
      "step": 930,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.49,
      "learning_rate": 3.058813787749332e-06,
      "logps/chosen": -348.6864929199219,
      "logps/rejected": -418.474853515625,
      "loss": 0.4701,
      "mask/mask_ratio": 0.47454625368118286,
      "regularization/forward_KL": 1.6017955541610718,
      "regularization/policy_data_loss": 3.487745761871338,
      "regularization/policy_ref_data_loss_gap": 2.13042950630188,
      "regularization/reference_data_loss": 1.357316255569458,
      "regularization/reverse_KL": 0.8221408724784851,
      "rewards/accuracies": 0.768750011920929,
      "rewards/chosen": -0.8332462310791016,
      "rewards/margins": 0.9651119112968445,
      "rewards/rejected": -1.7983582019805908,
      "step": 940,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.49,
      "learning_rate": 3.0147800581115477e-06,
      "logps/chosen": -345.92279052734375,
      "logps/rejected": -405.68133544921875,
      "loss": 0.474,
      "mask/mask_ratio": 0.4746394157409668,
      "regularization/forward_KL": 1.592740774154663,
      "regularization/policy_data_loss": 3.394793748855591,
      "regularization/policy_ref_data_loss_gap": 2.0399765968322754,
      "regularization/reference_data_loss": 1.3548171520233154,
      "regularization/reverse_KL": 0.8010002970695496,
      "rewards/accuracies": 0.746874988079071,
      "rewards/chosen": -0.8213979005813599,
      "rewards/margins": 0.9722744226455688,
      "rewards/rejected": -1.7936722040176392,
      "step": 950,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.5,
      "learning_rate": 2.9705789061414112e-06,
      "logps/chosen": -367.93939208984375,
      "logps/rejected": -428.01116943359375,
      "loss": 0.4829,
      "mask/mask_ratio": 0.4885406494140625,
      "regularization/forward_KL": 1.482033371925354,
      "regularization/policy_data_loss": 3.1845736503601074,
      "regularization/policy_ref_data_loss_gap": 1.9078010320663452,
      "regularization/reference_data_loss": 1.2767728567123413,
      "regularization/reverse_KL": 0.755481481552124,
      "rewards/accuracies": 0.753125011920929,
      "rewards/chosen": -0.8695995211601257,
      "rewards/margins": 0.9987448453903198,
      "rewards/rejected": -1.8683445453643799,
      "step": 960,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.5,
      "learning_rate": 2.9262247074151296e-06,
      "logps/chosen": -360.04766845703125,
      "logps/rejected": -434.97650146484375,
      "loss": 0.4782,
      "mask/mask_ratio": 0.4766152501106262,
      "regularization/forward_KL": 1.7476508617401123,
      "regularization/policy_data_loss": 3.6862595081329346,
      "regularization/policy_ref_data_loss_gap": 2.353606700897217,
      "regularization/reference_data_loss": 1.3326528072357178,
      "regularization/reverse_KL": 0.8907902836799622,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": -0.919063925743103,
      "rewards/margins": 1.0263198614120483,
      "rewards/rejected": -1.9453840255737305,
      "step": 970,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.51,
      "learning_rate": 2.881731887284429e-06,
      "logps/chosen": -388.28948974609375,
      "logps/rejected": -439.22894287109375,
      "loss": 0.4735,
      "mask/mask_ratio": 0.5034492611885071,
      "regularization/forward_KL": 1.6731637716293335,
      "regularization/policy_data_loss": 3.4840035438537598,
      "regularization/policy_ref_data_loss_gap": 2.180572032928467,
      "regularization/reference_data_loss": 1.3034313917160034,
      "regularization/reverse_KL": 0.883350670337677,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.9364107251167297,
      "rewards/margins": 1.0623798370361328,
      "rewards/rejected": -1.9987905025482178,
      "step": 980,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.51,
      "learning_rate": 2.8371149161849893e-06,
      "logps/chosen": -360.9266052246094,
      "logps/rejected": -429.53607177734375,
      "loss": 0.4564,
      "mask/mask_ratio": 0.46724075078964233,
      "regularization/forward_KL": 1.8491981029510498,
      "regularization/policy_data_loss": 3.9653689861297607,
      "regularization/policy_ref_data_loss_gap": 2.577927350997925,
      "regularization/reference_data_loss": 1.3874413967132568,
      "regularization/reverse_KL": 0.9445978403091431,
      "rewards/accuracies": 0.7593749761581421,
      "rewards/chosen": -0.8997133374214172,
      "rewards/margins": 1.0361647605895996,
      "rewards/rejected": -1.935878038406372,
      "step": 990,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.52,
      "learning_rate": 2.792388304930207e-06,
      "logps/chosen": -391.35052490234375,
      "logps/rejected": -441.22735595703125,
      "loss": 0.4588,
      "mask/mask_ratio": 0.5002816915512085,
      "regularization/forward_KL": 1.666182279586792,
      "regularization/policy_data_loss": 3.581329345703125,
      "regularization/policy_ref_data_loss_gap": 2.262571096420288,
      "regularization/reference_data_loss": 1.318758249282837,
      "regularization/reverse_KL": 0.8926746249198914,
      "rewards/accuracies": 0.796875,
      "rewards/chosen": -0.9132622480392456,
      "rewards/margins": 1.0752723217010498,
      "rewards/rejected": -1.988534688949585,
      "step": 1000,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.52,
      "eval_logps/chosen": -360.4623718261719,
      "eval_logps/rejected": -418.5935974121094,
      "eval_loss": 0.4680280089378357,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 1.6381595134735107,
      "eval_regularization/policy_data_loss": 3.544844627380371,
      "eval_regularization/policy_ref_data_loss_gap": 2.21112322807312,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.8345947265625,
      "eval_rewards/accuracies": 0.7689999938011169,
      "eval_rewards/chosen": -0.8531022667884827,
      "eval_rewards/margins": 1.0010924339294434,
      "eval_rewards/rejected": -1.8541947603225708,
      "eval_runtime": 678.2309,
      "eval_samples_per_second": 2.949,
      "eval_steps_per_second": 1.474,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 1000
    },
    {
      "epoch": 0.52,
      "learning_rate": 2.7475665999918343e-06,
      "logps/chosen": -361.87481689453125,
      "logps/rejected": -401.7012634277344,
      "loss": 0.5147,
      "mask/mask_ratio": 0.4766588807106018,
      "regularization/forward_KL": 1.6183385848999023,
      "regularization/policy_data_loss": 3.4378883838653564,
      "regularization/policy_ref_data_loss_gap": 2.118281602859497,
      "regularization/reference_data_loss": 1.3196067810058594,
      "regularization/reverse_KL": 0.8081123232841492,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -0.8741868138313293,
      "rewards/margins": 0.8777171969413757,
      "rewards/rejected": -1.7519038915634155,
      "step": 1010,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.53,
      "learning_rate": 2.7026643787690214e-06,
      "logps/chosen": -363.1113586425781,
      "logps/rejected": -422.6988830566406,
      "loss": 0.4319,
      "mask/mask_ratio": 0.5053070783615112,
      "regularization/forward_KL": 1.4515669345855713,
      "regularization/policy_data_loss": 3.1572136878967285,
      "regularization/policy_ref_data_loss_gap": 1.8214718103408813,
      "regularization/reference_data_loss": 1.3357419967651367,
      "regularization/reverse_KL": 0.7576014995574951,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": -0.6867777109146118,
      "rewards/margins": 1.0725219249725342,
      "rewards/rejected": -1.759299635887146,
      "step": 1020,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.53,
      "learning_rate": 2.657696244847292e-06,
      "logps/chosen": -357.66546630859375,
      "logps/rejected": -420.062255859375,
      "loss": 0.45,
      "mask/mask_ratio": 0.48850899934768677,
      "regularization/forward_KL": 1.4633575677871704,
      "regularization/policy_data_loss": 2.994379997253418,
      "regularization/policy_ref_data_loss_gap": 1.6944282054901123,
      "regularization/reference_data_loss": 1.2999519109725952,
      "regularization/reverse_KL": 0.7601736783981323,
      "rewards/accuracies": 0.796875,
      "rewards/chosen": -0.7448621988296509,
      "rewards/margins": 1.0192582607269287,
      "rewards/rejected": -1.7641206979751587,
      "step": 1030,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.54,
      "learning_rate": 2.612676823249012e-06,
      "logps/chosen": -356.3604736328125,
      "logps/rejected": -425.1424865722656,
      "loss": 0.5211,
      "mask/mask_ratio": 0.4794303774833679,
      "regularization/forward_KL": 1.8137887716293335,
      "regularization/policy_data_loss": 3.509474515914917,
      "regularization/policy_ref_data_loss_gap": 2.1440131664276123,
      "regularization/reference_data_loss": 1.3654614686965942,
      "regularization/reverse_KL": 0.9370359182357788,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.9280366897583008,
      "rewards/margins": 1.0730069875717163,
      "rewards/rejected": -2.0010437965393066,
      "step": 1040,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.54,
      "learning_rate": 2.567620755676877e-06,
      "logps/chosen": -340.73272705078125,
      "logps/rejected": -428.054931640625,
      "loss": 0.4467,
      "mask/mask_ratio": 0.4720945358276367,
      "regularization/forward_KL": 1.8855125904083252,
      "regularization/policy_data_loss": 3.6509175300598145,
      "regularization/policy_ref_data_loss_gap": 2.2492189407348633,
      "regularization/reference_data_loss": 1.401698350906372,
      "regularization/reverse_KL": 0.9593275785446167,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": -0.8109802007675171,
      "rewards/margins": 1.152329921722412,
      "rewards/rejected": -1.9633100032806396,
      "step": 1050,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.55,
      "learning_rate": 2.5225426957519827e-06,
      "logps/chosen": -356.8919677734375,
      "logps/rejected": -393.8501281738281,
      "loss": 0.4922,
      "mask/mask_ratio": 0.4840954840183258,
      "regularization/forward_KL": 1.5445278882980347,
      "regularization/policy_data_loss": 3.0410842895507812,
      "regularization/policy_ref_data_loss_gap": 1.7923619747161865,
      "regularization/reference_data_loss": 1.2487224340438843,
      "regularization/reverse_KL": 0.7991067171096802,
      "rewards/accuracies": 0.7718750238418579,
      "rewards/chosen": -0.8297954797744751,
      "rewards/margins": 0.8743368983268738,
      "rewards/rejected": -1.7041324377059937,
      "step": 1060,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.55,
      "learning_rate": 2.477457304248018e-06,
      "logps/chosen": -365.36798095703125,
      "logps/rejected": -438.32586669921875,
      "loss": 0.5144,
      "mask/mask_ratio": 0.4814305305480957,
      "regularization/forward_KL": 1.8117096424102783,
      "regularization/policy_data_loss": 3.5517921447753906,
      "regularization/policy_ref_data_loss_gap": 2.222586154937744,
      "regularization/reference_data_loss": 1.3292062282562256,
      "regularization/reverse_KL": 0.921234130859375,
      "rewards/accuracies": 0.7437499761581421,
      "rewards/chosen": -0.9221228361129761,
      "rewards/margins": 0.9479209184646606,
      "rewards/rejected": -1.8700437545776367,
      "step": 1070,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.56,
      "learning_rate": 2.4323792443231243e-06,
      "logps/chosen": -374.26202392578125,
      "logps/rejected": -410.62652587890625,
      "loss": 0.4847,
      "mask/mask_ratio": 0.4853752553462982,
      "regularization/forward_KL": 1.5403839349746704,
      "regularization/policy_data_loss": 3.0835585594177246,
      "regularization/policy_ref_data_loss_gap": 1.8354151248931885,
      "regularization/reference_data_loss": 1.2481436729431152,
      "regularization/reverse_KL": 0.7821645140647888,
      "rewards/accuracies": 0.8031250238418579,
      "rewards/chosen": -0.9117132425308228,
      "rewards/margins": 0.9451999664306641,
      "rewards/rejected": -1.8569132089614868,
      "step": 1080,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.56,
      "learning_rate": 2.387323176750989e-06,
      "logps/chosen": -353.126953125,
      "logps/rejected": -390.7349548339844,
      "loss": 0.4932,
      "mask/mask_ratio": 0.474606454372406,
      "regularization/forward_KL": 1.7939189672470093,
      "regularization/policy_data_loss": 3.6149840354919434,
      "regularization/policy_ref_data_loss_gap": 2.241938829421997,
      "regularization/reference_data_loss": 1.3730452060699463,
      "regularization/reverse_KL": 0.865491509437561,
      "rewards/accuracies": 0.7562500238418579,
      "rewards/chosen": -0.8042001724243164,
      "rewards/margins": 0.8999455571174622,
      "rewards/rejected": -1.7041456699371338,
      "step": 1090,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.57,
      "learning_rate": 2.3423037551527088e-06,
      "logps/chosen": -366.1763000488281,
      "logps/rejected": -418.1183166503906,
      "loss": 0.4956,
      "mask/mask_ratio": 0.49568256735801697,
      "regularization/forward_KL": 1.6050838232040405,
      "regularization/policy_data_loss": 3.399763584136963,
      "regularization/policy_ref_data_loss_gap": 2.129411220550537,
      "regularization/reference_data_loss": 1.2703526020050049,
      "regularization/reverse_KL": 0.780432403087616,
      "rewards/accuracies": 0.7718750238418579,
      "rewards/chosen": -0.8095630407333374,
      "rewards/margins": 0.9988244771957397,
      "rewards/rejected": -1.8083875179290771,
      "step": 1100,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.57,
      "eval_logps/chosen": -355.05670166015625,
      "eval_logps/rejected": -410.89129638671875,
      "eval_loss": 0.4650018811225891,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 1.6269928216934204,
      "eval_regularization/policy_data_loss": 3.5035054683685303,
      "eval_regularization/policy_ref_data_loss_gap": 2.1697838306427,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.8004212379455566,
      "eval_rewards/accuracies": 0.7789999842643738,
      "eval_rewards/chosen": -0.799045741558075,
      "eval_rewards/margins": 0.9781261086463928,
      "eval_rewards/rejected": -1.7771718502044678,
      "eval_runtime": 680.4563,
      "eval_samples_per_second": 2.939,
      "eval_steps_per_second": 1.47,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 1100
    },
    {
      "epoch": 0.57,
      "learning_rate": 2.297335621230979e-06,
      "logps/chosen": -365.9013671875,
      "logps/rejected": -420.11676025390625,
      "loss": 0.4839,
      "mask/mask_ratio": 0.5021510720252991,
      "regularization/forward_KL": 1.549786925315857,
      "regularization/policy_data_loss": 3.418731212615967,
      "regularization/policy_ref_data_loss_gap": 2.094510316848755,
      "regularization/reference_data_loss": 1.324221134185791,
      "regularization/reverse_KL": 0.778368353843689,
      "rewards/accuracies": 0.753125011920929,
      "rewards/chosen": -0.8425655364990234,
      "rewards/margins": 0.8925921320915222,
      "rewards/rejected": -1.7351577281951904,
      "step": 1110,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.58,
      "learning_rate": 2.2524334000081665e-06,
      "logps/chosen": -353.25,
      "logps/rejected": -419.9073791503906,
      "loss": 0.4418,
      "mask/mask_ratio": 0.46470093727111816,
      "regularization/forward_KL": 1.5048519372940063,
      "regularization/policy_data_loss": 3.2570385932922363,
      "regularization/policy_ref_data_loss_gap": 1.943503975868225,
      "regularization/reference_data_loss": 1.3135344982147217,
      "regularization/reverse_KL": 0.7766603231430054,
      "rewards/accuracies": 0.7718750238418579,
      "rewards/chosen": -0.7408279776573181,
      "rewards/margins": 1.0688621997833252,
      "rewards/rejected": -1.8096901178359985,
      "step": 1120,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.58,
      "learning_rate": 2.207611695069794e-06,
      "logps/chosen": -348.40045166015625,
      "logps/rejected": -417.51593017578125,
      "loss": 0.4398,
      "mask/mask_ratio": 0.4748614430427551,
      "regularization/forward_KL": 1.6997610330581665,
      "regularization/policy_data_loss": 3.438598155975342,
      "regularization/policy_ref_data_loss_gap": 2.109769582748413,
      "regularization/reference_data_loss": 1.3288285732269287,
      "regularization/reverse_KL": 0.8146723508834839,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": -0.7392085194587708,
      "rewards/margins": 1.0907869338989258,
      "rewards/rejected": -1.8299957513809204,
      "step": 1130,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.59,
      "learning_rate": 2.162885083815011e-06,
      "logps/chosen": -371.650146484375,
      "logps/rejected": -429.2657775878906,
      "loss": 0.5037,
      "mask/mask_ratio": 0.46894198656082153,
      "regularization/forward_KL": 1.750522255897522,
      "regularization/policy_data_loss": 3.647876024246216,
      "regularization/policy_ref_data_loss_gap": 2.264991044998169,
      "regularization/reference_data_loss": 1.3828846216201782,
      "regularization/reverse_KL": 0.8863222002983093,
      "rewards/accuracies": 0.753125011920929,
      "rewards/chosen": -0.8675206899642944,
      "rewards/margins": 0.9314130544662476,
      "rewards/rejected": -1.798933982849121,
      "step": 1140,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.59,
      "learning_rate": 2.1182681127155714e-06,
      "logps/chosen": -362.3675231933594,
      "logps/rejected": -407.7796936035156,
      "loss": 0.4667,
      "mask/mask_ratio": 0.4869117736816406,
      "regularization/forward_KL": 1.6915229558944702,
      "regularization/policy_data_loss": 3.4999630451202393,
      "regularization/policy_ref_data_loss_gap": 2.1657540798187256,
      "regularization/reference_data_loss": 1.3342089653015137,
      "regularization/reverse_KL": 0.8863385319709778,
      "rewards/accuracies": 0.796875,
      "rewards/chosen": -0.7910835146903992,
      "rewards/margins": 1.0511146783828735,
      "rewards/rejected": -1.842198133468628,
      "step": 1150,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.6,
      "learning_rate": 2.073775292584871e-06,
      "logps/chosen": -360.3282470703125,
      "logps/rejected": -421.4234313964844,
      "loss": 0.4671,
      "mask/mask_ratio": 0.47746795415878296,
      "regularization/forward_KL": 1.5626468658447266,
      "regularization/policy_data_loss": 3.300339460372925,
      "regularization/policy_ref_data_loss_gap": 2.0178475379943848,
      "regularization/reference_data_loss": 1.2824923992156982,
      "regularization/reverse_KL": 0.7732560038566589,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.8316150903701782,
      "rewards/margins": 1.0359256267547607,
      "rewards/rejected": -1.867540717124939,
      "step": 1160,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.6,
      "learning_rate": 2.029421093858589e-06,
      "logps/chosen": -362.2599182128906,
      "logps/rejected": -416.98260498046875,
      "loss": 0.4915,
      "mask/mask_ratio": 0.48219218850135803,
      "regularization/forward_KL": 1.8237025737762451,
      "regularization/policy_data_loss": 3.654402494430542,
      "regularization/policy_ref_data_loss_gap": 2.2944862842559814,
      "regularization/reference_data_loss": 1.35991632938385,
      "regularization/reverse_KL": 0.880692183971405,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -0.820796012878418,
      "rewards/margins": 0.9615306854248047,
      "rewards/rejected": -1.7823266983032227,
      "step": 1170,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.61,
      "learning_rate": 1.9852199418884527e-06,
      "logps/chosen": -349.13360595703125,
      "logps/rejected": -408.013916015625,
      "loss": 0.4874,
      "mask/mask_ratio": 0.4841601848602295,
      "regularization/forward_KL": 1.569267749786377,
      "regularization/policy_data_loss": 3.1640117168426514,
      "regularization/policy_ref_data_loss_gap": 1.9097486734390259,
      "regularization/reference_data_loss": 1.254262924194336,
      "regularization/reverse_KL": 0.805046558380127,
      "rewards/accuracies": 0.753125011920929,
      "rewards/chosen": -0.8179885149002075,
      "rewards/margins": 0.9910901188850403,
      "rewards/rejected": -1.8090789318084717,
      "step": 1180,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.61,
      "learning_rate": 1.941186212250669e-06,
      "logps/chosen": -347.79498291015625,
      "logps/rejected": -408.36907958984375,
      "loss": 0.5073,
      "mask/mask_ratio": 0.47047147154808044,
      "regularization/forward_KL": 1.7885147333145142,
      "regularization/policy_data_loss": 3.6652159690856934,
      "regularization/policy_ref_data_loss_gap": 2.3388705253601074,
      "regularization/reference_data_loss": 1.3263452053070068,
      "regularization/reverse_KL": 0.8758577108383179,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.840948760509491,
      "rewards/margins": 0.9267898797988892,
      "rewards/rejected": -1.767738699913025,
      "step": 1190,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.62,
      "learning_rate": 1.897334226070546e-06,
      "logps/chosen": -335.3964538574219,
      "logps/rejected": -400.3602294921875,
      "loss": 0.4738,
      "mask/mask_ratio": 0.45999065041542053,
      "regularization/forward_KL": 1.8084516525268555,
      "regularization/policy_data_loss": 3.716658115386963,
      "regularization/policy_ref_data_loss_gap": 2.388058662414551,
      "regularization/reference_data_loss": 1.3285998106002808,
      "regularization/reverse_KL": 0.8997222185134888,
      "rewards/accuracies": 0.784375011920929,
      "rewards/chosen": -0.8195877075195312,
      "rewards/margins": 0.9621875882148743,
      "rewards/rejected": -1.7817752361297607,
      "step": 1200,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.62,
      "eval_logps/chosen": -355.8280029296875,
      "eval_logps/rejected": -414.8670349121094,
      "eval_loss": 0.46294862031936646,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 1.7937616109848022,
      "eval_regularization/policy_data_loss": 3.6707875728607178,
      "eval_regularization/policy_ref_data_loss_gap": 2.3370656967163086,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.8906781673431396,
      "eval_rewards/accuracies": 0.7705000042915344,
      "eval_rewards/chosen": -0.8067585229873657,
      "eval_rewards/margins": 1.0101702213287354,
      "eval_rewards/rejected": -1.8169289827346802,
      "eval_runtime": 678.1906,
      "eval_samples_per_second": 2.949,
      "eval_steps_per_second": 1.475,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 1200
    },
    {
      "epoch": 0.62,
      "learning_rate": 1.8536782453648206e-06,
      "logps/chosen": -342.643798828125,
      "logps/rejected": -420.227783203125,
      "loss": 0.4575,
      "mask/mask_ratio": 0.4626893997192383,
      "regularization/forward_KL": 1.8106091022491455,
      "regularization/policy_data_loss": 3.534616470336914,
      "regularization/policy_ref_data_loss_gap": 2.1899125576019287,
      "regularization/reference_data_loss": 1.3447039127349854,
      "regularization/reverse_KL": 0.9050670862197876,
      "rewards/accuracies": 0.784375011920929,
      "rewards/chosen": -0.8094381093978882,
      "rewards/margins": 1.063377022743225,
      "rewards/rejected": -1.8728151321411133,
      "step": 1210,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.63,
      "learning_rate": 1.8102324684032117e-06,
      "logps/chosen": -349.37701416015625,
      "logps/rejected": -399.7132873535156,
      "loss": 0.4971,
      "mask/mask_ratio": 0.46068984270095825,
      "regularization/forward_KL": 1.919891357421875,
      "regularization/policy_data_loss": 3.679204225540161,
      "regularization/policy_ref_data_loss_gap": 2.3511643409729004,
      "regularization/reference_data_loss": 1.328040361404419,
      "regularization/reverse_KL": 0.9414359927177429,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -0.8105362057685852,
      "rewards/margins": 1.023348093032837,
      "rewards/rejected": -1.8338844776153564,
      "step": 1220,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.64,
      "learning_rate": 1.767011025090705e-06,
      "logps/chosen": -362.29736328125,
      "logps/rejected": -456.44036865234375,
      "loss": 0.4319,
      "mask/mask_ratio": 0.4946006238460541,
      "regularization/forward_KL": 1.6686779260635376,
      "regularization/policy_data_loss": 3.426922559738159,
      "regularization/policy_ref_data_loss_gap": 2.0700364112854004,
      "regularization/reference_data_loss": 1.3568861484527588,
      "regularization/reverse_KL": 0.8488477468490601,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": -0.7405003309249878,
      "rewards/margins": 1.1684643030166626,
      "rewards/rejected": -1.9089645147323608,
      "step": 1230,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.64,
      "learning_rate": 1.7240279723720732e-06,
      "logps/chosen": -407.5479736328125,
      "logps/rejected": -424.8394470214844,
      "loss": 0.4855,
      "mask/mask_ratio": 0.5148528218269348,
      "regularization/forward_KL": 1.8661048412322998,
      "regularization/policy_data_loss": 3.7221832275390625,
      "regularization/policy_ref_data_loss_gap": 2.418184518814087,
      "regularization/reference_data_loss": 1.3039991855621338,
      "regularization/reverse_KL": 0.921181321144104,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -0.9201223254203796,
      "rewards/margins": 0.9610943794250488,
      "rewards/rejected": -1.8812170028686523,
      "step": 1240,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.65,
      "learning_rate": 1.681297289660125e-06,
      "logps/chosen": -366.2618713378906,
      "logps/rejected": -422.63787841796875,
      "loss": 0.4624,
      "mask/mask_ratio": 0.4686582684516907,
      "regularization/forward_KL": 1.9868872165679932,
      "regularization/policy_data_loss": 4.1200151443481445,
      "regularization/policy_ref_data_loss_gap": 2.7560131549835205,
      "regularization/reference_data_loss": 1.3640015125274658,
      "regularization/reverse_KL": 0.9456700086593628,
      "rewards/accuracies": 0.7562500238418579,
      "rewards/chosen": -0.8470296859741211,
      "rewards/margins": 1.0101690292358398,
      "rewards/rejected": -1.857198715209961,
      "step": 1250,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.65,
      "learning_rate": 1.6388328742891679e-06,
      "logps/chosen": -343.8788757324219,
      "logps/rejected": -436.5252380371094,
      "loss": 0.422,
      "mask/mask_ratio": 0.47403663396835327,
      "regularization/forward_KL": 1.7843549251556396,
      "regularization/policy_data_loss": 3.607128620147705,
      "regularization/policy_ref_data_loss_gap": 2.3021974563598633,
      "regularization/reference_data_loss": 1.3049309253692627,
      "regularization/reverse_KL": 0.9228888750076294,
      "rewards/accuracies": 0.809374988079071,
      "rewards/chosen": -0.7663129568099976,
      "rewards/margins": 1.199436068534851,
      "rewards/rejected": -1.9657487869262695,
      "step": 1260,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.66,
      "learning_rate": 1.5966485369951695e-06,
      "logps/chosen": -350.4718017578125,
      "logps/rejected": -415.4951171875,
      "loss": 0.4377,
      "mask/mask_ratio": 0.47925662994384766,
      "regularization/forward_KL": 2.087397575378418,
      "regularization/policy_data_loss": 4.106557369232178,
      "regularization/policy_ref_data_loss_gap": 2.7592616081237793,
      "regularization/reference_data_loss": 1.3472956418991089,
      "regularization/reverse_KL": 1.0059704780578613,
      "rewards/accuracies": 0.8062499761581421,
      "rewards/chosen": -0.7984176278114319,
      "rewards/margins": 1.1377553939819336,
      "rewards/rejected": -1.9361730813980103,
      "step": 1270,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.66,
      "learning_rate": 1.5547579974240767e-06,
      "logps/chosen": -378.7990417480469,
      "logps/rejected": -445.5142517089844,
      "loss": 0.4729,
      "mask/mask_ratio": 0.5042635798454285,
      "regularization/forward_KL": 1.8559181690216064,
      "regularization/policy_data_loss": 3.601454973220825,
      "regularization/policy_ref_data_loss_gap": 2.28829026222229,
      "regularization/reference_data_loss": 1.3131649494171143,
      "regularization/reverse_KL": 0.909493088722229,
      "rewards/accuracies": 0.765625,
      "rewards/chosen": -0.9143694043159485,
      "rewards/margins": 1.0560824871063232,
      "rewards/rejected": -1.9704519510269165,
      "step": 1280,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.67,
      "learning_rate": 1.5131748796697687e-06,
      "logps/chosen": -369.54669189453125,
      "logps/rejected": -438.15863037109375,
      "loss": 0.4485,
      "mask/mask_ratio": 0.4930439889431,
      "regularization/forward_KL": 2.0261335372924805,
      "regularization/policy_data_loss": 3.7488512992858887,
      "regularization/policy_ref_data_loss_gap": 2.4247889518737793,
      "regularization/reference_data_loss": 1.3240623474121094,
      "regularization/reverse_KL": 0.97789067029953,
      "rewards/accuracies": 0.793749988079071,
      "rewards/chosen": -0.8441926836967468,
      "rewards/margins": 1.1282610893249512,
      "rewards/rejected": -1.9724537134170532,
      "step": 1290,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.67,
      "learning_rate": 1.4719127078430795e-06,
      "logps/chosen": -363.5559387207031,
      "logps/rejected": -422.1034240722656,
      "loss": 0.4657,
      "mask/mask_ratio": 0.4489743709564209,
      "regularization/forward_KL": 2.114527940750122,
      "regularization/policy_data_loss": 4.064545631408691,
      "regularization/policy_ref_data_loss_gap": 2.723226547241211,
      "regularization/reference_data_loss": 1.3413186073303223,
      "regularization/reverse_KL": 1.0010147094726562,
      "rewards/accuracies": 0.768750011920929,
      "rewards/chosen": -0.9139341115951538,
      "rewards/margins": 1.1373964548110962,
      "rewards/rejected": -2.051330327987671,
      "step": 1300,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.67,
      "eval_logps/chosen": -361.7411804199219,
      "eval_logps/rejected": -425.9926452636719,
      "eval_loss": 0.4621984362602234,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 1.937515377998352,
      "eval_regularization/policy_data_loss": 3.7638583183288574,
      "eval_regularization/policy_ref_data_loss_gap": 2.4301366806030273,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.9454620480537415,
      "eval_rewards/accuracies": 0.765500009059906,
      "eval_rewards/chosen": -0.8658906817436218,
      "eval_rewards/margins": 1.0622944831848145,
      "eval_rewards/rejected": -1.928185224533081,
      "eval_runtime": 678.2909,
      "eval_samples_per_second": 2.949,
      "eval_steps_per_second": 1.474,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 1300
    },
    {
      "epoch": 0.68,
      "learning_rate": 1.4309849016733407e-06,
      "logps/chosen": -376.89752197265625,
      "logps/rejected": -438.6731872558594,
      "loss": 0.4565,
      "mask/mask_ratio": 0.49921077489852905,
      "regularization/forward_KL": 1.8718713521957397,
      "regularization/policy_data_loss": 3.7002665996551514,
      "regularization/policy_ref_data_loss_gap": 2.3846921920776367,
      "regularization/reference_data_loss": 1.3155744075775146,
      "regularization/reverse_KL": 0.9082392454147339,
      "rewards/accuracies": 0.778124988079071,
      "rewards/chosen": -0.8583984375,
      "rewards/margins": 1.0740336179733276,
      "rewards/rejected": -1.9324318170547485,
      "step": 1310,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.68,
      "learning_rate": 1.3904047721438722e-06,
      "logps/chosen": -378.2431945800781,
      "logps/rejected": -441.1559143066406,
      "loss": 0.4671,
      "mask/mask_ratio": 0.522929847240448,
      "regularization/forward_KL": 1.62582528591156,
      "regularization/policy_data_loss": 3.1624553203582764,
      "regularization/policy_ref_data_loss_gap": 1.945429801940918,
      "regularization/reference_data_loss": 1.217025637626648,
      "regularization/reverse_KL": 0.858277440071106,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": -0.8920836448669434,
      "rewards/margins": 1.0394176244735718,
      "rewards/rejected": -1.9315013885498047,
      "step": 1320,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.69,
      "learning_rate": 1.3501855171628394e-06,
      "logps/chosen": -373.30474853515625,
      "logps/rejected": -453.34619140625,
      "loss": 0.4804,
      "mask/mask_ratio": 0.4809340536594391,
      "regularization/forward_KL": 1.9518005847930908,
      "regularization/policy_data_loss": 3.8627593517303467,
      "regularization/policy_ref_data_loss_gap": 2.5461716651916504,
      "regularization/reference_data_loss": 1.316588044166565,
      "regularization/reverse_KL": 0.9539650082588196,
      "rewards/accuracies": 0.746874988079071,
      "rewards/chosen": -0.9396640658378601,
      "rewards/margins": 1.1548631191253662,
      "rewards/rejected": -2.094527244567871,
      "step": 1330,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.69,
      "learning_rate": 1.3103402172708918e-06,
      "logps/chosen": -388.2948913574219,
      "logps/rejected": -444.01824951171875,
      "loss": 0.4812,
      "mask/mask_ratio": 0.4855572581291199,
      "regularization/forward_KL": 2.013416051864624,
      "regularization/policy_data_loss": 3.7220757007598877,
      "regularization/policy_ref_data_loss_gap": 2.440886974334717,
      "regularization/reference_data_loss": 1.281188726425171,
      "regularization/reverse_KL": 0.9987020492553711,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -0.9903771281242371,
      "rewards/margins": 1.156217098236084,
      "rewards/rejected": -2.146594285964966,
      "step": 1340,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.7,
      "learning_rate": 1.2708818313869609e-06,
      "logps/chosen": -359.6175231933594,
      "logps/rejected": -414.08209228515625,
      "loss": 0.5064,
      "mask/mask_ratio": 0.4652344584465027,
      "regularization/forward_KL": 2.068798542022705,
      "regularization/policy_data_loss": 3.857001781463623,
      "regularization/policy_ref_data_loss_gap": 2.5513510704040527,
      "regularization/reference_data_loss": 1.3056507110595703,
      "regularization/reverse_KL": 0.9855923652648926,
      "rewards/accuracies": 0.734375,
      "rewards/chosen": -0.9192167520523071,
      "rewards/margins": 1.001090407371521,
      "rewards/rejected": -1.920306921005249,
      "step": 1350,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.7,
      "learning_rate": 1.231823192593625e-06,
      "logps/chosen": -383.93951416015625,
      "logps/rejected": -445.46234130859375,
      "loss": 0.5074,
      "mask/mask_ratio": 0.5084148645401001,
      "regularization/forward_KL": 1.6300573348999023,
      "regularization/policy_data_loss": 3.1807339191436768,
      "regularization/policy_ref_data_loss_gap": 1.9327888488769531,
      "regularization/reference_data_loss": 1.2479445934295654,
      "regularization/reverse_KL": 0.8299384117126465,
      "rewards/accuracies": 0.734375,
      "rewards/chosen": -0.8922996520996094,
      "rewards/margins": 0.9893285632133484,
      "rewards/rejected": -1.881628394126892,
      "step": 1360,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.71,
      "learning_rate": 1.1931770039633953e-06,
      "logps/chosen": -332.95001220703125,
      "logps/rejected": -406.4840393066406,
      "loss": 0.5076,
      "mask/mask_ratio": 0.4876123368740082,
      "regularization/forward_KL": 1.651424765586853,
      "regularization/policy_data_loss": 3.2399439811706543,
      "regularization/policy_ref_data_loss_gap": 1.9458131790161133,
      "regularization/reference_data_loss": 1.2941306829452515,
      "regularization/reverse_KL": 0.862457275390625,
      "rewards/accuracies": 0.7562500238418579,
      "rewards/chosen": -0.8013792037963867,
      "rewards/margins": 0.9489853978157043,
      "rewards/rejected": -1.7503646612167358,
      "step": 1370,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.71,
      "learning_rate": 1.1549558344272835e-06,
      "logps/chosen": -339.55902099609375,
      "logps/rejected": -400.17230224609375,
      "loss": 0.4787,
      "mask/mask_ratio": 0.46093177795410156,
      "regularization/forward_KL": 1.8401410579681396,
      "regularization/policy_data_loss": 3.5076744556427,
      "regularization/policy_ref_data_loss_gap": 2.1894218921661377,
      "regularization/reference_data_loss": 1.3182523250579834,
      "regularization/reverse_KL": 0.9153598546981812,
      "rewards/accuracies": 0.7718750238418579,
      "rewards/chosen": -0.8088060617446899,
      "rewards/margins": 1.0141985416412354,
      "rewards/rejected": -1.8230044841766357,
      "step": 1380,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.72,
      "learning_rate": 1.1171721146870015e-06,
      "logps/chosen": -356.8230895996094,
      "logps/rejected": -400.0600280761719,
      "loss": 0.4689,
      "mask/mask_ratio": 0.47935953736305237,
      "regularization/forward_KL": 1.632367730140686,
      "regularization/policy_data_loss": 3.1504790782928467,
      "regularization/policy_ref_data_loss_gap": 1.8623371124267578,
      "regularization/reference_data_loss": 1.2881419658660889,
      "regularization/reverse_KL": 0.8583101034164429,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": -0.7828131318092346,
      "rewards/margins": 0.9677888751029968,
      "rewards/rejected": -1.7506020069122314,
      "step": 1390,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.72,
      "learning_rate": 1.079838133172111e-06,
      "logps/chosen": -381.91534423828125,
      "logps/rejected": -440.84429931640625,
      "loss": 0.4938,
      "mask/mask_ratio": 0.48777562379837036,
      "regularization/forward_KL": 1.7701361179351807,
      "regularization/policy_data_loss": 3.4037234783172607,
      "regularization/policy_ref_data_loss_gap": 2.0872011184692383,
      "regularization/reference_data_loss": 1.3165223598480225,
      "regularization/reverse_KL": 0.9004982709884644,
      "rewards/accuracies": 0.7593749761581421,
      "rewards/chosen": -0.8604629635810852,
      "rewards/margins": 1.1134874820709229,
      "rewards/rejected": -1.9739503860473633,
      "step": 1400,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.72,
      "eval_logps/chosen": -357.7357177734375,
      "eval_logps/rejected": -424.0994873046875,
      "eval_loss": 0.4585930109024048,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 1.8620190620422363,
      "eval_regularization/policy_data_loss": 3.561089038848877,
      "eval_regularization/policy_ref_data_loss_gap": 2.227367401123047,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.9612317085266113,
      "eval_rewards/accuracies": 0.7745000123977661,
      "eval_rewards/chosen": -0.825836181640625,
      "eval_rewards/margins": 1.0834170579910278,
      "eval_rewards/rejected": -1.9092531204223633,
      "eval_runtime": 678.2135,
      "eval_samples_per_second": 2.949,
      "eval_steps_per_second": 1.474,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 1400
    },
    {
      "epoch": 0.73,
      "learning_rate": 1.0429660320434482e-06,
      "logps/chosen": -353.3642883300781,
      "logps/rejected": -405.10137939453125,
      "loss": 0.4682,
      "mask/mask_ratio": 0.46029433608055115,
      "regularization/forward_KL": 1.9837011098861694,
      "regularization/policy_data_loss": 3.651142120361328,
      "regularization/policy_ref_data_loss_gap": 2.3473634719848633,
      "regularization/reference_data_loss": 1.303779125213623,
      "regularization/reverse_KL": 0.9692662358283997,
      "rewards/accuracies": 0.784375011920929,
      "rewards/chosen": -0.8589156866073608,
      "rewards/margins": 1.0558358430862427,
      "rewards/rejected": -1.914751410484314,
      "step": 1410,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.73,
      "learning_rate": 1.0065678032441208e-06,
      "logps/chosen": -349.0423889160156,
      "logps/rejected": -409.118896484375,
      "loss": 0.5248,
      "mask/mask_ratio": 0.48360466957092285,
      "regularization/forward_KL": 1.941200613975525,
      "regularization/policy_data_loss": 3.688013792037964,
      "regularization/policy_ref_data_loss_gap": 2.3263978958129883,
      "regularization/reference_data_loss": 1.3616161346435547,
      "regularization/reverse_KL": 1.0183308124542236,
      "rewards/accuracies": 0.7718750238418579,
      "rewards/chosen": -0.9593265652656555,
      "rewards/margins": 0.9552088975906372,
      "rewards/rejected": -1.9145355224609375,
      "step": 1420,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.74,
      "learning_rate": 9.706552845993566e-07,
      "logps/chosen": -369.53228759765625,
      "logps/rejected": -442.221435546875,
      "loss": 0.4535,
      "mask/mask_ratio": 0.5006858110427856,
      "regularization/forward_KL": 1.7472584247589111,
      "regularization/policy_data_loss": 3.404585361480713,
      "regularization/policy_ref_data_loss_gap": 2.0983352661132812,
      "regularization/reference_data_loss": 1.306249976158142,
      "regularization/reverse_KL": 0.9208608865737915,
      "rewards/accuracies": 0.8031250238418579,
      "rewards/chosen": -0.8024483919143677,
      "rewards/margins": 1.119751214981079,
      "rewards/rejected": -1.9221992492675781,
      "step": 1430,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.74,
      "learning_rate": 9.352401559664817e-07,
      "logps/chosen": -371.75885009765625,
      "logps/rejected": -436.2176208496094,
      "loss": 0.4993,
      "mask/mask_ratio": 0.5028025507926941,
      "regularization/forward_KL": 1.73606276512146,
      "regularization/policy_data_loss": 3.330803632736206,
      "regularization/policy_ref_data_loss_gap": 2.044813632965088,
      "regularization/reference_data_loss": 1.2859899997711182,
      "regularization/reverse_KL": 0.9022024273872375,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": -0.8660165071487427,
      "rewards/margins": 1.0230647325515747,
      "rewards/rejected": -1.8890812397003174,
      "step": 1440,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.75,
      "learning_rate": 9.003339354362659e-07,
      "logps/chosen": -371.51483154296875,
      "logps/rejected": -435.85186767578125,
      "loss": 0.455,
      "mask/mask_ratio": 0.48186254501342773,
      "regularization/forward_KL": 2.0135955810546875,
      "regularization/policy_data_loss": 3.789745330810547,
      "regularization/policy_ref_data_loss_gap": 2.4205188751220703,
      "regularization/reference_data_loss": 1.3692269325256348,
      "regularization/reverse_KL": 1.0390167236328125,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": -0.7970625162124634,
      "rewards/margins": 1.135926365852356,
      "rewards/rejected": -1.9329887628555298,
      "step": 1450,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.75,
      "learning_rate": 8.659479755868883e-07,
      "logps/chosen": -360.334228515625,
      "logps/rejected": -456.8457946777344,
      "loss": 0.4621,
      "mask/mask_ratio": 0.4887254238128662,
      "regularization/forward_KL": 1.7348514795303345,
      "regularization/policy_data_loss": 3.3092029094696045,
      "regularization/policy_ref_data_loss_gap": 1.9791446924209595,
      "regularization/reference_data_loss": 1.3300585746765137,
      "regularization/reverse_KL": 0.8867685198783875,
      "rewards/accuracies": 0.784375011920929,
      "rewards/chosen": -0.7822574377059937,
      "rewards/margins": 1.1313669681549072,
      "rewards/rejected": -1.9136245250701904,
      "step": 1460,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.76,
      "learning_rate": 8.320934597917285e-07,
      "logps/chosen": -350.86285400390625,
      "logps/rejected": -407.3778076171875,
      "loss": 0.4944,
      "mask/mask_ratio": 0.4523470997810364,
      "regularization/forward_KL": 2.093214750289917,
      "regularization/policy_data_loss": 3.9291749000549316,
      "regularization/policy_ref_data_loss_gap": 2.560917377471924,
      "regularization/reference_data_loss": 1.368257761001587,
      "regularization/reverse_KL": 1.042608618736267,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": -0.9335796236991882,
      "rewards/margins": 0.9954120516777039,
      "rewards/rejected": -1.928991675376892,
      "step": 1470,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.76,
      "learning_rate": 7.987813985821882e-07,
      "logps/chosen": -375.4342956542969,
      "logps/rejected": -440.5558166503906,
      "loss": 0.395,
      "mask/mask_ratio": 0.4733562469482422,
      "regularization/forward_KL": 2.0493390560150146,
      "regularization/policy_data_loss": 3.954932451248169,
      "regularization/policy_ref_data_loss_gap": 2.587677240371704,
      "regularization/reference_data_loss": 1.367255449295044,
      "regularization/reverse_KL": 1.0175034999847412,
      "rewards/accuracies": 0.8187500238418579,
      "rewards/chosen": -0.8414871096611023,
      "rewards/margins": 1.2428219318389893,
      "rewards/rejected": -2.0843091011047363,
      "step": 1480,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.77,
      "learning_rate": 7.660226260667298e-07,
      "logps/chosen": -322.15179443359375,
      "logps/rejected": -399.0423889160156,
      "loss": 0.4887,
      "mask/mask_ratio": 0.44839420914649963,
      "regularization/forward_KL": 2.420806646347046,
      "regularization/policy_data_loss": 4.6336750984191895,
      "regularization/policy_ref_data_loss_gap": 3.219275712966919,
      "regularization/reference_data_loss": 1.4143998622894287,
      "regularization/reverse_KL": 1.1664546728134155,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.9486936330795288,
      "rewards/margins": 1.0292198657989502,
      "rewards/rejected": -1.9779132604599,
      "step": 1490,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.77,
      "learning_rate": 7.338277964072865e-07,
      "logps/chosen": -366.7203674316406,
      "logps/rejected": -418.8465881347656,
      "loss": 0.4511,
      "mask/mask_ratio": 0.4694312512874603,
      "regularization/forward_KL": 2.078327178955078,
      "regularization/policy_data_loss": 4.001922607421875,
      "regularization/policy_ref_data_loss_gap": 2.5914063453674316,
      "regularization/reference_data_loss": 1.4105170965194702,
      "regularization/reverse_KL": 1.0227290391921997,
      "rewards/accuracies": 0.793749988079071,
      "rewards/chosen": -0.8184921145439148,
      "rewards/margins": 1.0976107120513916,
      "rewards/rejected": -1.9161027669906616,
      "step": 1500,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.77,
      "eval_logps/chosen": -356.892822265625,
      "eval_logps/rejected": -421.3289489746094,
      "eval_loss": 0.45800793170928955,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 1.8762058019638062,
      "eval_regularization/policy_data_loss": 3.6340606212615967,
      "eval_regularization/policy_ref_data_loss_gap": 2.3003385066986084,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 0.9513127207756042,
      "eval_rewards/accuracies": 0.7764999866485596,
      "eval_rewards/chosen": -0.8174070715904236,
      "eval_rewards/margins": 1.0641406774520874,
      "eval_rewards/rejected": -1.8815475702285767,
      "eval_runtime": 678.392,
      "eval_samples_per_second": 2.948,
      "eval_steps_per_second": 1.474,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 1500
    },
    {
      "epoch": 0.78,
      "learning_rate": 7.022073803542037e-07,
      "logps/chosen": -349.23388671875,
      "logps/rejected": -437.04034423828125,
      "loss": 0.4841,
      "mask/mask_ratio": 0.4689386785030365,
      "regularization/forward_KL": 1.9875080585479736,
      "regularization/policy_data_loss": 3.878222942352295,
      "regularization/policy_ref_data_loss_gap": 2.5127129554748535,
      "regularization/reference_data_loss": 1.3655097484588623,
      "regularization/reverse_KL": 1.001012921333313,
      "rewards/accuracies": 0.765625,
      "rewards/chosen": -0.910653293132782,
      "rewards/margins": 1.0480396747589111,
      "rewards/rejected": -1.9586931467056274,
      "step": 1510,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.78,
      "learning_rate": 6.711716618408282e-07,
      "logps/chosen": -367.73931884765625,
      "logps/rejected": -444.33837890625,
      "loss": 0.4674,
      "mask/mask_ratio": 0.5053801536560059,
      "regularization/forward_KL": 1.599718451499939,
      "regularization/policy_data_loss": 3.270195484161377,
      "regularization/policy_ref_data_loss_gap": 1.9963241815567017,
      "regularization/reference_data_loss": 1.2738711833953857,
      "regularization/reverse_KL": 0.8473178744316101,
      "rewards/accuracies": 0.765625,
      "rewards/chosen": -0.8175662755966187,
      "rewards/margins": 1.0321732759475708,
      "rewards/rejected": -1.8497394323349,
      "step": 1520,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.79,
      "learning_rate": 6.407307346388536e-07,
      "logps/chosen": -356.6167297363281,
      "logps/rejected": -426.361572265625,
      "loss": 0.4202,
      "mask/mask_ratio": 0.4736739695072174,
      "regularization/forward_KL": 1.7607284784317017,
      "regularization/policy_data_loss": 3.6089367866516113,
      "regularization/policy_ref_data_loss_gap": 2.2685444355010986,
      "regularization/reference_data_loss": 1.3403924703598022,
      "regularization/reverse_KL": 0.9145559072494507,
      "rewards/accuracies": 0.8218749761581421,
      "rewards/chosen": -0.7623960375785828,
      "rewards/margins": 1.208343505859375,
      "rewards/rejected": -1.9707396030426025,
      "step": 1530,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.8,
      "learning_rate": 6.108944990755203e-07,
      "logps/chosen": -370.97442626953125,
      "logps/rejected": -432.7752990722656,
      "loss": 0.4849,
      "mask/mask_ratio": 0.4875825047492981,
      "regularization/forward_KL": 1.9565290212631226,
      "regularization/policy_data_loss": 3.773801326751709,
      "regularization/policy_ref_data_loss_gap": 2.4455032348632812,
      "regularization/reference_data_loss": 1.328297734260559,
      "regularization/reverse_KL": 0.9896653294563293,
      "rewards/accuracies": 0.746874988079071,
      "rewards/chosen": -0.8860132098197937,
      "rewards/margins": 1.052741289138794,
      "rewards/rejected": -1.9387544393539429,
      "step": 1540,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.8,
      "learning_rate": 5.816726588137181e-07,
      "logps/chosen": -388.2999572753906,
      "logps/rejected": -430.26708984375,
      "loss": 0.4768,
      "mask/mask_ratio": 0.5041773915290833,
      "regularization/forward_KL": 1.7538044452667236,
      "regularization/policy_data_loss": 3.444775342941284,
      "regularization/policy_ref_data_loss_gap": 2.175140619277954,
      "regularization/reference_data_loss": 1.2696352005004883,
      "regularization/reverse_KL": 0.8618221282958984,
      "rewards/accuracies": 0.7718750238418579,
      "rewards/chosen": -0.8551098108291626,
      "rewards/margins": 0.997165322303772,
      "rewards/rejected": -1.8522748947143555,
      "step": 1550,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.81,
      "learning_rate": 5.530747176960588e-07,
      "logps/chosen": -369.521484375,
      "logps/rejected": -427.4000549316406,
      "loss": 0.4667,
      "mask/mask_ratio": 0.4935552477836609,
      "regularization/forward_KL": 1.8392736911773682,
      "regularization/policy_data_loss": 3.6148452758789062,
      "regularization/policy_ref_data_loss_gap": 2.329756259918213,
      "regularization/reference_data_loss": 1.2850890159606934,
      "regularization/reverse_KL": 0.9541507959365845,
      "rewards/accuracies": 0.796875,
      "rewards/chosen": -0.8558877110481262,
      "rewards/margins": 1.0477402210235596,
      "rewards/rejected": -1.9036279916763306,
      "step": 1560,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.81,
      "learning_rate": 5.251099766539347e-07,
      "logps/chosen": -349.94451904296875,
      "logps/rejected": -413.60247802734375,
      "loss": 0.4111,
      "mask/mask_ratio": 0.4865049719810486,
      "regularization/forward_KL": 1.9012486934661865,
      "regularization/policy_data_loss": 3.7648303508758545,
      "regularization/policy_ref_data_loss_gap": 2.3947696685791016,
      "regularization/reference_data_loss": 1.3700605630874634,
      "regularization/reverse_KL": 0.9865825772285461,
      "rewards/accuracies": 0.8125,
      "rewards/chosen": -0.739967942237854,
      "rewards/margins": 1.2251875400543213,
      "rewards/rejected": -1.9651553630828857,
      "step": 1570,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.82,
      "learning_rate": 4.977875306825672e-07,
      "logps/chosen": -369.56231689453125,
      "logps/rejected": -440.555908203125,
      "loss": 0.4543,
      "mask/mask_ratio": 0.48630237579345703,
      "regularization/forward_KL": 1.8594086170196533,
      "regularization/policy_data_loss": 3.571901798248291,
      "regularization/policy_ref_data_loss_gap": 2.245060682296753,
      "regularization/reference_data_loss": 1.3268409967422485,
      "regularization/reverse_KL": 0.9287541508674622,
      "rewards/accuracies": 0.7718750238418579,
      "rewards/chosen": -0.8581746816635132,
      "rewards/margins": 1.183106780052185,
      "rewards/rejected": -2.0412814617156982,
      "step": 1580,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.82,
      "learning_rate": 4.7111626588303704e-07,
      "logps/chosen": -369.19677734375,
      "logps/rejected": -458.9898376464844,
      "loss": 0.4389,
      "mask/mask_ratio": 0.4868675768375397,
      "regularization/forward_KL": 1.816199541091919,
      "regularization/policy_data_loss": 3.333029270172119,
      "regularization/policy_ref_data_loss_gap": 2.0043067932128906,
      "regularization/reference_data_loss": 1.328722357749939,
      "regularization/reverse_KL": 0.9332167506217957,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": -0.78023362159729,
      "rewards/margins": 1.1920874118804932,
      "rewards/rejected": -1.9723209142684937,
      "step": 1590,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.83,
      "learning_rate": 4.4510485657224685e-07,
      "logps/chosen": -344.20062255859375,
      "logps/rejected": -413.0953674316406,
      "loss": 0.4724,
      "mask/mask_ratio": 0.47833195328712463,
      "regularization/forward_KL": 2.0214810371398926,
      "regularization/policy_data_loss": 3.8077430725097656,
      "regularization/policy_ref_data_loss_gap": 2.458684206008911,
      "regularization/reference_data_loss": 1.3490593433380127,
      "regularization/reverse_KL": 1.019734263420105,
      "rewards/accuracies": 0.765625,
      "rewards/chosen": -0.8405399322509766,
      "rewards/margins": 1.0315632820129395,
      "rewards/rejected": -1.8721030950546265,
      "step": 1600,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.83,
      "eval_logps/chosen": -363.0502624511719,
      "eval_logps/rejected": -432.6913146972656,
      "eval_loss": 0.4572524130344391,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 2.0059850215911865,
      "eval_regularization/policy_data_loss": 3.7649643421173096,
      "eval_regularization/policy_ref_data_loss_gap": 2.4312427043914795,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 1.0139437913894653,
      "eval_rewards/accuracies": 0.7735000252723694,
      "eval_rewards/chosen": -0.8789814710617065,
      "eval_rewards/margins": 1.1161901950836182,
      "eval_rewards/rejected": -1.9951715469360352,
      "eval_runtime": 678.6196,
      "eval_samples_per_second": 2.947,
      "eval_steps_per_second": 1.474,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 1600
    },
    {
      "epoch": 0.83,
      "learning_rate": 4.197617624617686e-07,
      "logps/chosen": -339.2171325683594,
      "logps/rejected": -429.61541748046875,
      "loss": 0.468,
      "mask/mask_ratio": 0.4872106611728668,
      "regularization/forward_KL": 1.9804086685180664,
      "regularization/policy_data_loss": 3.7532310485839844,
      "regularization/policy_ref_data_loss_gap": 2.460120677947998,
      "regularization/reference_data_loss": 1.2931101322174072,
      "regularization/reverse_KL": 0.9910341501235962,
      "rewards/accuracies": 0.765625,
      "rewards/chosen": -0.8703905940055847,
      "rewards/margins": 1.0899403095245361,
      "rewards/rejected": -1.9603309631347656,
      "step": 1610,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.84,
      "learning_rate": 3.9509522590648415e-07,
      "logps/chosen": -367.6731872558594,
      "logps/rejected": -451.4507751464844,
      "loss": 0.4392,
      "mask/mask_ratio": 0.4816487431526184,
      "regularization/forward_KL": 1.897477149963379,
      "regularization/policy_data_loss": 3.453547954559326,
      "regularization/policy_ref_data_loss_gap": 2.145007610321045,
      "regularization/reference_data_loss": 1.3085405826568604,
      "regularization/reverse_KL": 0.9480821490287781,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -0.8421090841293335,
      "rewards/margins": 1.237029790878296,
      "rewards/rejected": -2.079138994216919,
      "step": 1620,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.84,
      "learning_rate": 3.711132692239164e-07,
      "logps/chosen": -343.92138671875,
      "logps/rejected": -435.6566467285156,
      "loss": 0.4325,
      "mask/mask_ratio": 0.4852767586708069,
      "regularization/forward_KL": 2.1187987327575684,
      "regularization/policy_data_loss": 3.7561020851135254,
      "regularization/policy_ref_data_loss_gap": 2.4744529724121094,
      "regularization/reference_data_loss": 1.2816489934921265,
      "regularization/reverse_KL": 1.0702699422836304,
      "rewards/accuracies": 0.784375011920929,
      "rewards/chosen": -0.8561917543411255,
      "rewards/margins": 1.2314157485961914,
      "rewards/rejected": -2.0876076221466064,
      "step": 1630,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.85,
      "learning_rate": 3.478236920851283e-07,
      "logps/chosen": -377.66864013671875,
      "logps/rejected": -451.73284912109375,
      "loss": 0.455,
      "mask/mask_ratio": 0.4862033724784851,
      "regularization/forward_KL": 1.9223639965057373,
      "regularization/policy_data_loss": 3.7186903953552246,
      "regularization/policy_ref_data_loss_gap": 2.3926377296447754,
      "regularization/reference_data_loss": 1.3260525465011597,
      "regularization/reverse_KL": 1.004472017288208,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": -0.9120422601699829,
      "rewards/margins": 1.1772994995117188,
      "rewards/rejected": -2.089341640472412,
      "step": 1640,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.85,
      "learning_rate": 3.252340689780245e-07,
      "logps/chosen": -375.8544921875,
      "logps/rejected": -442.1109313964844,
      "loss": 0.4538,
      "mask/mask_ratio": 0.48917245864868164,
      "regularization/forward_KL": 1.932267189025879,
      "regularization/policy_data_loss": 3.713916301727295,
      "regularization/policy_ref_data_loss_gap": 2.411562919616699,
      "regularization/reference_data_loss": 1.3023537397384644,
      "regularization/reverse_KL": 0.9832700490951538,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": -0.9173140525817871,
      "rewards/margins": 1.174278736114502,
      "rewards/rejected": -2.091592788696289,
      "step": 1650,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.86,
      "learning_rate": 3.033517467438973e-07,
      "logps/chosen": -378.5549011230469,
      "logps/rejected": -478.15252685546875,
      "loss": 0.4531,
      "mask/mask_ratio": 0.4860343039035797,
      "regularization/forward_KL": 1.9892613887786865,
      "regularization/policy_data_loss": 3.7720725536346436,
      "regularization/policy_ref_data_loss_gap": 2.443636655807495,
      "regularization/reference_data_loss": 1.3284358978271484,
      "regularization/reverse_KL": 1.0049099922180176,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.9135451316833496,
      "rewards/margins": 1.2261439561843872,
      "rewards/rejected": -2.1396889686584473,
      "step": 1660,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.86,
      "learning_rate": 2.8218384218800824e-07,
      "logps/chosen": -357.7247619628906,
      "logps/rejected": -461.9395446777344,
      "loss": 0.4787,
      "mask/mask_ratio": 0.48194456100463867,
      "regularization/forward_KL": 2.1307780742645264,
      "regularization/policy_data_loss": 3.941373825073242,
      "regularization/policy_ref_data_loss_gap": 2.5980143547058105,
      "regularization/reference_data_loss": 1.3433597087860107,
      "regularization/reverse_KL": 1.0333209037780762,
      "rewards/accuracies": 0.734375,
      "rewards/chosen": -0.947010338306427,
      "rewards/margins": 1.190782904624939,
      "rewards/rejected": -2.1377933025360107,
      "step": 1670,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.87,
      "learning_rate": 2.6173723976498145e-07,
      "logps/chosen": -364.6068420410156,
      "logps/rejected": -441.4098205566406,
      "loss": 0.4894,
      "mask/mask_ratio": 0.48471444845199585,
      "regularization/forward_KL": 2.1376442909240723,
      "regularization/policy_data_loss": 3.8046813011169434,
      "regularization/policy_ref_data_loss_gap": 2.499478816986084,
      "regularization/reference_data_loss": 1.305202603340149,
      "regularization/reverse_KL": 1.075727105140686,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": -1.0040005445480347,
      "rewards/margins": 1.1085357666015625,
      "rewards/rejected": -2.1125364303588867,
      "step": 1680,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.87,
      "learning_rate": 2.420185893397684e-07,
      "logps/chosen": -363.29620361328125,
      "logps/rejected": -446.0946350097656,
      "loss": 0.4834,
      "mask/mask_ratio": 0.49263796210289,
      "regularization/forward_KL": 1.9655958414077759,
      "regularization/policy_data_loss": 3.5726218223571777,
      "regularization/policy_ref_data_loss_gap": 2.3186705112457275,
      "regularization/reference_data_loss": 1.2539513111114502,
      "regularization/reverse_KL": 1.0205626487731934,
      "rewards/accuracies": 0.784375011920929,
      "rewards/chosen": -0.952995777130127,
      "rewards/margins": 1.121544599533081,
      "rewards/rejected": -2.074540376663208,
      "step": 1690,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.88,
      "learning_rate": 2.2303430402490806e-07,
      "logps/chosen": -387.51727294921875,
      "logps/rejected": -438.2970275878906,
      "loss": 0.5045,
      "mask/mask_ratio": 0.4751783013343811,
      "regularization/forward_KL": 2.227466583251953,
      "regularization/policy_data_loss": 4.109745979309082,
      "regularization/policy_ref_data_loss_gap": 2.7323176860809326,
      "regularization/reference_data_loss": 1.3774282932281494,
      "regularization/reverse_KL": 1.093990683555603,
      "rewards/accuracies": 0.7437499761581421,
      "rewards/chosen": -0.9672390222549438,
      "rewards/margins": 1.0479736328125,
      "rewards/rejected": -2.0152125358581543,
      "step": 1700,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.88,
      "eval_logps/chosen": -364.1794128417969,
      "eval_logps/rejected": -434.57952880859375,
      "eval_loss": 0.45715686678886414,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 2.050187110900879,
      "eval_regularization/policy_data_loss": 3.812755823135376,
      "eval_regularization/policy_ref_data_loss_gap": 2.4790337085723877,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 1.0267150402069092,
      "eval_rewards/accuracies": 0.7724999785423279,
      "eval_rewards/chosen": -0.8902725577354431,
      "eval_rewards/margins": 1.1237813234329224,
      "eval_rewards/rejected": -2.0140540599823,
      "eval_runtime": 678.3714,
      "eval_samples_per_second": 2.948,
      "eval_steps_per_second": 1.474,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 1700
    },
    {
      "epoch": 0.88,
      "learning_rate": 2.047905580947829e-07,
      "logps/chosen": -374.80291748046875,
      "logps/rejected": -445.3995056152344,
      "loss": 0.4434,
      "mask/mask_ratio": 0.46489769220352173,
      "regularization/forward_KL": 2.028075695037842,
      "regularization/policy_data_loss": 3.8965961933135986,
      "regularization/policy_ref_data_loss_gap": 2.53831148147583,
      "regularization/reference_data_loss": 1.358284592628479,
      "regularization/reverse_KL": 1.0243546962738037,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -0.9177228808403015,
      "rewards/margins": 1.1783350706100464,
      "rewards/rejected": -2.0960581302642822,
      "step": 1710,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.89,
      "learning_rate": 1.8729328497755578e-07,
      "logps/chosen": -384.08612060546875,
      "logps/rejected": -460.98394775390625,
      "loss": 0.5158,
      "mask/mask_ratio": 0.5041962265968323,
      "regularization/forward_KL": 1.92548406124115,
      "regularization/policy_data_loss": 3.677440643310547,
      "regularization/policy_ref_data_loss_gap": 2.360313892364502,
      "regularization/reference_data_loss": 1.3171266317367554,
      "regularization/reverse_KL": 0.9546839594841003,
      "rewards/accuracies": 0.721875011920929,
      "rewards/chosen": -0.9573711156845093,
      "rewards/margins": 0.9981430768966675,
      "rewards/rejected": -1.9555143117904663,
      "step": 1720,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.89,
      "learning_rate": 1.7054817532543567e-07,
      "logps/chosen": -353.1907043457031,
      "logps/rejected": -425.187744140625,
      "loss": 0.4531,
      "mask/mask_ratio": 0.48868340253829956,
      "regularization/forward_KL": 2.0616469383239746,
      "regularization/policy_data_loss": 3.899951934814453,
      "regularization/policy_ref_data_loss_gap": 2.600090742111206,
      "regularization/reference_data_loss": 1.2998613119125366,
      "regularization/reverse_KL": 0.9884287118911743,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": -0.8464080691337585,
      "rewards/margins": 1.1486037969589233,
      "rewards/rejected": -1.9950119256973267,
      "step": 1730,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.9,
      "learning_rate": 1.5456067516390338e-07,
      "logps/chosen": -374.760986328125,
      "logps/rejected": -438.8285217285156,
      "loss": 0.4652,
      "mask/mask_ratio": 0.4806599020957947,
      "regularization/forward_KL": 2.097219228744507,
      "regularization/policy_data_loss": 3.9109809398651123,
      "regularization/policy_ref_data_loss_gap": 2.6026065349578857,
      "regularization/reference_data_loss": 1.3083747625350952,
      "regularization/reverse_KL": 1.0322043895721436,
      "rewards/accuracies": 0.7718750238418579,
      "rewards/chosen": -0.9326249957084656,
      "rewards/margins": 1.0735390186309814,
      "rewards/rejected": -2.006164073944092,
      "step": 1740,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.9,
      "learning_rate": 1.3933598412049636e-07,
      "logps/chosen": -376.4493103027344,
      "logps/rejected": -432.51123046875,
      "loss": 0.4494,
      "mask/mask_ratio": 0.48662304878234863,
      "regularization/forward_KL": 1.9492861032485962,
      "regularization/policy_data_loss": 3.6712958812713623,
      "regularization/policy_ref_data_loss_gap": 2.328972101211548,
      "regularization/reference_data_loss": 1.342323899269104,
      "regularization/reverse_KL": 0.9574125409126282,
      "rewards/accuracies": 0.784375011920929,
      "rewards/chosen": -0.8203954696655273,
      "rewards/margins": 1.1511338949203491,
      "rewards/rejected": -1.9715293645858765,
      "step": 1750,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.91,
      "learning_rate": 1.24879053733728e-07,
      "logps/chosen": -387.67376708984375,
      "logps/rejected": -436.94610595703125,
      "loss": 0.4699,
      "mask/mask_ratio": 0.49727267026901245,
      "regularization/forward_KL": 2.064551830291748,
      "regularization/policy_data_loss": 3.748624324798584,
      "regularization/policy_ref_data_loss_gap": 2.415174961090088,
      "regularization/reference_data_loss": 1.3334496021270752,
      "regularization/reverse_KL": 1.0388270616531372,
      "rewards/accuracies": 0.784375011920929,
      "rewards/chosen": -0.9154243469238281,
      "rewards/margins": 1.115791916847229,
      "rewards/rejected": -2.0312161445617676,
      "step": 1760,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.91,
      "learning_rate": 1.1119458584269605e-07,
      "logps/chosen": -347.9879455566406,
      "logps/rejected": -443.810302734375,
      "loss": 0.4357,
      "mask/mask_ratio": 0.48547396063804626,
      "regularization/forward_KL": 2.236154079437256,
      "regularization/policy_data_loss": 4.055412769317627,
      "regularization/policy_ref_data_loss_gap": 2.721430540084839,
      "regularization/reference_data_loss": 1.333982229232788,
      "regularization/reverse_KL": 1.0824604034423828,
      "rewards/accuracies": 0.793749988079071,
      "rewards/chosen": -0.8642705082893372,
      "rewards/margins": 1.2042903900146484,
      "rewards/rejected": -2.06856107711792,
      "step": 1770,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.92,
      "learning_rate": 9.828703105789983e-08,
      "logps/chosen": -370.05865478515625,
      "logps/rejected": -443.13800048828125,
      "loss": 0.4998,
      "mask/mask_ratio": 0.5071443319320679,
      "regularization/forward_KL": 2.0482289791107178,
      "regularization/policy_data_loss": 3.843085527420044,
      "regularization/policy_ref_data_loss_gap": 2.546790599822998,
      "regularization/reference_data_loss": 1.296295166015625,
      "regularization/reverse_KL": 1.019551157951355,
      "rewards/accuracies": 0.753125011920929,
      "rewards/chosen": -0.9522651433944702,
      "rewards/margins": 1.0291051864624023,
      "rewards/rejected": -1.9813705682754517,
      "step": 1780,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.92,
      "learning_rate": 8.616058731376304e-08,
      "logps/chosen": -363.78863525390625,
      "logps/rejected": -428.45916748046875,
      "loss": 0.4622,
      "mask/mask_ratio": 0.4830571711063385,
      "regularization/forward_KL": 2.173649787902832,
      "regularization/policy_data_loss": 3.9553539752960205,
      "regularization/policy_ref_data_loss_gap": 2.6670892238616943,
      "regularization/reference_data_loss": 1.2882641553878784,
      "regularization/reverse_KL": 1.0691239833831787,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -0.9263961911201477,
      "rewards/margins": 1.1645500659942627,
      "rewards/rejected": -2.0909461975097656,
      "step": 1790,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.93,
      "learning_rate": 7.481919850333946e-08,
      "logps/chosen": -373.10394287109375,
      "logps/rejected": -403.51995849609375,
      "loss": 0.5007,
      "mask/mask_ratio": 0.47535282373428345,
      "regularization/forward_KL": 2.117351531982422,
      "regularization/policy_data_loss": 4.007052421569824,
      "regularization/policy_ref_data_loss_gap": 2.659080743789673,
      "regularization/reference_data_loss": 1.3479714393615723,
      "regularization/reverse_KL": 1.024890661239624,
      "rewards/accuracies": 0.721875011920929,
      "rewards/chosen": -0.9412348866462708,
      "rewards/margins": 0.9556465148925781,
      "rewards/rejected": -1.896881341934204,
      "step": 1800,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.93,
      "eval_logps/chosen": -365.2349548339844,
      "eval_logps/rejected": -435.6480407714844,
      "eval_loss": 0.4577370584011078,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 2.0706703662872314,
      "eval_regularization/policy_data_loss": 3.870612382888794,
      "eval_regularization/policy_ref_data_loss_gap": 2.5368905067443848,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 1.030892252922058,
      "eval_rewards/accuracies": 0.7714999914169312,
      "eval_rewards/chosen": -0.90082848072052,
      "eval_rewards/margins": 1.1239104270935059,
      "eval_rewards/rejected": -2.0247387886047363,
      "eval_runtime": 681.9194,
      "eval_samples_per_second": 2.933,
      "eval_steps_per_second": 1.466,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 1800
    },
    {
      "epoch": 0.93,
      "learning_rate": 6.426655319563352e-08,
      "logps/chosen": -355.1349182128906,
      "logps/rejected": -422.04833984375,
      "loss": 0.4925,
      "mask/mask_ratio": 0.4789901673793793,
      "regularization/forward_KL": 2.0971944332122803,
      "regularization/policy_data_loss": 3.8321731090545654,
      "regularization/policy_ref_data_loss_gap": 2.505570888519287,
      "regularization/reference_data_loss": 1.326602816581726,
      "regularization/reverse_KL": 1.00054931640625,
      "rewards/accuracies": 0.7406250238418579,
      "rewards/chosen": -0.8894187211990356,
      "rewards/margins": 1.043867826461792,
      "rewards/rejected": -1.9332863092422485,
      "step": 1810,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.94,
      "learning_rate": 5.450608343596647e-08,
      "logps/chosen": -359.1612243652344,
      "logps/rejected": -415.51214599609375,
      "loss": 0.4709,
      "mask/mask_ratio": 0.48479223251342773,
      "regularization/forward_KL": 2.0099635124206543,
      "regularization/policy_data_loss": 3.7614102363586426,
      "regularization/policy_ref_data_loss_gap": 2.423832654953003,
      "regularization/reference_data_loss": 1.3375775814056396,
      "regularization/reverse_KL": 1.0070571899414062,
      "rewards/accuracies": 0.7593749761581421,
      "rewards/chosen": -0.9020800590515137,
      "rewards/margins": 1.0740158557891846,
      "rewards/rejected": -1.9760959148406982,
      "step": 1820,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.95,
      "learning_rate": 4.5540963629769065e-08,
      "logps/chosen": -366.47198486328125,
      "logps/rejected": -428.7411193847656,
      "loss": 0.4679,
      "mask/mask_ratio": 0.48164892196655273,
      "regularization/forward_KL": 2.0207347869873047,
      "regularization/policy_data_loss": 3.7404136657714844,
      "regularization/policy_ref_data_loss_gap": 2.427231550216675,
      "regularization/reference_data_loss": 1.3131824731826782,
      "regularization/reverse_KL": 1.0160267353057861,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": -0.8704536557197571,
      "rewards/margins": 1.116071343421936,
      "rewards/rejected": -1.9865249395370483,
      "step": 1830,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.95,
      "learning_rate": 3.7374109510166236e-08,
      "logps/chosen": -394.2747497558594,
      "logps/rejected": -465.5821838378906,
      "loss": 0.4833,
      "mask/mask_ratio": 0.49814572930336,
      "regularization/forward_KL": 1.867260217666626,
      "regularization/policy_data_loss": 3.569251298904419,
      "regularization/policy_ref_data_loss_gap": 2.240492105484009,
      "regularization/reference_data_loss": 1.3287591934204102,
      "regularization/reverse_KL": 0.9169300198554993,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": -0.9249808192253113,
      "rewards/margins": 1.0937082767486572,
      "rewards/rejected": -2.0186891555786133,
      "step": 1840,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.96,
      "learning_rate": 3.00081771896929e-08,
      "logps/chosen": -343.3634338378906,
      "logps/rejected": -446.20880126953125,
      "loss": 0.464,
      "mask/mask_ratio": 0.46867626905441284,
      "regularization/forward_KL": 2.245288372039795,
      "regularization/policy_data_loss": 4.00510311126709,
      "regularization/policy_ref_data_loss_gap": 2.6258792877197266,
      "regularization/reference_data_loss": 1.3792240619659424,
      "regularization/reverse_KL": 1.109381914138794,
      "rewards/accuracies": 0.765625,
      "rewards/chosen": -0.9527521133422852,
      "rewards/margins": 1.1519520282745361,
      "rewards/rejected": -2.1047041416168213,
      "step": 1850,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.96,
      "learning_rate": 2.3445562296442182e-08,
      "logps/chosen": -368.09619140625,
      "logps/rejected": -450.04510498046875,
      "loss": 0.4492,
      "mask/mask_ratio": 0.49768370389938354,
      "regularization/forward_KL": 2.0842366218566895,
      "regularization/policy_data_loss": 3.871641159057617,
      "regularization/policy_ref_data_loss_gap": 2.565141201019287,
      "regularization/reference_data_loss": 1.3065000772476196,
      "regularization/reverse_KL": 1.0469902753829956,
      "rewards/accuracies": 0.7718750238418579,
      "rewards/chosen": -0.930150032043457,
      "rewards/margins": 1.1586748361587524,
      "rewards/rejected": -2.08882474899292,
      "step": 1860,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.97,
      "learning_rate": 1.7688399194933927e-08,
      "logps/chosen": -381.60369873046875,
      "logps/rejected": -440.8155212402344,
      "loss": 0.4761,
      "mask/mask_ratio": 0.4661730229854584,
      "regularization/forward_KL": 2.3513996601104736,
      "regularization/policy_data_loss": 4.477316856384277,
      "regularization/policy_ref_data_loss_gap": 3.0447020530700684,
      "regularization/reference_data_loss": 1.4326140880584717,
      "regularization/reverse_KL": 1.1563185453414917,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": -0.9832466840744019,
      "rewards/margins": 1.0717341899871826,
      "rewards/rejected": -2.054980754852295,
      "step": 1870,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.97,
      "learning_rate": 1.2738560291954416e-08,
      "logps/chosen": -379.8844299316406,
      "logps/rejected": -459.4103088378906,
      "loss": 0.4724,
      "mask/mask_ratio": 0.4987557530403137,
      "regularization/forward_KL": 1.9890153408050537,
      "regularization/policy_data_loss": 3.803602933883667,
      "regularization/policy_ref_data_loss_gap": 2.495455265045166,
      "regularization/reference_data_loss": 1.30814790725708,
      "regularization/reverse_KL": 0.9823592901229858,
      "rewards/accuracies": 0.7718750238418579,
      "rewards/chosen": -0.9167889356613159,
      "rewards/margins": 1.1081266403198242,
      "rewards/rejected": -2.0249156951904297,
      "step": 1880,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.98,
      "learning_rate": 8.597655427591279e-09,
      "logps/chosen": -357.58648681640625,
      "logps/rejected": -415.89666748046875,
      "loss": 0.5027,
      "mask/mask_ratio": 0.4710273742675781,
      "regularization/forward_KL": 1.9612220525741577,
      "regularization/policy_data_loss": 3.634382963180542,
      "regularization/policy_ref_data_loss_gap": 2.2965846061706543,
      "regularization/reference_data_loss": 1.3377978801727295,
      "regularization/reverse_KL": 0.9871129989624023,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -0.9211236238479614,
      "rewards/margins": 0.9570505023002625,
      "rewards/rejected": -1.8781741857528687,
      "step": 1890,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.98,
      "learning_rate": 5.267031351664786e-09,
      "logps/chosen": -363.96624755859375,
      "logps/rejected": -428.5484313964844,
      "loss": 0.4747,
      "mask/mask_ratio": 0.4602099359035492,
      "regularization/forward_KL": 2.1384975910186768,
      "regularization/policy_data_loss": 3.882664203643799,
      "regularization/policy_ref_data_loss_gap": 2.5245165824890137,
      "regularization/reference_data_loss": 1.3581478595733643,
      "regularization/reverse_KL": 1.035504937171936,
      "rewards/accuracies": 0.7593749761581421,
      "rewards/chosen": -0.9259804487228394,
      "rewards/margins": 1.1145247220993042,
      "rewards/rejected": -2.0405049324035645,
      "step": 1900,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.98,
      "eval_logps/chosen": -364.442626953125,
      "eval_logps/rejected": -434.4668273925781,
      "eval_loss": 0.4575766324996948,
      "eval_mask/mask_ratio": 0.4808923900127411,
      "eval_regularization/forward_KL": 2.0555028915405273,
      "eval_regularization/policy_data_loss": 3.855212926864624,
      "eval_regularization/policy_ref_data_loss_gap": 2.521491289138794,
      "eval_regularization/reference_data_loss": 1.33372163772583,
      "eval_regularization/reverse_KL": 1.0247164964675903,
      "eval_rewards/accuracies": 0.7735000252723694,
      "eval_rewards/chosen": -0.892905056476593,
      "eval_rewards/margins": 1.1200217008590698,
      "eval_rewards/rejected": -2.0129265785217285,
      "eval_runtime": 678.2866,
      "eval_samples_per_second": 2.949,
      "eval_steps_per_second": 1.474,
      "eval_verify/bz": 1.0,
      "eval_verify/constant_1": 1.0,
      "eval_verify/constant_1len": 1000.0,
      "eval_verify/gather_bz": 2.0,
      "step": 1900
    },
    {
      "epoch": 0.99,
      "learning_rate": 2.7477712857215676e-09,
      "logps/chosen": -345.93890380859375,
      "logps/rejected": -441.82611083984375,
      "loss": 0.3935,
      "mask/mask_ratio": 0.4782675802707672,
      "regularization/forward_KL": 2.07490873336792,
      "regularization/policy_data_loss": 3.8277320861816406,
      "regularization/policy_ref_data_loss_gap": 2.526113271713257,
      "regularization/reference_data_loss": 1.3016183376312256,
      "regularization/reverse_KL": 1.0232642889022827,
      "rewards/accuracies": 0.815625011920929,
      "rewards/chosen": -0.8038057088851929,
      "rewards/margins": 1.3124468326568604,
      "rewards/rejected": -2.1162524223327637,
      "step": 1910,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 0.99,
      "learning_rate": 1.040694570739187e-09,
      "logps/chosen": -366.2021484375,
      "logps/rejected": -425.6817932128906,
      "loss": 0.462,
      "mask/mask_ratio": 0.49110132455825806,
      "regularization/forward_KL": 1.8436062335968018,
      "regularization/policy_data_loss": 3.4448657035827637,
      "regularization/policy_ref_data_loss_gap": 2.2011022567749023,
      "regularization/reference_data_loss": 1.2437633275985718,
      "regularization/reverse_KL": 0.9660031199455261,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": -0.866483211517334,
      "rewards/margins": 1.0819470882415771,
      "rewards/rejected": -1.9484302997589111,
      "step": 1920,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 1.0,
      "learning_rate": 1.4635640065069345e-10,
      "logps/chosen": -368.5906677246094,
      "logps/rejected": -412.890625,
      "loss": 0.43,
      "mask/mask_ratio": 0.4655781388282776,
      "regularization/forward_KL": 2.206003189086914,
      "regularization/policy_data_loss": 4.1975998878479,
      "regularization/policy_ref_data_loss_gap": 2.8468055725097656,
      "regularization/reference_data_loss": 1.3507938385009766,
      "regularization/reverse_KL": 1.0693973302841187,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -0.8612130284309387,
      "rewards/margins": 1.1357920169830322,
      "rewards/rejected": -1.9970051050186157,
      "step": 1930,
      "verify/bz": 1.0,
      "verify/constant_1": 1.0,
      "verify/constant_1len": 160.0,
      "verify/gather_bz": 2.0
    },
    {
      "epoch": 1.0,
      "step": 1936,
      "total_flos": 0.0,
      "train_loss": 0.5092858116119361,
      "train_runtime": 55568.4391,
      "train_samples_per_second": 1.115,
      "train_steps_per_second": 0.035
    }
  ],
  "logging_steps": 10,
  "max_steps": 1936,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 0.0,
  "trial_name": null,
  "trial_params": null
}
